/* Halide.h -- interface for the 'Halide' library.

   Copyright (c) 2012-2020 MIT CSAIL, Google, Facebook, Adobe, NVIDIA CORPORATION, and other contributors.
   
   Developed by:
   
     The Halide team
     http://halide-lang.org
   
   Permission is hereby granted, free of charge, to any person obtaining a copy of
   this software and associated documentation files (the "Software"), to deal in
   the Software without restriction, including without limitation the rights to
   use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
   of the Software, and to permit persons to whom the Software is furnished to do
   so, subject to the following conditions:
   
   The above copyright notice and this permission notice shall be included in all
   copies or substantial portions of the Software.
   
   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
   
   -----
   
   apps/bgu is Copyright 2016 Google Inc. and is Licensed under the Apache License,
   Version 2.0 (the "License"); you may not use this file except in compliance
   with the License.
   
   Apache License
   
   Version 2.0, January 2004
   
   http://www.apache.org/licenses/
   
   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
   
   1. Definitions.
   
   "License" shall mean the terms and conditions for use, reproduction, and
   distribution as defined by Sections 1 through 9 of this document.
   
   "Licensor" shall mean the copyright owner or entity authorized by the
   copyright owner that is granting the License.
   
   "Legal Entity" shall mean the union of the acting entity and all other
   entities that control, are controlled by, or are under common control with
   that entity. For the purposes of this definition, "control" means (i) the
   power, direct or indirect, to cause the direction or management of such
   entity, whether by contract or otherwise, or (ii) ownership of fifty percent
   (50%) or more of the outstanding shares, or (iii) beneficial ownership of such
   entity.
   
   "You" (or "Your") shall mean an individual or Legal Entity exercising
   permissions granted by this License.
   
   "Source" form shall mean the preferred form for making modifications,
   including but not limited to software source code, documentation source, and
   configuration files.
   
   "Object" form shall mean any form resulting from mechanical transformation or
   translation of a Source form, including but not limited to compiled object
   code, generated documentation, and conversions to other media types.
   
   "Work" shall mean the work of authorship, whether in Source or Object form,
   made available under the License, as indicated by a copyright notice that is
   included in or attached to the work (an example is provided in the Appendix
   below).
   
   "Derivative Works" shall mean any work, whether in Source or Object form, that
   is based on (or derived from) the Work and for which the editorial revisions,
   annotations, elaborations, or other modifications represent, as a whole, an
   original work of authorship. For the purposes of this License, Derivative
   Works shall not include works that remain separable from, or merely link (or
   bind by name) to the interfaces of, the Work and Derivative Works thereof.
   
   "Contribution" shall mean any work of authorship, including the original
   version of the Work and any modifications or additions to that Work or
   Derivative Works thereof, that is intentionally submitted to Licensor for
   inclusion in the Work by the copyright owner or by an individual or Legal
   Entity authorized to submit on behalf of the copyright owner. For the purposes
   of this definition, "submitted" means any form of electronic, verbal, or
   written communication sent to the Licensor or its representatives, including
   but not limited to communication on electronic mailing lists, source code
   control systems, and issue tracking systems that are managed by, or on behalf
   of, the Licensor for the purpose of discussing and improving the Work, but
   excluding communication that is conspicuously marked or otherwise designated
   in writing by the copyright owner as "Not a Contribution."
   
   "Contributor" shall mean Licensor and any individual or Legal Entity on behalf
   of whom a Contribution has been received by Licensor and subsequently
   incorporated within the Work.
   
   2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.
   
   3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.
   
   4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
   
   (a) You must give any other recipients of the Work or Derivative Works a copy
   of this License; and
   
   (b) You must cause any modified files to carry prominent notices stating that
   You changed the files; and
   
   (c) You must retain, in the Source form of any Derivative Works that You
   distribute, all copyright, patent, trademark, and attribution notices from the
   Source form of the Work, excluding those notices that do not pertain to any
   part of the Derivative Works; and
   
   (d) If the Work includes a "NOTICE" text file as part of its distribution,
   then any Derivative Works that You distribute must include a readable copy of
   the attribution notices contained within such NOTICE file, excluding those
   notices that do not pertain to any part of the Derivative Works, in at least
   one of the following places: within a NOTICE text file distributed as part of
   the Derivative Works; within the Source form or documentation, if provided
   along with the Derivative Works; or, within a display generated by the
   Derivative Works, if and wherever such third-party notices normally appear.
   The contents of the NOTICE file are for informational purposes only and do not
   modify the License. You may add Your own attribution notices within Derivative
   Works that You distribute, alongside or as an addendum to the NOTICE text from
   the Work, provided that such additional attribution notices cannot be
   construed as modifying the License.
   
   You may add Your own copyright statement to Your modifications and may provide
   additional or different license terms and conditions for use, reproduction, or
   distribution of Your modifications, or for any such Derivative Works as a
   whole, provided Your use, reproduction, and distribution of the Work otherwise
   complies with the conditions stated in this License.
   
   5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.
   
   6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.
   
   7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.
   
   8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
   
   9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
   
   END OF TERMS AND CONDITIONS
   
   -----
   
   apps/support/cmdline.h is Copyright (c) 2009, Hideyuki Tanaka and is licensed
   under the BSD 3-Clause license.
   
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
   * Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
   * Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.
   * Neither the name of the <organization> nor the
   names of its contributors may be used to endorse or promote products
   derived from this software without specific prior written permission.
   
   THIS SOFTWARE IS PROVIDED BY <copyright holder> ''AS IS'' AND ANY
   EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
   WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   DISCLAIMED. IN NO EVENT SHALL <copyright holder> BE LIABLE FOR ANY
   DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
   (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   
   ----
   
   dependencies/spirv is Copyright (c) 2014-2018 The Khronos Group Inc.
   
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and/or associated documentation files (the "Materials"),
   to deal in the Materials without restriction, including without limitation
   the rights to use, copy, modify, merge, publish, distribute, sublicense,
   and/or sell copies of the Materials, and to permit persons to whom the
   Materials are furnished to do so, subject to the following conditions:
   
   The above copyright notice and this permission notice shall be included in
   all copies or substantial portions of the Materials.
   
   MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS KHRONOS
   STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS SPECIFICATIONS AND
   HEADER INFORMATION ARE LOCATED AT https://www.khronos.org/registry/
   
   THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
   OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
   THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
   FROM,OUT OF OR IN CONNECTION WITH THE MATERIALS OR THE USE OR OTHER DEALINGS
   IN THE MATERIALS.
   
   
   ----
   
   dependencies/vulkan is Copyright (c) 2014-2017 The Khronos Group Inc.
   
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
   
      http://www.apache.org/licenses/LICENSE-2.0
   
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
   
   ----
   
   apps/linear_algebra/include/cblas.h is licensed under the BLAS license.
   
   The reference BLAS is a freely-available software package. It is available from
   netlib via anonymous ftp and the World Wide Web. Thus, it can be included in
   commercial software packages (and has been). We only ask that proper credit be
   given to the authors.
   
   Like all software, it is copyrighted. It is not trademarked, but we do ask the
   following:
   
   If you modify the source for these routines we ask that you change the name of
   the routine and comment the changes made to the original.
   
   We will gladly answer any questions regarding the software. If a modification is
   done, however, it is the responsibility of the person who modified the routine
   to provide support.
   

*/

#ifndef HALIDE_H
#define HALIDE_H

#ifndef HALIDE_ABSTRACT_GENERATOR_H_
#define HALIDE_ABSTRACT_GENERATOR_H_

#include <functional>
#include <map>
#include <string>
#include <vector>

#ifndef HALIDE_CALLABLE_H
#define HALIDE_CALLABLE_H

/** \file
 *
 * Defines the front-end class representing a jitted, callable Halide pipeline.
 */

#include <array>
#include <map>

#ifndef HALIDE_BUFFER_H
#define HALIDE_BUFFER_H

#ifndef HALIDE_DEVICE_INTERFACE_H
#define HALIDE_DEVICE_INTERFACE_H

/** \file
 * Methods for managing device allocations when jitting
 */

#ifndef HALIDE_EXPR_H
#define HALIDE_EXPR_H

/** \file
 * Base classes for Halide expressions (\ref Halide::Expr) and statements (\ref Halide::Internal::Stmt)
 */

#include <string>
#include <vector>

#ifndef HALIDE_INTRUSIVE_PTR_H
#define HALIDE_INTRUSIVE_PTR_H

/** \file
 *
 * Support classes for reference-counting via intrusive shared
 * pointers.
 */

#include <atomic>
#include <cstdlib>

#ifndef HALIDE_HALIDERUNTIME_H
#define HALIDE_HALIDERUNTIME_H

#ifndef COMPILING_HALIDE_RUNTIME
#ifdef __cplusplus
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <string_view>
#else
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#endif
#else
#error "COMPILING_HALIDE_RUNTIME should never be defined for Halide.h"
#endif

// Note that the canonical Halide version is considered to be defined here
// (rather than in the build system); we redundantly define the value in
// our CMake build, so that we ensure that the in-build metadata (eg soversion)
// matches, but keeping the canonical version here makes it easier to keep
// downstream build systems (eg Blaze/Bazel) properly in sync with the source.
#define HALIDE_VERSION_MAJOR 22
#define HALIDE_VERSION_MINOR 0
#define HALIDE_VERSION_PATCH 0

#ifdef __cplusplus
// Forward declare type to allow naming typed handles.
// See Type.h for documentation.
template<typename T>
struct halide_handle_traits;
#endif

#ifdef __cplusplus
extern "C" {
#endif

#ifdef _MSC_VER
// Note that (for MSVC) you should not use "inline" along with HALIDE_ALWAYS_INLINE;
// it is not necessary, and may produce warnings for some build configurations.
#define HALIDE_ALWAYS_INLINE __forceinline
#define HALIDE_NEVER_INLINE __declspec(noinline)
#else
// Note that (for Posixy compilers) you should always use "inline" along with HALIDE_ALWAYS_INLINE;
// otherwise some corner-case scenarios may erroneously report link errors.
#define HALIDE_ALWAYS_INLINE inline __attribute__((always_inline))
#define HALIDE_NEVER_INLINE __attribute__((noinline))
#endif

#ifndef HALIDE_MUST_USE_RESULT
#ifdef __has_attribute
#if __has_attribute(nodiscard)
// C++17 or later
#define HALIDE_MUST_USE_RESULT [[nodiscard]]
#elif __has_attribute(warn_unused_result)
// Clang/GCC
#define HALIDE_MUST_USE_RESULT __attribute__((warn_unused_result))
#else
#define HALIDE_MUST_USE_RESULT
#endif
#else
#define HALIDE_MUST_USE_RESULT
#endif
#endif

// Annotation for AOT and JIT calls -- if undefined, use no annotation.
// To ensure that all results are checked, do something like
//
//    -DHALIDE_FUNCTION_ATTRS=HALIDE_MUST_USE_RESULT
//
// in your C++ compiler options
#ifndef HALIDE_FUNCTION_ATTRS
#define HALIDE_FUNCTION_ATTRS
#endif

#ifndef HALIDE_EXPORT_SYMBOL
#ifdef _MSC_VER
#define HALIDE_EXPORT_SYMBOL __declspec(dllexport)
#else
#define HALIDE_EXPORT_SYMBOL __attribute__((visibility("default")))
#endif
#endif

#ifndef COMPILING_HALIDE_RUNTIME

// ASAN builds can cause linker errors for Float16, so sniff for that and
// don't enable it by default.
#if defined(__has_feature)
#if __has_feature(address_sanitizer)
#define HALIDE_RUNTIME_ASAN_DETECTED
#endif
#endif

#if defined(__SANITIZE_ADDRESS__) && !defined(HALIDE_RUNTIME_ASAN_DETECTED)
#define HALIDE_RUNTIME_ASAN_DETECTED
#endif

#if !defined(HALIDE_RUNTIME_ASAN_DETECTED)

// clang had _Float16 added as a reserved name in clang 8, but
// doesn't actually support it on most platforms until clang 15.
// Ideally there would be a better way to detect if the type
// is supported, even in a compiler independent fashion, but
// coming up with one has proven elusive.
#if defined(__clang__) && (__clang_major__ >= 15) && !defined(__EMSCRIPTEN__) && !defined(__i386__)
#if defined(__is_identifier)
#if !__is_identifier(_Float16)
#define HALIDE_CPP_COMPILER_HAS_FLOAT16
#endif
#endif
#endif

// Similarly, detecting _Float16 for gcc is problematic.
// For now, we say that if >= v12, and compiling on x86 or arm,
// we assume support. This may need revision.
#if defined(__GNUC__) && (__GNUC__ >= 12)
#if defined(__x86_64__) || (defined(__i386__) && (__GNUC__ >= 14) && defined(__SSE2__)) || ((defined(__arm__) || defined(__aarch64__)) && (__GNUC__ >= 13))
#define HALIDE_CPP_COMPILER_HAS_FLOAT16
#endif
#endif

#endif  // !HALIDE_RUNTIME_ASAN_DETECTED

#endif  // !COMPILING_HALIDE_RUNTIME

/** \file
 *
 * This file declares the routines used by Halide internally in its
 * runtime. On platforms that support weak linking, these can be
 * replaced with user-defined versions by defining an extern "C"
 * function with the same name and signature.
 *
 * When doing Just In Time (JIT) compilation members of
 * some_pipeline_or_func.jit_handlers() must be replaced instead. The
 * corresponding methods are documented below.
 *
 * All of these functions take a "void *user_context" parameter as their
 * first argument; if the Halide kernel that calls back to any of these
 * functions has been compiled with the UserContext feature set on its Target,
 * then the value of that pointer passed from the code that calls the
 * Halide kernel is piped through to the function.
 *
 * Some of these are also useful to call when using the default
 * implementation. E.g. halide_shutdown_thread_pool.
 *
 * Note that even on platforms with weak linking, some linker setups
 * may not respect the override you provide. E.g. if the override is
 * in a shared library and the halide object files are linked directly
 * into the output, the builtin versions of the runtime functions will
 * be called. See your linker documentation for more details. On
 * Linux, LD_DYNAMIC_WEAK=1 may help.
 *
 */

// Forward-declare to suppress warnings if compiling as C.
struct halide_buffer_t;

/** Print a message to stderr. Main use is to support tracing
 * functionality, print, and print_when calls. Also called by the default
 * halide_error.  This function can be replaced in JITed code by using
 * halide_custom_print and providing an implementation of halide_print
 * in AOT code. See Func::set_custom_print.
 */
// @{
extern void halide_print(void *user_context, const char *);
extern void halide_default_print(void *user_context, const char *);
typedef void (*halide_print_t)(void *, const char *);
extern halide_print_t halide_set_custom_print(halide_print_t print);
// @}

/** Halide calls this function on runtime errors (for example bounds
 * checking failures). This function can be replaced in JITed code by
 * using Func::set_error_handler, or in AOT code by calling
 * halide_set_error_handler. In AOT code on platforms that support
 * weak linking (i.e. not Windows), you can also override it by simply
 * defining your own halide_error.
 */
// @{
extern void halide_error(void *user_context, const char *);
extern void halide_default_error(void *user_context, const char *);
typedef void (*halide_error_handler_t)(void *, const char *);
extern halide_error_handler_t halide_set_error_handler(halide_error_handler_t handler);
// @}

/** Cross-platform mutex. Must be initialized with zero and implementation
 * must treat zero as an unlocked mutex with no waiters, etc.
 */
struct halide_mutex {
    uintptr_t _private[1];
};

/** Cross platform condition variable. Must be initialized to 0. */
struct halide_cond {
    uintptr_t _private[1];
};

/** A basic set of mutex and condition variable functions, which call
 * platform specific code for mutual exclusion. Equivalent to posix
 * calls. */
//@{
extern void halide_mutex_lock(struct halide_mutex *mutex);
extern void halide_mutex_unlock(struct halide_mutex *mutex);
extern void halide_cond_signal(struct halide_cond *cond);
extern void halide_cond_broadcast(struct halide_cond *cond);
extern void halide_cond_wait(struct halide_cond *cond, struct halide_mutex *mutex);
//@}

/** Functions for constructing/destroying/locking/unlocking arrays of mutexes. */
struct halide_mutex_array;
//@{
extern struct halide_mutex_array *halide_mutex_array_create(uint64_t sz);
extern void halide_mutex_array_destroy(void *user_context, void *array);
extern int halide_mutex_array_lock(struct halide_mutex_array *array, int entry);
extern int halide_mutex_array_unlock(struct halide_mutex_array *array, int entry);
//@}

/** Define halide_do_par_for to replace the default thread pool
 * implementation. halide_shutdown_thread_pool can also be called to
 * release resources used by the default thread pool on platforms
 * where it makes sense. See Func::set_custom_do_task and
 * Func::set_custom_do_par_for. Should return zero if all the jobs
 * return zero, or an arbitrarily chosen return value from one of the
 * jobs otherwise.
 */
//@{
typedef int (*halide_task_t)(void *user_context, int task_number, uint8_t *closure);
extern int halide_do_par_for(void *user_context,
                             halide_task_t task,
                             int min, int size, uint8_t *closure);
extern void halide_shutdown_thread_pool(void);
//@}

/** Set a custom method for performing a parallel for loop. Returns
 * the old do_par_for handler. */
typedef int (*halide_do_par_for_t)(void *, halide_task_t, int, int, uint8_t *);
extern halide_do_par_for_t halide_set_custom_do_par_for(halide_do_par_for_t do_par_for);

/** An opaque struct representing a semaphore. Used by the task system for async tasks. */
struct halide_semaphore_t {
    uint64_t _private[2];
};

/** A struct representing a semaphore and a number of items that must
 * be acquired from it. Used in halide_parallel_task_t below. */
struct halide_semaphore_acquire_t {
    struct halide_semaphore_t *semaphore;
    int count;
};
extern int halide_semaphore_init(struct halide_semaphore_t *, int n);
extern int halide_semaphore_release(struct halide_semaphore_t *, int n);
extern bool halide_semaphore_try_acquire(struct halide_semaphore_t *, int n);
typedef int (*halide_semaphore_init_t)(struct halide_semaphore_t *, int);
typedef int (*halide_semaphore_release_t)(struct halide_semaphore_t *, int);
typedef bool (*halide_semaphore_try_acquire_t)(struct halide_semaphore_t *, int);

/** A task representing a serial for loop evaluated over some range.
 * Note that task_parent is a pass through argument that should be
 * passed to any dependent taks that are invoked using halide_do_parallel_tasks
 * underneath this call. */
typedef int (*halide_loop_task_t)(void *user_context, int min, int extent,
                                  uint8_t *closure, void *task_parent);

/** A parallel task to be passed to halide_do_parallel_tasks. This
 * task may recursively call halide_do_parallel_tasks, and there may
 * be complex dependencies between seemingly unrelated tasks expressed
 * using semaphores. If you are using a custom task system, care must
 * be taken to avoid potential deadlock. This can be done by carefully
 * respecting the static metadata at the end of the task struct.*/
struct halide_parallel_task_t {
    // The function to call. It takes a user context, a min and
    // extent, a closure, and a task system pass through argument.
    halide_loop_task_t fn;

    // The closure to pass it
    uint8_t *closure;

    // The name of the function to be called. For debugging purposes only.
    const char *name;

    // An array of semaphores that must be acquired before the
    // function is called. Must be reacquired for every call made.
    struct halide_semaphore_acquire_t *semaphores;
    int num_semaphores;

    // The entire range the function should be called over. This range
    // may be sliced up and the function called multiple times.
    int min, extent;

    // A parallel task provides several pieces of metadata to prevent
    // unbounded resource usage or deadlock.

    // The first is the minimum number of execution contexts (call
    // stacks or threads) necessary for the function to run to
    // completion. This may be greater than one when there is nested
    // parallelism with internal producer-consumer relationships
    // (calling the function recursively spawns and blocks on parallel
    // sub-tasks that communicate with each other via semaphores). If
    // a parallel runtime calls the function when fewer than this many
    // threads are idle, it may need to create more threads to
    // complete the task, or else risk deadlock due to committing all
    // threads to tasks that cannot complete without more.
    //
    // FIXME: Note that extern stages are assumed to only require a
    // single thread to complete. If the extern stage is itself a
    // Halide pipeline, this may be an underestimate.
    int min_threads;

    // The calls to the function should be in serial order from min to min+extent-1, with only
    // one executing at a time. If false, any order is fine, and
    // concurrency is fine.
    bool serial;
};

/** Enqueue some number of the tasks described above and wait for them
 * to complete. While waiting, the calling threads assists with either
 * the tasks enqueued, or other non-blocking tasks in the task
 * system. Note that task_parent should be NULL for top-level calls
 * and the pass through argument if this call is being made from
 * another task. */
extern int halide_do_parallel_tasks(void *user_context, int num_tasks,
                                    struct halide_parallel_task_t *tasks,
                                    void *task_parent);

/** If you use the default do_par_for, you can still set a custom
 * handler to perform each individual task. Returns the old handler. */
//@{
typedef int (*halide_do_task_t)(void *, halide_task_t, int, uint8_t *);
extern halide_do_task_t halide_set_custom_do_task(halide_do_task_t do_task);
extern int halide_do_task(void *user_context, halide_task_t f, int idx,
                          uint8_t *closure);
//@}

/** The version of do_task called for loop tasks. By default calls the
 * loop task with the same arguments. */
// @{
typedef int (*halide_do_loop_task_t)(void *, halide_loop_task_t, int, int, uint8_t *, void *);
extern halide_do_loop_task_t halide_set_custom_do_loop_task(halide_do_loop_task_t do_task);
extern int halide_do_loop_task(void *user_context, halide_loop_task_t f, int min, int extent,
                               uint8_t *closure, void *task_parent);
//@}

/** Provide an entire custom tasking runtime via function
 * pointers. Note that do_task and semaphore_try_acquire are only ever
 * called by halide_default_do_par_for and
 * halide_default_do_parallel_tasks, so it's only necessary to provide
 * those if you are mixing in the default implementations of
 * do_par_for and do_parallel_tasks. */
// @{
typedef int (*halide_do_parallel_tasks_t)(void *, int, struct halide_parallel_task_t *,
                                          void *task_parent);
extern void halide_set_custom_parallel_runtime(
    halide_do_par_for_t,
    halide_do_task_t,
    halide_do_loop_task_t,
    halide_do_parallel_tasks_t,
    halide_semaphore_init_t,
    halide_semaphore_try_acquire_t,
    halide_semaphore_release_t);
// @}

/** The default versions of the parallel runtime functions. */
// @{
extern int halide_default_do_par_for(void *user_context,
                                     halide_task_t task,
                                     int min, int size, uint8_t *closure);
extern int halide_default_do_parallel_tasks(void *user_context,
                                            int num_tasks,
                                            struct halide_parallel_task_t *tasks,
                                            void *task_parent);
extern int halide_default_do_task(void *user_context, halide_task_t f, int idx,
                                  uint8_t *closure);
extern int halide_default_do_loop_task(void *user_context, halide_loop_task_t f,
                                       int min, int extent,
                                       uint8_t *closure, void *task_parent);
extern int halide_default_semaphore_init(struct halide_semaphore_t *, int n);
extern int halide_default_semaphore_release(struct halide_semaphore_t *, int n);
extern bool halide_default_semaphore_try_acquire(struct halide_semaphore_t *, int n);
// @}

struct halide_thread;

/** Spawn a thread. Returns a handle to the thread for the purposes of
 * joining it. The thread must be joined in order to clean up any
 * resources associated with it. */
extern struct halide_thread *halide_spawn_thread(void (*f)(void *), void *closure);

/** Join a thread. */
extern void halide_join_thread(struct halide_thread *);

/** Get or set the number of threads used by Halide's thread pool. Set returns
 * the old number.
 *
 * n < 0  : error condition
 * n == 0 : use a reasonable system default (typically, number of cpus online).
 * n == 1 : use exactly one thread; this will always enforce serial execution
 * n > 1  : use a pool of exactly n threads.
 *
 * (Note that this is only guaranteed when using the default implementations
 * of halide_do_par_for(); custom implementations may completely ignore values
 * passed to halide_set_num_threads().)
 */
// @{
extern int halide_get_num_threads();
extern int halide_set_num_threads(int n);
// @}

/** Halide calls these functions to allocate and free memory. To
 * replace in AOT code, use the halide_set_custom_malloc and
 * halide_set_custom_free, or (on platforms that support weak
 * linking), simply define these functions yourself. In JIT-compiled
 * code use Func::set_custom_allocator.
 *
 * If you override them, and find yourself wanting to call the default
 * implementation from within your override, use
 * halide_default_malloc/free.
 *
 * Note that halide_malloc must return a pointer aligned to the
 * maximum meaningful alignment for the platform for the purpose of
 * vector loads and stores, *and* with an allocated size that is (at least)
 * an integral multiple of that same alignment. The default implementation
 * uses 32-byte alignment on arm and 64-byte alignment on x86. Additionally,
 * it must be safe to read at least 8 bytes before the start and beyond the end.
 */
//@{
extern void *halide_malloc(void *user_context, size_t x);
extern void halide_free(void *user_context, void *ptr);
extern void *halide_default_malloc(void *user_context, size_t x);
extern void halide_default_free(void *user_context, void *ptr);
typedef void *(*halide_malloc_t)(void *, size_t);
typedef void (*halide_free_t)(void *, void *);
extern halide_malloc_t halide_set_custom_malloc(halide_malloc_t user_malloc);
extern halide_free_t halide_set_custom_free(halide_free_t user_free);
//@}

/** Halide calls these functions to interact with the underlying
 * system runtime functions. To replace in AOT code on platforms that
 * support weak linking, define these functions yourself, or use
 * the halide_set_custom_load_library() and halide_set_custom_get_library_symbol()
 * functions. In JIT-compiled code, use JITSharedRuntime::set_default_handlers().
 *
 * halide_load_library and halide_get_library_symbol are equivalent to
 * dlopen and dlsym. halide_get_symbol(sym) is equivalent to
 * dlsym(RTLD_DEFAULT, sym).
 */
//@{
extern void *halide_get_symbol(const char *name);
extern void *halide_load_library(const char *name);
extern void *halide_get_library_symbol(void *lib, const char *name);
extern void *halide_default_get_symbol(const char *name);
extern void *halide_default_load_library(const char *name);
extern void *halide_default_get_library_symbol(void *lib, const char *name);
typedef void *(*halide_get_symbol_t)(const char *name);
typedef void *(*halide_load_library_t)(const char *name);
typedef void *(*halide_get_library_symbol_t)(void *lib, const char *name);
extern halide_get_symbol_t halide_set_custom_get_symbol(halide_get_symbol_t user_get_symbol);
extern halide_load_library_t halide_set_custom_load_library(halide_load_library_t user_load_library);
extern halide_get_library_symbol_t halide_set_custom_get_library_symbol(halide_get_library_symbol_t user_get_library_symbol);
//@}

/** Called when debug_to_file is used inside %Halide code.  See
 * Func::debug_to_file for how this is called
 *
 * Cannot be replaced in JITted code at present.
 */
extern int32_t halide_debug_to_file(void *user_context, const char *filename,
                                    struct halide_buffer_t *buf);

/** Types in the halide type system. They can be ints, unsigned ints,
 * or floats (of various bit-widths), or a handle (which is always 64-bits).
 * Note that the int/uint/float values do not imply a specific bit width
 * (the bit width is expected to be encoded in a separate value).
 */
typedef enum halide_type_code_t
#if (__cplusplus >= 201103L || _MSVC_LANG >= 201103L)
    : uint8_t
#endif
{
    halide_type_int = 0,     ///< signed integers
    halide_type_uint = 1,    ///< unsigned integers
    halide_type_float = 2,   ///< IEEE floating point numbers
    halide_type_handle = 3,  ///< opaque pointer type (void *)
    halide_type_bfloat = 4,  ///< floating point numbers in the bfloat format
} halide_type_code_t;

// Note that while __attribute__ can go before or after the declaration,
// __declspec apparently is only allowed before.
#ifndef HALIDE_ATTRIBUTE_ALIGN
#ifdef _MSC_VER
#define HALIDE_ATTRIBUTE_ALIGN(x) __declspec(align(x))
#else
#define HALIDE_ATTRIBUTE_ALIGN(x) __attribute__((aligned(x)))
#endif
#endif

/** A runtime tag for a type in the halide type system. Can be ints,
 * unsigned ints, or floats of various bit-widths (the 'bits'
 * field). Can also be vectors of the same (by setting the 'lanes'
 * field to something larger than one). This struct should be
 * exactly 32-bits in size. */
struct halide_type_t {
    /** The basic type code: signed integer, unsigned integer, or floating point. */
#if (__cplusplus >= 201103L || _MSVC_LANG >= 201103L)
    HALIDE_ATTRIBUTE_ALIGN(1)
    halide_type_code_t code;  // halide_type_code_t
#else
    HALIDE_ATTRIBUTE_ALIGN(1)
    uint8_t code;  // halide_type_code_t
#endif

    /** The number of bits of precision of a single scalar value of this type. */
    HALIDE_ATTRIBUTE_ALIGN(1)
    uint8_t bits;

    /** How many elements in a vector. This is 1 for scalar types. */
    HALIDE_ATTRIBUTE_ALIGN(2)
    uint16_t lanes;

#if (__cplusplus >= 201103L || _MSVC_LANG >= 201103L)
    /** Construct a runtime representation of a Halide type from:
     * code: The fundamental type from an enum.
     * bits: The bit size of one element.
     * lanes: The number of vector elements in the type. */
    HALIDE_ALWAYS_INLINE constexpr halide_type_t(halide_type_code_t code, uint8_t bits, uint16_t lanes = 1)
        : code(code), bits(bits), lanes(lanes) {
    }

    /** Default constructor is required e.g. to declare halide_trace_event
     * instances. */
    HALIDE_ALWAYS_INLINE constexpr halide_type_t()
        : code((halide_type_code_t)0), bits(0), lanes(0) {
    }

    HALIDE_ALWAYS_INLINE constexpr halide_type_t with_lanes(uint16_t new_lanes) const {
        return halide_type_t((halide_type_code_t)code, bits, new_lanes);
    }

    HALIDE_ALWAYS_INLINE constexpr halide_type_t element_of() const {
        return with_lanes(1);
    }
    /** Compare two types for equality. */
    HALIDE_ALWAYS_INLINE constexpr bool operator==(const halide_type_t &other) const {
        return as_u32() == other.as_u32();
    }

    HALIDE_ALWAYS_INLINE constexpr bool operator!=(const halide_type_t &other) const {
        return !(*this == other);
    }

    HALIDE_ALWAYS_INLINE constexpr bool operator<(const halide_type_t &other) const {
        return as_u32() < other.as_u32();
    }

    /** Size in bytes for a single element, even if width is not 1, of this type. */
    HALIDE_ALWAYS_INLINE constexpr int bytes() const {
        return (bits + 7) / 8;
    }

    HALIDE_ALWAYS_INLINE constexpr uint32_t as_u32() const {
        // Note that this produces a result that is identical to memcpy'ing 'this'
        // into a u32 (on a little-endian machine, anyway), and at -O1 or greater
        // on Clang, the compiler knows this and optimizes this into a single 32-bit move.
        // (At -O0 it will look awful.)
        return static_cast<uint8_t>(code) |
               (static_cast<uint16_t>(bits) << 8) |
               (static_cast<uint32_t>(lanes) << 16);
    }
#endif
};

#if (__cplusplus >= 201103L || _MSVC_LANG >= 201103L)
static_assert(sizeof(halide_type_t) == sizeof(uint32_t), "size mismatch in halide_type_t");
#endif

enum halide_trace_event_code_t { halide_trace_load = 0,
                                 halide_trace_store = 1,
                                 halide_trace_begin_realization = 2,
                                 halide_trace_end_realization = 3,
                                 halide_trace_produce = 4,
                                 halide_trace_end_produce = 5,
                                 halide_trace_consume = 6,
                                 halide_trace_end_consume = 7,
                                 halide_trace_begin_pipeline = 8,
                                 halide_trace_end_pipeline = 9,
                                 halide_trace_tag = 10 };

struct halide_trace_event_t {
    /** The name of the Func or Pipeline that this event refers to */
    const char *func;

    /** If the event type is a load or a store, this points to the
     * value being loaded or stored. Use the type field to safely cast
     * this to a concrete pointer type and retrieve it. For other
     * events this is null. */
    void *value;

    /** For loads and stores, an array which contains the location
     * being accessed. For vector loads or stores it is an array of
     * vectors of coordinates (the vector dimension is innermost).
     *
     * For realization or production-related events, this will contain
     * the mins and extents of the region being accessed, in the order
     * min0, extent0, min1, extent1, ...
     *
     * For pipeline-related events, this will be null.
     */
    int32_t *coordinates;

    /** For halide_trace_tag, this points to a read-only null-terminated string
     * of arbitrary text. For all other events, this will be null.
     */
    const char *trace_tag;

    /** If the event type is a load or a store, this is the type of
     * the data. Otherwise, the value is meaningless. */
    struct halide_type_t type;

    /** The type of event */
    enum halide_trace_event_code_t event;

    /* The ID of the parent event (see below for an explanation of
     * event ancestry). */
    int32_t parent_id;

    /** If this was a load or store of a Tuple-valued Func, this is
     * which tuple element was accessed. */
    int32_t value_index;

    /** The length of the coordinates array */
    int32_t dimensions;
};

/** Called when Funcs are marked as trace_load, trace_store, or
 * trace_realization. See Func::set_custom_trace. The default
 * implementation either prints events via halide_print, or if
 * HL_TRACE_FILE is defined, dumps the trace to that file in a
 * sequence of trace packets. The header for a trace packet is defined
 * below. If the trace is going to be large, you may want to make the
 * file a named pipe, and then read from that pipe into gzip.
 *
 * halide_trace returns a unique ID which will be passed to future
 * events that "belong" to the earlier event as the parent id. The
 * ownership hierarchy looks like:
 *
 * begin_pipeline
 * +--trace_tag (if any)
 * +--trace_tag (if any)
 * ...
 * +--begin_realization
 * |  +--produce
 * |  |  +--load/store
 * |  |  +--end_produce
 * |  +--consume
 * |  |  +--load
 * |  |  +--end_consume
 * |  +--end_realization
 * +--end_pipeline
 *
 * Threading means that ownership cannot be inferred from the ordering
 * of events. There can be many active realizations of a given
 * function, or many active productions for a single
 * realization. Within a single production, the ordering of events is
 * meaningful.
 *
 * Note that all trace_tag events (if any) will occur just after the begin_pipeline
 * event, but before any begin_realization events. All trace_tags for a given Func
 * will be emitted in the order added.
 */
// @}
extern int32_t halide_trace(void *user_context, const struct halide_trace_event_t *event);
extern int32_t halide_default_trace(void *user_context, const struct halide_trace_event_t *event);
typedef int32_t (*halide_trace_t)(void *user_context, const struct halide_trace_event_t *);
extern halide_trace_t halide_set_custom_trace(halide_trace_t trace);
// @}

/** The header of a packet in a binary trace. All fields are 32-bit. */
struct halide_trace_packet_t {
    /** The total size of this packet in bytes. Always a multiple of
     * four. Equivalently, the number of bytes until the next
     * packet. */
    uint32_t size;

    /** The id of this packet (for the purpose of parent_id). */
    int32_t id;

    /** The remaining fields are equivalent to those in halide_trace_event_t */
    // @{
    struct halide_type_t type;
    enum halide_trace_event_code_t event;
    int32_t parent_id;
    int32_t value_index;
    int32_t dimensions;
    // @}

#if (__cplusplus >= 201103L || _MSVC_LANG >= 201103L)
    /** Get the coordinates array, assuming this packet is laid out in
     * memory as it was written. The coordinates array comes
     * immediately after the packet header. */
    HALIDE_ALWAYS_INLINE const int *coordinates() const {
        return (const int *)(this + 1);
    }

    HALIDE_ALWAYS_INLINE int *coordinates() {
        return (int *)(this + 1);
    }

    /** Get the value, assuming this packet is laid out in memory as
     * it was written. The packet comes immediately after the coordinates
     * array. */
    HALIDE_ALWAYS_INLINE const void *value() const {
        return (const void *)(coordinates() + dimensions);
    }

    HALIDE_ALWAYS_INLINE void *value() {
        return (void *)(coordinates() + dimensions);
    }

    /** Get the func name, assuming this packet is laid out in memory
     * as it was written. It comes after the value. */
    HALIDE_ALWAYS_INLINE const char *func() const {
        return (const char *)value() + type.lanes * type.bytes();
    }

    HALIDE_ALWAYS_INLINE char *func() {
        return (char *)value() + type.lanes * type.bytes();
    }

    /** Get the trace_tag (if any), assuming this packet is laid out in memory
     * as it was written. It comes after the func name. If there is no trace_tag,
     * this will return a pointer to an empty string. */
    HALIDE_ALWAYS_INLINE const char *trace_tag() const {
        const char *f = func();
        // strlen may not be available here
        while (*f++) {
            // nothing
        }
        return f;
    }

    HALIDE_ALWAYS_INLINE char *trace_tag() {
        char *f = func();
        // strlen may not be available here
        while (*f++) {
            // nothing
        }
        return f;
    }
#endif
};

/** Set the file descriptor that Halide should write binary trace
 * events to. If called with 0 as the argument, Halide outputs trace
 * information to stdout in a human-readable format. If never called,
 * Halide checks the for existence of an environment variable called
 * HL_TRACE_FILE and opens that file. If HL_TRACE_FILE is not defined,
 * it outputs trace information to stdout in a human-readable
 * format. */
extern void halide_set_trace_file(int fd);

/** Halide calls this to retrieve the file descriptor to write binary
 * trace events to. The default implementation returns the value set
 * by halide_set_trace_file. Implement it yourself if you wish to use
 * a custom file descriptor per user_context. Return zero from your
 * implementation to tell Halide to print human-readable trace
 * information to stdout. */
extern int halide_get_trace_file(void *user_context);

/** If tracing is writing to a file. This call closes that file
 * (flushing the trace). Returns zero on success. */
extern int halide_shutdown_trace(void);

/** All Halide GPU or device backend implementations provide an
 * interface to be used with halide_device_malloc, etc. This is
 * accessed via the functions below.
 */

/** An opaque struct containing per-GPU API implementations of the
 * device functions. */
struct halide_device_interface_impl_t;

/** Each GPU API provides a halide_device_interface_t struct pointing
 * to the code that manages device allocations. You can access these
 * functions directly from the struct member function pointers, or by
 * calling the functions declared below. Note that the global
 * functions are not available when using Halide as a JIT compiler.
 * If you are using raw halide_buffer_t in that context you must use
 * the function pointers in the device_interface struct.
 *
 * The function pointers below are currently the same for every GPU
 * API; only the impl field varies. These top-level functions do the
 * bookkeeping that is common across all GPU APIs, and then dispatch
 * to more API-specific functions via another set of function pointers
 * hidden inside the impl field.
 */
struct halide_device_interface_t {
    int (*device_malloc)(void *user_context, struct halide_buffer_t *buf,
                         const struct halide_device_interface_t *device_interface);
    int (*device_free)(void *user_context, struct halide_buffer_t *buf);
    int (*device_sync)(void *user_context, struct halide_buffer_t *buf);
    void (*device_release)(void *user_context,
                           const struct halide_device_interface_t *device_interface);
    int (*copy_to_host)(void *user_context, struct halide_buffer_t *buf);
    int (*copy_to_device)(void *user_context, struct halide_buffer_t *buf,
                          const struct halide_device_interface_t *device_interface);
    int (*device_and_host_malloc)(void *user_context, struct halide_buffer_t *buf,
                                  const struct halide_device_interface_t *device_interface);
    int (*device_and_host_free)(void *user_context, struct halide_buffer_t *buf);
    int (*buffer_copy)(void *user_context, struct halide_buffer_t *src,
                       const struct halide_device_interface_t *dst_device_interface, struct halide_buffer_t *dst);
    int (*device_crop)(void *user_context, const struct halide_buffer_t *src,
                       struct halide_buffer_t *dst);
    int (*device_slice)(void *user_context, const struct halide_buffer_t *src,
                        int slice_dim, int slice_pos, struct halide_buffer_t *dst);
    int (*device_release_crop)(void *user_context, struct halide_buffer_t *buf);
    int (*wrap_native)(void *user_context, struct halide_buffer_t *buf, uint64_t handle,
                       const struct halide_device_interface_t *device_interface);
    int (*detach_native)(void *user_context, struct halide_buffer_t *buf);
    int (*compute_capability)(void *user_context, int *major, int *minor);
    const struct halide_device_interface_impl_t *impl;
};

/** Release all data associated with the given device interface, in
 * particular all resources (memory, texture, context handles)
 * allocated by Halide. Must be called explicitly when using AOT
 * compilation. This is *not* thread-safe with respect to actively
 * running Halide code. Ensure all pipelines are finished before
 * calling this. */
extern void halide_device_release(void *user_context,
                                  const struct halide_device_interface_t *device_interface);

/** Copy image data from device memory to host memory. This must be called
 * explicitly to copy back the results of a GPU-based filter. */
extern int halide_copy_to_host(void *user_context, struct halide_buffer_t *buf);

/** Copy image data from host memory to device memory. This should not
 * be called directly; Halide handles copying to the device
 * automatically.  If interface is NULL and the buf has a non-zero dev
 * field, the device associated with the dev handle will be
 * used. Otherwise if the dev field is 0 and interface is NULL, an
 * error is returned. */
extern int halide_copy_to_device(void *user_context, struct halide_buffer_t *buf,
                                 const struct halide_device_interface_t *device_interface);

/** Copy data from one buffer to another. The buffers may have
 * different shapes and sizes, but the destination buffer's shape must
 * be contained within the source buffer's shape. That is, for each
 * dimension, the min on the destination buffer must be greater than
 * or equal to the min on the source buffer, and min+extent on the
 * destination buffer must be less that or equal to min+extent on the
 * source buffer. The source data is pulled from either device or
 * host memory on the source, depending on the dirty flags. host is
 * preferred if both are valid. The dst_device_interface parameter
 * controls the destination memory space. NULL means host memory. */
extern int halide_buffer_copy(void *user_context, struct halide_buffer_t *src,
                              const struct halide_device_interface_t *dst_device_interface,
                              struct halide_buffer_t *dst);

/** Give the destination buffer a device allocation which is an alias
 * for the same coordinate range in the source buffer. Modifies the
 * device, device_interface, and the device_dirty flag only. Only
 * supported by some device APIs (others will return
 * halide_error_code_device_crop_unsupported). Call
 * halide_device_release_crop instead of halide_device_free to clean
 * up resources associated with the cropped view. Do not free the
 * device allocation on the source buffer while the destination buffer
 * still lives. Note that the two buffers do not share dirty flags, so
 * care must be taken to update them together as needed. Note that src
 * and dst are required to have the same number of dimensions.
 *
 * Note also that (in theory) device interfaces which support cropping may
 * still not support cropping a crop (instead, create a new crop of the parent
 * buffer); in practice, no known implementation has this limitation, although
 * it is possible that some future implementations may require it. */
extern int halide_device_crop(void *user_context,
                              const struct halide_buffer_t *src,
                              struct halide_buffer_t *dst);

/** Give the destination buffer a device allocation which is an alias
 * for a similar coordinate range in the source buffer, but with one dimension
 * sliced away in the dst. Modifies the device, device_interface, and the
 * device_dirty flag only. Only supported by some device APIs (others will return
 * halide_error_code_device_crop_unsupported). Call
 * halide_device_release_crop instead of halide_device_free to clean
 * up resources associated with the sliced view. Do not free the
 * device allocation on the source buffer while the destination buffer
 * still lives. Note that the two buffers do not share dirty flags, so
 * care must be taken to update them together as needed. Note that the dst buffer
 * must have exactly one fewer dimension than the src buffer, and that slice_dim
 * and slice_pos must be valid within src. */
extern int halide_device_slice(void *user_context,
                               const struct halide_buffer_t *src,
                               int slice_dim, int slice_pos,
                               struct halide_buffer_t *dst);

/** Release any resources associated with a cropped/sliced view of another
 * buffer. */
extern int halide_device_release_crop(void *user_context,
                                      struct halide_buffer_t *buf);

/** Wait for current GPU operations to complete. Calling this explicitly
 * should rarely be necessary, except maybe for profiling. */
extern int halide_device_sync(void *user_context, struct halide_buffer_t *buf);

/**
 * Wait for current GPU operations to complete. Calling this explicitly
 * should rarely be necessary, except maybe for profiling.
 * This variation of the synchronizing is useful when a synchronization is desirable
 * without specifying any buffer to synchronize on.
 * Calling this with a null device_interface is always illegal.
 */
extern int halide_device_sync_global(void *user_context,
                                     const struct halide_device_interface_t *device_interface);

/** Allocate device memory to back a halide_buffer_t. */
extern int halide_device_malloc(void *user_context, struct halide_buffer_t *buf,
                                const struct halide_device_interface_t *device_interface);

/** Free device memory. */
extern int halide_device_free(void *user_context, struct halide_buffer_t *buf);

/** Wrap or detach a native device handle, setting the device field
 * and device_interface field as appropriate for the given GPU
 * API. The meaning of the opaque handle is specific to the device
 * interface, so if you know the device interface in use, call the
 * more specific functions in the runtime headers for your specific
 * device API instead (e.g. HalideRuntimeCuda.h). */
// @{
extern int halide_device_wrap_native(void *user_context,
                                     struct halide_buffer_t *buf,
                                     uint64_t handle,
                                     const struct halide_device_interface_t *device_interface);
extern int halide_device_detach_native(void *user_context, struct halide_buffer_t *buf);
// @}

/** Selects which gpu device to use. 0 is usually the display
 * device. If never called, Halide uses the environment variable
 * HL_GPU_DEVICE. If that variable is unset, Halide uses the last
 * device. Set this to -1 to use the last device. */
extern void halide_set_gpu_device(int n);

/** Halide calls this to get the desired halide gpu device
 * setting. Implement this yourself to use a different gpu device per
 * user_context. The default implementation returns the value set by
 * halide_set_gpu_device, or the environment variable
 * HL_GPU_DEVICE. */
extern int halide_get_gpu_device(void *user_context);

/** Set the soft maximum amount of memory, in bytes, that the LRU
 *  cache will use to memoize Func results.  This is not a strict
 *  maximum in that concurrency and simultaneous use of memoized
 *  reults larger than the cache size can both cause it to
 *  temporariliy be larger than the size specified here.
 */
extern void halide_memoization_cache_set_size(int64_t size);

/** Given a cache key for a memoized result, currently constructed
 *  from the Func name and top-level Func name plus the arguments of
 *  the computation, determine if the result is in the cache and
 *  return it if so. (The internals of the cache key should be
 *  considered opaque by this function.) If this routine returns true,
 *  it is a cache miss. Otherwise, it will return false and the
 *  buffers passed in will be filled, via copying, with memoized
 *  data. The last argument is a list if halide_buffer_t pointers which
 *  represents the outputs of the memoized Func. If the Func does not
 *  return a Tuple, there will only be one halide_buffer_t in the list. The
 *  tuple_count parameters determines the length of the list.
 *
 * The return values are:
 * -1: Signals an error.
 *  0: Success and cache hit.
 *  1: Success and cache miss.
 */
extern int halide_memoization_cache_lookup(void *user_context, const uint8_t *cache_key, int32_t size,
                                           struct halide_buffer_t *realized_bounds,
                                           int32_t tuple_count, struct halide_buffer_t **tuple_buffers);

/** Given a cache key for a memoized result, currently constructed
 *  from the Func name and top-level Func name plus the arguments of
 *  the computation, store the result in the cache for futre access by
 *  halide_memoization_cache_lookup. (The internals of the cache key
 *  should be considered opaque by this function.) Data is copied out
 *  from the inputs and inputs are unmodified. The last argument is a
 *  list if halide_buffer_t pointers which represents the outputs of the
 *  memoized Func. If the Func does not return a Tuple, there will
 *  only be one halide_buffer_t in the list. The tuple_count parameters
 *  determines the length of the list.
 *
 * If there is a memory allocation failure, the store does not store
 * the data into the cache.
 *
 * If has_eviction_key is true, the entry is marked with eviction_key to
 * allow removing the key with halide_memoization_cache_evict.
 */
extern int halide_memoization_cache_store(void *user_context, const uint8_t *cache_key, int32_t size,
                                          struct halide_buffer_t *realized_bounds,
                                          int32_t tuple_count,
                                          struct halide_buffer_t **tuple_buffers,
                                          bool has_eviction_key, uint64_t eviction_key);

/** Evict all cache entries that were tagged with the given
 *  eviction_key in the memoize scheduling directive.
 */
extern void halide_memoization_cache_evict(void *user_context, uint64_t eviction_key);

/** If halide_memoization_cache_lookup succeeds,
 * halide_memoization_cache_release must be called to signal the
 * storage is no longer being used by the caller. It will be passed
 * the host pointer of one the buffers returned by
 * halide_memoization_cache_lookup. That is
 * halide_memoization_cache_release will be called multiple times for
 * the case where halide_memoization_cache_lookup is handling multiple
 * buffers.  (This corresponds to memoizing a Tuple in Halide.) Note
 * that the host pointer must be sufficient to get to all information
 * the release operation needs. The default Halide cache impleemntation
 * accomplishes this by storing extra data before the start of the user
 * modifiable host storage.
 *
 * This call is like free and does not have a failure return.
 */
extern void halide_memoization_cache_release(void *user_context, void *host);

/** Free all memory and resources associated with the memoization cache.
 * Must be called at a time when no other threads are accessing the cache.
 */
extern void halide_memoization_cache_cleanup(void);

/** Verify that a given range of memory has been initialized; only used when Target::MSAN is enabled.
 *
 * The default implementation simply calls the LLVM-provided __msan_check_mem_is_initialized() function.
 *
 * The return value should always be zero.
 */
extern int halide_msan_check_memory_is_initialized(void *user_context, const void *ptr, uint64_t len, const char *name);

/** Verify that the data pointed to by the halide_buffer_t is initialized (but *not* the halide_buffer_t itself),
 * using halide_msan_check_memory_is_initialized() for checking.
 *
 * The default implementation takes pains to only check the active memory ranges
 * (skipping padding), and sorting into ranges to always check the smallest number of
 * ranges, in monotonically increasing memory order.
 *
 * Most client code should never need to replace the default implementation.
 *
 * The return value should always be zero.
 */
extern int halide_msan_check_buffer_is_initialized(void *user_context, struct halide_buffer_t *buffer, const char *buf_name);

/** Annotate that a given range of memory has been initialized;
 * only used when Target::MSAN is enabled.
 *
 * The default implementation simply calls the LLVM-provided __msan_unpoison() function.
 *
 * The return value should always be zero.
 */
extern int halide_msan_annotate_memory_is_initialized(void *user_context, const void *ptr, uint64_t len);

/** Mark the data pointed to by the halide_buffer_t as initialized (but *not* the halide_buffer_t itself),
 * using halide_msan_annotate_memory_is_initialized() for marking.
 *
 * The default implementation takes pains to only mark the active memory ranges
 * (skipping padding), and sorting into ranges to always mark the smallest number of
 * ranges, in monotonically increasing memory order.
 *
 * Most client code should never need to replace the default implementation.
 *
 * The return value should always be zero.
 */
extern int halide_msan_annotate_buffer_is_initialized(void *user_context, struct halide_buffer_t *buffer);
extern void halide_msan_annotate_buffer_is_initialized_as_destructor(void *user_context, void *buffer);

/** The error codes that may be returned by a Halide pipeline. */
enum halide_error_code_t {
    /** There was no error. This is the value returned by Halide on success. */
    halide_error_code_success = 0,

    /** An uncategorized error occurred. Refer to the string passed to halide_error. */
    halide_error_code_generic_error = -1,

    /** A Func was given an explicit bound via Func::bound, but this
     * was not large enough to encompass the region that is used of
     * the Func by the rest of the pipeline. */
    halide_error_code_explicit_bounds_too_small = -2,

    /** The elem_size field of a halide_buffer_t does not match the size in
     * bytes of the type of that ImageParam. Probable type mismatch. */
    halide_error_code_bad_type = -3,

    /** A pipeline would access memory outside of the halide_buffer_t passed
     * in. */
    halide_error_code_access_out_of_bounds = -4,

    /** A halide_buffer_t was given that spans more than 2GB of memory. */
    halide_error_code_buffer_allocation_too_large = -5,

    /** A halide_buffer_t was given with extents that multiply to a number
     * greater than 2^31-1 */
    halide_error_code_buffer_extents_too_large = -6,

    /** Applying explicit constraints on the size of an input or
     * output buffer shrank the size of that buffer below what will be
     * accessed by the pipeline. */
    halide_error_code_constraints_make_required_region_smaller = -7,

    /** A constraint on a size or stride of an input or output buffer
     * was not met by the halide_buffer_t passed in. */
    halide_error_code_constraint_violated = -8,

    /** A scalar parameter passed in was smaller than its minimum
     * declared value. */
    halide_error_code_param_too_small = -9,

    /** A scalar parameter passed in was greater than its minimum
     * declared value. */
    halide_error_code_param_too_large = -10,

    /** A call to halide_malloc returned NULL. */
    halide_error_code_out_of_memory = -11,

    /** A halide_buffer_t pointer passed in was NULL. */
    halide_error_code_buffer_argument_is_null = -12,

    /** debug_to_file failed to open or write to the specified
     * file. */
    halide_error_code_debug_to_file_failed = -13,

    /** The Halide runtime encountered an error while trying to copy
     * from device to host. Turn on -debug in your target string to
     * see more details. */
    halide_error_code_copy_to_host_failed = -14,

    /** The Halide runtime encountered an error while trying to copy
     * from host to device. Turn on -debug in your target string to
     * see more details. */
    halide_error_code_copy_to_device_failed = -15,

    /** The Halide runtime encountered an error while trying to
     * allocate memory on device. Turn on -debug in your target string
     * to see more details. */
    halide_error_code_device_malloc_failed = -16,

    /** The Halide runtime encountered an error while trying to
     * synchronize with a device. Turn on -debug in your target string
     * to see more details. */
    halide_error_code_device_sync_failed = -17,

    /** The Halide runtime encountered an error while trying to free a
     * device allocation. Turn on -debug in your target string to see
     * more details. */
    halide_error_code_device_free_failed = -18,

    /** Buffer has a non-zero device but no device interface, which
     * violates a Halide invariant. */
    halide_error_code_no_device_interface = -19,

    /** This part of the Halide runtime is unimplemented on this platform. */
    halide_error_code_unimplemented = -20,

    /** A runtime symbol could not be loaded. */
    halide_error_code_symbol_not_found = -21,

    /** There is a bug in the Halide compiler. */
    halide_error_code_internal_error = -22,

    /** The Halide runtime encountered an error while trying to launch
     * a GPU kernel. Turn on -debug in your target string to see more
     * details. */
    halide_error_code_device_run_failed = -23,

    /** The Halide runtime encountered a host pointer that violated
     * the alignment set for it by way of a call to
     * set_host_alignment */
    halide_error_code_unaligned_host_ptr = -24,

    /** A fold_storage directive was used on a dimension that is not
     * accessed in a monotonically increasing or decreasing fashion. */
    halide_error_code_bad_fold = -25,

    /** A fold_storage directive was used with a fold factor that was
     * too small to store all the values of a producer needed by the
     * consumer. */
    halide_error_code_fold_factor_too_small = -26,

    /** User-specified require() expression was not satisfied. */
    halide_error_code_requirement_failed = -27,

    /** At least one of the buffer's extents are negative. */
    halide_error_code_buffer_extents_negative = -28,

    /** Call(s) to a GPU backend API failed. */
    halide_error_code_gpu_device_error = -29,

    /** Failure recording trace packets for one of the halide_target_feature_trace features. */
    halide_error_code_trace_failed = -30,

    /** A specialize_fail() schedule branch was selected at runtime. */
    halide_error_code_specialize_fail = -31,

    /** The Halide runtime encountered an error while trying to wrap a
     * native device handle.  Turn on -debug in your target string to
     * see more details. */
    halide_error_code_device_wrap_native_failed = -32,

    /** The Halide runtime encountered an error while trying to detach
     * a native device handle.  Turn on -debug in your target string
     * to see more details. */
    halide_error_code_device_detach_native_failed = -33,

    /** The host field on an input or output was null, the device
     * field was not zero, and the pipeline tries to use the buffer on
     * the host. You may be passing a GPU-only buffer to a pipeline
     * which is scheduled to use it on the CPU. */
    halide_error_code_host_is_null = -34,

    /** A folded buffer was passed to an extern stage, but the region
     * touched wraps around the fold boundary. */
    halide_error_code_bad_extern_fold = -35,

    /** Buffer has a non-null device_interface but device is 0, which
     * violates a Halide invariant. */
    halide_error_code_device_interface_no_device = -36,

    /** Buffer has both host and device dirty bits set, which violates
     * a Halide invariant. */
    halide_error_code_host_and_device_dirty = -37,

    /** The halide_buffer_t * passed to a halide runtime routine is
     * nullptr and this is not allowed. */
    halide_error_code_buffer_is_null = -38,

    /** The Halide runtime encountered an error while trying to copy
     * from one buffer to another. Turn on -debug in your target
     * string to see more details. */
    halide_error_code_device_buffer_copy_failed = -39,

    /** Attempted to make cropped/sliced alias of a buffer with a device
     * field, but the device_interface does not support cropping. */
    halide_error_code_device_crop_unsupported = -40,

    /** Cropping/slicing a buffer failed for some other reason. Turn on -debug
     * in your target string. */
    halide_error_code_device_crop_failed = -41,

    /** An operation on a buffer required an allocation on a
     * particular device interface, but a device allocation already
     * existed on a different device interface. Free the old one
     * first. */
    halide_error_code_incompatible_device_interface = -42,

    /** The dimensions field of a halide_buffer_t does not match the dimensions of that ImageParam. */
    halide_error_code_bad_dimensions = -43,

    /** A buffer with the device_dirty flag set was passed to a
     * pipeline compiled with no device backends enabled, so it
     * doesn't know how to copy the data back from device memory to
     * host memory. Either call copy_to_host before calling the Halide
     * pipeline, or enable the appropriate device backend. */
    halide_error_code_device_dirty_with_no_device_support = -44,

    /** An explicit storage bound provided is too small to store
     * all the values produced by the function. */
    halide_error_code_storage_bound_too_small = -45,

    /** A factor used to split a loop was discovered to be zero or negative at
     * runtime. */
    halide_error_code_split_factor_not_positive = -46,

    /** "vscale" value of Scalable Vector detected in runtime does not match
     * the vscale value used in compilation. */
    halide_error_code_vscale_invalid = -47,

    /** Profiling failed for a pipeline invocation. */
    halide_error_code_cannot_profile_pipeline = -48,
};

/** Halide calls the functions below on various error conditions. The
 * default implementations construct an error message, call
 * halide_error, then return the matching error code above. On
 * platforms that support weak linking, you can override these to
 * catch the errors individually. */

/** A call into an extern stage for the purposes of bounds inference
 * failed. Returns the error code given by the extern stage. */
extern int halide_error_bounds_inference_call_failed(void *user_context, const char *extern_stage_name, int result);

/** A call to an extern stage failed. Returned the error code given by
 * the extern stage. */
extern int halide_error_extern_stage_failed(void *user_context, const char *extern_stage_name, int result);

/** Various other error conditions. See the enum above for a
 * description of each. */
// @{
extern int halide_error_explicit_bounds_too_small(void *user_context, const char *func_name, const char *var_name,
                                                  int min_bound, int max_bound, int min_required, int max_required);
extern int halide_error_bad_type(void *user_context, const char *func_name,
                                 uint32_t type_given, uint32_t correct_type);  // N.B. The last two args are the bit representation of a halide_type_t
extern int halide_error_bad_dimensions(void *user_context, const char *func_name,
                                       int32_t dimensions_given, int32_t correct_dimensions);
extern int halide_error_access_out_of_bounds(void *user_context, const char *func_name,
                                             int dimension, int min_touched, int max_touched,
                                             int min_valid, int max_valid);
extern int halide_error_buffer_allocation_too_large(void *user_context, const char *buffer_name,
                                                    uint64_t allocation_size, uint64_t max_size);
extern int halide_error_buffer_extents_negative(void *user_context, const char *buffer_name, int dimension, int extent);
extern int halide_error_buffer_extents_too_large(void *user_context, const char *buffer_name,
                                                 int64_t actual_size, int64_t max_size);
extern int halide_error_constraints_make_required_region_smaller(void *user_context, const char *buffer_name,
                                                                 int dimension,
                                                                 int constrained_min, int constrained_extent,
                                                                 int required_min, int required_extent);
extern int halide_error_constraint_violated(void *user_context, const char *var, int val,
                                            const char *constrained_var, int constrained_val);
extern int halide_error_param_too_small_i64(void *user_context, const char *param_name,
                                            int64_t val, int64_t min_val);
extern int halide_error_param_too_small_u64(void *user_context, const char *param_name,
                                            uint64_t val, uint64_t min_val);
extern int halide_error_param_too_small_f64(void *user_context, const char *param_name,
                                            double val, double min_val);
extern int halide_error_param_too_large_i64(void *user_context, const char *param_name,
                                            int64_t val, int64_t max_val);
extern int halide_error_param_too_large_u64(void *user_context, const char *param_name,
                                            uint64_t val, uint64_t max_val);
extern int halide_error_param_too_large_f64(void *user_context, const char *param_name,
                                            double val, double max_val);
extern int halide_error_out_of_memory(void *user_context);
extern int halide_error_buffer_argument_is_null(void *user_context, const char *buffer_name);
extern int halide_error_debug_to_file_failed(void *user_context, const char *func,
                                             const char *filename, int error_code);
extern int halide_error_unaligned_host_ptr(void *user_context, const char *func_name, int alignment);
extern int halide_error_host_is_null(void *user_context, const char *func_name);
extern int halide_error_bad_fold(void *user_context, const char *func_name, const char *var_name,
                                 const char *loop_name);
extern int halide_error_bad_extern_fold(void *user_context, const char *func_name,
                                        int dim, int min, int extent, int valid_min, int fold_factor);

extern int halide_error_fold_factor_too_small(void *user_context, const char *func_name, const char *var_name,
                                              int fold_factor, const char *loop_name, int required_extent);
extern int halide_error_requirement_failed(void *user_context, const char *condition, const char *message);
extern int halide_error_specialize_fail(void *user_context, const char *message);
extern int halide_error_no_device_interface(void *user_context);
extern int halide_error_device_interface_no_device(void *user_context);
extern int halide_error_host_and_device_dirty(void *user_context);
extern int halide_error_buffer_is_null(void *user_context, const char *routine);
extern int halide_error_device_dirty_with_no_device_support(void *user_context, const char *buffer_name);
extern int halide_error_storage_bound_too_small(void *user_context, const char *func_name, const char *var_name,
                                                int provided_size, int required_size);
extern int halide_error_device_crop_failed(void *user_context);
extern int halide_error_split_factor_not_positive(void *user_context, const char *func_name, const char *orig, const char *outer, const char *inner, const char *factor_str, int factor);
extern int halide_error_vscale_invalid(void *user_context, const char *func_name, int runtime_vscale, int compiletime_vscale);
// @}

/** Optional features a compilation Target can have.
 *
 * Be sure to keep this in sync with:
 *  1. the Feature enum in Target.h,
 *  2. the implementation of get_runtime_compatible_target in Target.cpp,
 *  3. PyEnums.cpp,
 * if you add a new feature.
 */
typedef enum halide_target_feature_t {
    halide_target_feature_jit = 0,            ///< Generate code that will run immediately inside the calling process.
    halide_target_feature_debug,              ///< Turn on debug info and output for runtime code.
    halide_target_feature_enable_backtraces,  ///< Preserve frame pointers and include unwind tables to support accurate backtraces for debugging and profiling.
    halide_target_feature_no_asserts,         ///< Disable all runtime checks, for slightly tighter code.
    halide_target_feature_no_bounds_query,    ///< Disable the bounds querying functionality.

    halide_target_feature_sse41,    ///< Use SSE 4.1 and earlier instructions. Only relevant on x86.
    halide_target_feature_avx,      ///< Use AVX 1 instructions. Only relevant on x86.
    halide_target_feature_avx2,     ///< Use AVX 2 instructions. Only relevant on x86.
    halide_target_feature_avxvnni,  ///< Enable the AVX-VNNI features supported by AVX2 instructions. Supports 256-bit VNNI instructions without EVEX encoding.
    halide_target_feature_fma,      ///< Enable x86 FMA instruction
    halide_target_feature_fma4,     ///< Enable x86 (AMD) FMA4 instruction set
    halide_target_feature_f16c,     ///< Enable x86 16-bit float support

    halide_target_feature_armv7s,   ///< Generate code for ARMv7s. Only relevant for 32-bit ARM.
    halide_target_feature_no_neon,  ///< Avoid using NEON instructions. Only relevant for 32-bit ARM.

    halide_target_feature_vsx,              ///< Use VSX instructions. Only relevant on POWERPC.
    halide_target_feature_power_arch_2_07,  ///< Use POWER ISA 2.07 new instructions. Only relevant on POWERPC.

    halide_target_feature_cuda,               ///< Enable the CUDA runtime. Defaults to compute capability 2.0 (Fermi)
    halide_target_feature_cuda_capability30,  ///< Enable CUDA compute capability 3.0 (Kepler)
    halide_target_feature_cuda_capability32,  ///< Enable CUDA compute capability 3.2 (Tegra K1)
    halide_target_feature_cuda_capability35,  ///< Enable CUDA compute capability 3.5 (Kepler)
    halide_target_feature_cuda_capability50,  ///< Enable CUDA compute capability 5.0 (Maxwell)
    halide_target_feature_cuda_capability61,  ///< Enable CUDA compute capability 6.1 (Pascal)
    halide_target_feature_cuda_capability70,  ///< Enable CUDA compute capability 7.0 (Volta)
    halide_target_feature_cuda_capability75,  ///< Enable CUDA compute capability 7.5 (Turing)
    halide_target_feature_cuda_capability80,  ///< Enable CUDA compute capability 8.0 (Ampere)
    halide_target_feature_cuda_capability86,  ///< Enable CUDA compute capability 8.6 (Ampere)

    halide_target_feature_opencl,       ///< Enable the OpenCL runtime.
    halide_target_feature_cl_doubles,   ///< Enable double support on OpenCL targets
    halide_target_feature_cl_atomic64,  ///< Enable 64-bit atomics operations on OpenCL targets

    halide_target_feature_user_context,  ///< Generated code takes a user_context pointer as first argument

    halide_target_feature_profile,     ///< Launch a sampling profiler alongside the Halide pipeline that monitors and reports the runtime used by each Func
    halide_target_feature_no_runtime,  ///< Do not include a copy of the Halide runtime in any generated object file or assembly

    halide_target_feature_metal,  ///< Enable the (Apple) Metal runtime.

    halide_target_feature_c_plus_plus_mangling,  ///< Generate C++ mangled names for result function, et al

    halide_target_feature_large_buffers,  ///< Enable 64-bit buffer indexing to support buffers > 2GB. Ignored if bits != 64.

    halide_target_feature_hvx_128,                ///< Enable HVX 128 byte mode.
    halide_target_feature_hvx_v62,                ///< Enable Hexagon v62 architecture.
    halide_target_feature_fuzz_float_stores,      ///< On every floating point store, set the last bit of the mantissa to zero. Pipelines for which the output is very different with this feature enabled may also produce very different output on different processors.
    halide_target_feature_soft_float_abi,         ///< Enable soft float ABI. This only enables the soft float ABI calling convention, which does not necessarily use soft floats.
    halide_target_feature_msan,                   ///< Enable hooks for MSAN support.
    halide_target_feature_avx512,                 ///< Enable the base AVX512 subset supported by all AVX512 architectures. The specific feature sets are AVX-512F and AVX512-CD. See https://en.wikipedia.org/wiki/AVX-512 for a description of each AVX subset.
    halide_target_feature_avx512_knl,             ///< Enable the AVX512 features supported by Knight's Landing chips, such as the Xeon Phi x200. This includes the base AVX512 set, and also AVX512-CD and AVX512-ER.
    halide_target_feature_avx512_skylake,         ///< Enable the AVX512 features supported by Skylake Xeon server processors. This adds AVX512-VL, AVX512-BW, and AVX512-DQ to the base set. The main difference from the base AVX512 set is better support for small integer ops. Note that this does not include the Knight's Landing features. Note also that these features are not available on Skylake desktop and mobile processors.
    halide_target_feature_avx512_cannonlake,      ///< Enable the AVX512 features expected to be supported by future Cannonlake processors. This includes all of the Skylake features, plus AVX512-IFMA and AVX512-VBMI.
    halide_target_feature_avx512_zen4,            ///< Enable the AVX512 features supported by Zen4 processors. This include all of the Cannonlake features, plus AVX512-VNNI, AVX512-BF16, and more.
    halide_target_feature_avx512_zen5,            ///< Enable the AVX512 features supported by Zen5 processors. This include all of the Cannonlake features, plus AVX512-VNNI, AVX512-BF16, AVX-VNNI and more.
    halide_target_feature_avx512_sapphirerapids,  ///< Enable the AVX512 features supported by Sapphire Rapids processors. This include all of the Zen4 features, plus AVX-VNNI and AMX instructions.
    halide_target_feature_trace_loads,            ///< Trace all loads done by the pipeline. Equivalent to calling Func::trace_loads on every non-inlined Func.
    halide_target_feature_trace_stores,           ///< Trace all stores done by the pipeline. Equivalent to calling Func::trace_stores on every non-inlined Func.
    halide_target_feature_trace_realizations,     ///< Trace all realizations done by the pipeline. Equivalent to calling Func::trace_realizations on every non-inlined Func.
    halide_target_feature_trace_pipeline,         ///< Trace the pipeline.
    halide_target_feature_hvx_v65,                ///< Enable Hexagon v65 architecture.
    halide_target_feature_hvx_v66,                ///< Enable Hexagon v66 architecture.
    halide_target_feature_hvx_v68,                ///< Enable Hexagon v68 architecture.
    halide_target_feature_cl_half,                ///< Enable half support on OpenCL targets
    halide_target_feature_strict_float,           ///< Turn off all non-IEEE floating-point optimization. Currently applies only to LLVM targets.
    halide_target_feature_tsan,                   ///< Enable hooks for TSAN support.
    halide_target_feature_asan,                   ///< Enable hooks for ASAN support.
    halide_target_feature_d3d12compute,           ///< Enable Direct3D 12 Compute runtime.
    halide_target_feature_check_unsafe_promises,  ///< Insert assertions for promises.
    halide_target_feature_hexagon_dma,            ///< Enable Hexagon DMA buffers.
    halide_target_feature_embed_bitcode,          ///< Emulate clang -fembed-bitcode flag.
    halide_target_feature_enable_llvm_loop_opt,   ///< Enable loop vectorization + unrolling in LLVM. Overrides halide_target_feature_disable_llvm_loop_opt. (Ignored for non-LLVM targets.)
    halide_target_feature_wasm_mvponly,           ///< Disable all extensions to WebAssembly codegen (including +sign-ext and +nontrapping-fptoint, which are on by default).
    halide_target_feature_wasm_simd128,           ///< Enable +simd128 instructions for WebAssembly codegen.
    halide_target_feature_wasm_threads,           ///< Enable use of threads in WebAssembly codegen. Requires the use of a wasm runtime that provides pthread-compatible wrappers (typically, Emscripten with the -pthreads flag). Unsupported under WASI.
    halide_target_feature_wasm_bulk_memory,       ///< Enable +bulk-memory instructions for WebAssembly codegen.
    halide_target_feature_webgpu,                 ///< Enable the WebGPU runtime.
    halide_target_feature_sve,                    ///< Enable ARM Scalable Vector Extensions
    halide_target_feature_sve2,                   ///< Enable ARM Scalable Vector Extensions v2
    halide_target_feature_egl,                    ///< Force use of EGL support.
    halide_target_feature_arm_dot_prod,           ///< Enable ARMv8.2-a dotprod extension (i.e. udot and sdot instructions)
    halide_target_feature_arm_fp16,               ///< Enable ARMv8.2-a half-precision floating point data processing
    halide_llvm_large_code_model,                 ///< Use the LLVM large code model to compile
    halide_target_feature_rvv,                    ///< Enable RISCV "V" Vector Extension
    halide_target_feature_armv8a,                 ///< Enable ARMv8a instructions
    halide_target_feature_armv81a,                ///< Enable ARMv8.1a instructions
    halide_target_feature_armv82a,                ///< Enable ARMv8.2a instructions
    halide_target_feature_armv83a,                ///< Enable ARMv8.3a instructions
    halide_target_feature_armv84a,                ///< Enable ARMv8.4a instructions
    halide_target_feature_armv85a,                ///< Enable ARMv8.5a instructions
    halide_target_feature_armv86a,                ///< Enable ARMv8.6a instructions
    halide_target_feature_armv87a,                ///< Enable ARMv8.7a instructions
    halide_target_feature_armv88a,                ///< Enable ARMv8.8a instructions
    halide_target_feature_armv89a,                ///< Enable ARMv8.9a instructions
    halide_target_feature_sanitizer_coverage,     ///< Enable hooks for SanitizerCoverage support.
    halide_target_feature_profile_by_timer,       ///< Alternative to halide_target_feature_profile using timer interrupt for systems without threads or applicartions that need to avoid them.
    halide_target_feature_spirv,                  ///< Enable SPIR-V code generation support.
    halide_target_feature_vulkan,                 ///< Enable Vulkan runtime support.
    halide_target_feature_vulkan_int8,            ///< Enable Vulkan 8-bit integer support.
    halide_target_feature_vulkan_int16,           ///< Enable Vulkan 16-bit integer support.
    halide_target_feature_vulkan_int64,           ///< Enable Vulkan 64-bit integer support.
    halide_target_feature_vulkan_float16,         ///< Enable Vulkan 16-bit float support.
    halide_target_feature_vulkan_float64,         ///< Enable Vulkan 64-bit float support.
    halide_target_feature_vulkan_version10,       ///< Enable Vulkan v1.0 runtime target support.
    halide_target_feature_vulkan_version12,       ///< Enable Vulkan v1.2 runtime target support.
    halide_target_feature_vulkan_version13,       ///< Enable Vulkan v1.3 runtime target support.
    halide_target_feature_semihosting,            ///< Used together with Target::NoOS for the baremetal target built with semihosting library and run with semihosting mode where minimum I/O communication with a host PC is available.
    halide_target_feature_avx10_1,                ///< Intel AVX10 version 1 support. vector_bits is used to indicate width.
    halide_target_feature_x86_apx,                ///< Intel x86 APX support. Covers initial set of features released as APX: egpr,push2pop2,ppx,ndd .
    halide_target_feature_simulator,              ///< Target is for a simulator environment. Currently only applies to iOS.
    halide_target_feature_end                     ///< A sentinel. Every target is considered to have this feature, and setting this feature does nothing.
} halide_target_feature_t;

/** This function is called internally by Halide in some situations to determine
 * if the current execution environment can support the given set of
 * halide_target_feature_t flags. The implementation must do the following:
 *
 * -- If there are flags set in features that the function knows *cannot* be supported, return 0.
 * -- Otherwise, return 1.
 * -- Note that any flags set in features that the function doesn't know how to test should be ignored;
 * this implies that a return value of 1 means "not known to be bad" rather than "known to be good".
 *
 * In other words: a return value of 0 means "It is not safe to use code compiled with these features",
 * while a return value of 1 means "It is not obviously unsafe to use code compiled with these features".
 *
 * The default implementation simply calls halide_default_can_use_target_features.
 *
 * Note that `features` points to an array of `count` uint64_t; this array must contain enough
 * bits to represent all the currently known features. Any excess bits must be set to zero.
 */
// @{
extern int halide_can_use_target_features(int count, const uint64_t *features);
typedef int (*halide_can_use_target_features_t)(int count, const uint64_t *features);
extern halide_can_use_target_features_t halide_set_custom_can_use_target_features(halide_can_use_target_features_t);
// @}

/**
 * This is the default implementation of halide_can_use_target_features; it is provided
 * for convenience of user code that may wish to extend halide_can_use_target_features
 * but continue providing existing support, e.g.
 *
 *     int halide_can_use_target_features(int count, const uint64_t *features) {
 *          if (features[halide_target_somefeature >> 6] & (1LL << (halide_target_somefeature & 63))) {
 *              if (!can_use_somefeature()) {
 *                  return 0;
 *              }
 *          }
 *          return halide_default_can_use_target_features(count, features);
 *     }
 */
extern int halide_default_can_use_target_features(int count, const uint64_t *features);

typedef struct halide_dimension_t {
#if (__cplusplus >= 201103L || _MSVC_LANG >= 201103L)
    int32_t min = 0, extent = 0, stride = 0;

    // Per-dimension flags. None are defined yet (This is reserved for future use).
    uint32_t flags = 0;

    HALIDE_ALWAYS_INLINE halide_dimension_t() = default;
    HALIDE_ALWAYS_INLINE halide_dimension_t(int32_t m, int32_t e, int32_t s, uint32_t f = 0)
        : min(m), extent(e), stride(s), flags(f) {
    }

    HALIDE_ALWAYS_INLINE bool operator==(const halide_dimension_t &other) const {
        return (min == other.min) &&
               (extent == other.extent) &&
               (stride == other.stride) &&
               (flags == other.flags);
    }

    HALIDE_ALWAYS_INLINE bool operator!=(const halide_dimension_t &other) const {
        return !(*this == other);
    }
#else
    int32_t min, extent, stride;

    // Per-dimension flags. None are defined yet (This is reserved for future use).
    uint32_t flags;
#endif
} halide_dimension_t;

#ifdef __cplusplus
}  // extern "C"
#endif

#if __cplusplus > 201100L || _MSVC_LANG > 201100L || __STDC_VERSION__ > 202300L
// In C++, an underlying type is required to let the user define their own flag
// values, without those values being undefined behavior when passed around as
// this enum typedef.
#define BUFFER_FLAGS_UNDERLYING_TYPE : uint64_t
#else
#define BUFFER_FLAGS_UNDERLYING_TYPE
#endif
typedef enum BUFFER_FLAGS_UNDERLYING_TYPE {
    halide_buffer_flag_host_dirty = 1,
    halide_buffer_flag_device_dirty = 2
} halide_buffer_flags;
#undef BUFFER_FLAGS_UNDERLYING_TYPE

/**
 * The raw representation of an image passed around by generated
 * Halide code. It includes some stuff to track whether the image is
 * not actually in main memory, but instead on a device (like a
 * GPU). For a more convenient C++ wrapper, use Halide::Buffer<T>. */
typedef struct halide_buffer_t {
    /** A device-handle for e.g. GPU memory used to back this buffer. */
    uint64_t device;

    /** The interface used to interpret the above handle. */
    const struct halide_device_interface_t *device_interface;

    /** A pointer to the start of the data in main memory. In terms of
     * the Halide coordinate system, this is the address of the min
     * coordinates (defined below). */
    uint8_t *host;

    /** flags with various meanings. */
    uint64_t flags;

    /** The type of each buffer element. */
    struct halide_type_t type;

    /** The dimensionality of the buffer. */
    int32_t dimensions;

    /** The shape of the buffer. Halide does not own this array - you
     * must manage the memory for it yourself. */
    halide_dimension_t *dim;

    /** Pads the buffer up to a multiple of 8 bytes */
    void *padding;

#if (__cplusplus >= 201103L || _MSVC_LANG >= 201103L)
    /** Convenience methods for accessing the flags */
    // @{
    HALIDE_ALWAYS_INLINE bool get_flag(halide_buffer_flags flag) const {
        return (flags & flag) != 0;
    }

    HALIDE_ALWAYS_INLINE void set_flag(halide_buffer_flags flag, bool value) {
        if (value) {
            flags |= flag;
        } else {
            flags &= ~uint64_t(flag);
        }
    }

    HALIDE_MUST_USE_RESULT HALIDE_ALWAYS_INLINE bool host_dirty() const {
        return get_flag(halide_buffer_flag_host_dirty);
    }

    HALIDE_MUST_USE_RESULT HALIDE_ALWAYS_INLINE bool device_dirty() const {
        return get_flag(halide_buffer_flag_device_dirty);
    }

    HALIDE_ALWAYS_INLINE void set_host_dirty(bool v = true) {
        set_flag(halide_buffer_flag_host_dirty, v);
    }

    HALIDE_ALWAYS_INLINE void set_device_dirty(bool v = true) {
        set_flag(halide_buffer_flag_device_dirty, v);
    }
    // @}

    /** The total number of elements this buffer represents. Equal to
     * the product of the extents */
    HALIDE_ALWAYS_INLINE size_t number_of_elements() const {
        size_t s = 1;
        for (int i = 0; i < dimensions; i++) {
            s *= dim[i].extent;
        }
        return s;
    }

    /** Offset to the element with the lowest address.
     * If all strides are positive, equal to zero.
     * Offset is in elements, not bytes.
     * Unlike begin(), this is ok to call on an unallocated buffer. */
    HALIDE_ALWAYS_INLINE ptrdiff_t begin_offset() const {
        ptrdiff_t index = 0;
        for (int i = 0; i < dimensions; i++) {
            const int stride = dim[i].stride;
            if (stride < 0) {
                index += stride * (ptrdiff_t)(dim[i].extent - 1);
            }
        }
        return index;
    }

    /** An offset to one beyond the element with the highest address.
     * Offset is in elements, not bytes.
     * Unlike end(), this is ok to call on an unallocated buffer. */
    HALIDE_ALWAYS_INLINE ptrdiff_t end_offset() const {
        ptrdiff_t index = 0;
        for (int i = 0; i < dimensions; i++) {
            const int stride = dim[i].stride;
            if (stride > 0) {
                index += stride * (ptrdiff_t)(dim[i].extent - 1);
            }
        }
        index += 1;
        return index;
    }

    /** A pointer to the element with the lowest address.
     * If all strides are positive, equal to the host pointer.
     * Illegal to call on an unallocated buffer. */
    HALIDE_ALWAYS_INLINE uint8_t *begin() const {
        return host + begin_offset() * type.bytes();
    }

    /** A pointer to one beyond the element with the highest address.
     * Illegal to call on an unallocated buffer. */
    HALIDE_ALWAYS_INLINE uint8_t *end() const {
        return host + end_offset() * type.bytes();
    }

    /** The total number of bytes spanned by the data in memory. */
    HALIDE_ALWAYS_INLINE size_t size_in_bytes() const {
        return (size_t)(end_offset() - begin_offset()) * type.bytes();
    }

    /** A pointer to the element at the given location. */
    HALIDE_ALWAYS_INLINE uint8_t *address_of(const int *pos) const {
        ptrdiff_t index = 0;
        for (int i = 0; i < dimensions; i++) {
            index += (ptrdiff_t)dim[i].stride * (pos[i] - dim[i].min);
        }
        return host + index * type.bytes();
    }

    /** Attempt to call device_sync for the buffer. If the buffer
     * has no device_interface (or no device_sync), this is a quiet no-op.
     * Calling this explicitly should rarely be necessary, except for profiling. */
    HALIDE_ALWAYS_INLINE int device_sync(void *ctx = nullptr) {
        if (device_interface && device_interface->device_sync) {
            return device_interface->device_sync(ctx, this);
        }
        return 0;
    }

    /** Check if an input buffer passed extern stage is a querying
     * bounds. Compared to doing the host pointer check directly,
     * this both adds clarity to code and will facilitate moving to
     * another representation for bounds query arguments. */
    HALIDE_ALWAYS_INLINE bool is_bounds_query() const {
        return host == nullptr && device == 0;
    }

#endif
} halide_buffer_t;

#ifdef __cplusplus
extern "C" {
#endif

#ifndef HALIDE_ATTRIBUTE_DEPRECATED
#ifdef HALIDE_ALLOW_DEPRECATED
#define HALIDE_ATTRIBUTE_DEPRECATED(x)
#else
#ifdef _MSC_VER
#define HALIDE_ATTRIBUTE_DEPRECATED(x) __declspec(deprecated(x))
#else
#define HALIDE_ATTRIBUTE_DEPRECATED(x) __attribute__((deprecated(x)))
#endif
#endif
#endif

/** halide_scalar_value_t is a simple union able to represent all the well-known
 * scalar values in a filter argument. Note that it isn't tagged with a type;
 * you must ensure you know the proper type before accessing. Most user
 * code will never need to create instances of this struct; its primary use
 * is to hold def/min/max values in a halide_filter_argument_t. (Note that
 * this is conceptually just a union; it's wrapped in a struct to ensure
 * that it doesn't get anonymized by LLVM.)
 */
struct halide_scalar_value_t {
    union {
        bool b;
        int8_t i8;
        int16_t i16;
        int32_t i32;
        int64_t i64;
        uint8_t u8;
        uint16_t u16;
        uint32_t u32;
        uint64_t u64;
        float f32;
        double f64;
        void *handle;
    } u;
#ifdef __cplusplus
    HALIDE_ALWAYS_INLINE halide_scalar_value_t() {
        u.u64 = 0;
    }
#endif
};

enum halide_argument_kind_t {
    halide_argument_kind_input_scalar = 0,
    halide_argument_kind_input_buffer = 1,
    halide_argument_kind_output_buffer = 2
};

/*
    These structs must be robust across different compilers and settings; when
    modifying them, strive for the following rules:

    1) All fields are explicitly sized. I.e. must use int32_t and not "int"
    2) All fields must land on an alignment boundary that is the same as their size
    3) Explicit padding is added to make that so
    4) The sizeof the struct is padded out to a multiple of the largest natural size thing in the struct
    5) don't forget that 32 and 64 bit pointers are different sizes
*/

/**
 * Obsolete version of halide_filter_argument_t; only present in
 * code that wrote halide_filter_metadata_t version 0.
 */
struct halide_filter_argument_t_v0 {
    const char *name;
    int32_t kind;
    int32_t dimensions;
    struct halide_type_t type;
    const struct halide_scalar_value_t *def, *min, *max;
};

/**
 * halide_filter_argument_t is essentially a plain-C-struct equivalent to
 * Halide::Argument; most user code will never need to create one.
 */
struct halide_filter_argument_t {
    const char *name;    // name of the argument; will never be null or empty.
    int32_t kind;        // actually halide_argument_kind_t
    int32_t dimensions;  // always zero for scalar arguments
    struct halide_type_t type;
    // These pointers should always be null for buffer arguments,
    // and *may* be null for scalar arguments. (A null value means
    // there is no def/min/max/estimate specified for this argument.)
    const struct halide_scalar_value_t *scalar_def, *scalar_min, *scalar_max, *scalar_estimate;
    // This pointer should always be null for scalar arguments,
    // and *may* be null for buffer arguments. If not null, it should always
    // point to an array of dimensions*2 pointers, which will be the (min, extent)
    // estimates for each dimension of the buffer. (Note that any of the pointers
    // may be null as well.)
    int64_t const *const *buffer_estimates;
};

struct halide_filter_metadata_t {
#ifdef __cplusplus
    static const int32_t VERSION = 1;
#endif

    /** version of this metadata; currently always 1. */
    int32_t version;

    /** The number of entries in the arguments field. This is always >= 1. */
    int32_t num_arguments;

    /** An array of the filters input and output arguments; this will never be
     * null. The order of arguments is not guaranteed (input and output arguments
     * may come in any order); however, it is guaranteed that all arguments
     * will have a unique name within a given filter. */
    const struct halide_filter_argument_t *arguments;

    /** The Target for which the filter was compiled. This is always
     * a canonical Target string (ie a product of Target::to_string). */
    const char *target;

    /** The function name of the filter. */
    const char *name;
};

/** halide_register_argv_and_metadata() is a **user-defined** function that
 * must be provided in order to use the registration.cc files produced
 * by Generators when the 'registration' output is requested. Each registration.cc
 * file provides a static initializer that calls this function with the given
 * filter's argv-call variant, its metadata, and (optionally) and additional
 * textual data that the build system chooses to tack on for its own purposes.
 * Note that this will be called at static-initializer time (i.e., before
 * main() is called), and in an unpredictable order. Note that extra_key_value_pairs
 * may be nullptr; if it's not null, it's expected to be a null-terminated list
 * of strings, with an even number of entries. */
void halide_register_argv_and_metadata(
    int (*filter_argv_call)(void **),
    const struct halide_filter_metadata_t *filter_metadata,
    const char *const *extra_key_value_pairs);

/** The functions below here are relevant for pipelines compiled with
 * the -profile target flag, which runs a sampling profiler thread
 * alongside the pipeline. */

/** Per-Func state tracked by the sampling profiler. */
struct HALIDE_ATTRIBUTE_ALIGN(8) halide_profiler_func_stats {
    /** Total time taken evaluating this Func (in nanoseconds). */
    uint64_t time;

    /** The current memory allocation of this Func. */
    uint64_t memory_current;

    /** The peak memory allocation of this Func. */
    uint64_t memory_peak;

    /** The total memory allocation of this Func. */
    uint64_t memory_total;

    /** The peak stack allocation of this Func's threads. */
    uint64_t stack_peak;

    /** The average number of thread pool worker threads active while computing this Func. */
    uint64_t active_threads_numerator, active_threads_denominator;

    /** The name of this Func. A global constant string. */
    const char *name;

    /** The total number of memory allocation of this Func. */
    int num_allocs;
};

/** Per-pipeline state tracked by the sampling profiler. These exist
 * in a linked list. */
struct HALIDE_ATTRIBUTE_ALIGN(8) halide_profiler_pipeline_stats {
    /** Total time spent in this pipeline (in nanoseconds) */
    uint64_t time;

    /** The current memory allocation of funcs in this pipeline. */
    uint64_t memory_current;

    /** The peak memory allocation of funcs in this pipeline. */
    uint64_t memory_peak;

    /** The total memory allocation of funcs in this pipeline. */
    uint64_t memory_total;

    /** The average number of thread pool worker threads doing useful
     * work while computing this pipeline. */
    uint64_t active_threads_numerator, active_threads_denominator;

    /** The name of this pipeline. A global constant string. */
    const char *name;

    /** An array containing states for each Func in this pipeline. */
    struct halide_profiler_func_stats *funcs;

    /** The next pipeline_stats pointer. It's a void * because types
     * in the Halide runtime may not currently be recursive. */
    void *next;

    /** The number of funcs in this pipeline. */
    int num_funcs;

    /** The number of times this pipeline has been run. */
    int runs;

    /** The total number of samples taken inside of this pipeline. */
    int samples;

    /** The total number of memory allocation of funcs in this pipeline. */
    int num_allocs;
};

/** Per-invocation-of-a-pipeline state. Lives on the stack of the Halide
 * code. Exists in a doubly-linked list to that it can be cleanly
 * removed. */
struct HALIDE_ATTRIBUTE_ALIGN(8) halide_profiler_instance_state {
    /** Time billed to funcs in this instance by the sampling thread. */
    uint64_t billed_time;

    /** Wall clock time of the start of the instance. */
    uint64_t start_time;

    /** The current memory allocation of funcs in this instance. */
    uint64_t memory_current;

    /** The peak memory allocation of funcs in this instance. */
    uint64_t memory_peak;

    /** The total memory allocation of funcs in this instance. */
    uint64_t memory_total;

    /** The average number of thread pool worker threads doing useful
     * work while computing this instance. */
    uint64_t active_threads_numerator, active_threads_denominator;

    /** A pointer to the next running instance, so that the running instances
     * can exist in a linked list. */
    struct halide_profiler_instance_state *next;

    /** A pointer to the address of the next pointer of the previous instance,
     * so that this can be removed from the linked list when the instance
     * terminates. */
    struct halide_profiler_instance_state **prev_next;

    /** Information shared across all instances. The stats above are merged into
     * it when the instance is retired. */
    struct halide_profiler_pipeline_stats *pipeline_stats;

    /** An array containing states for each Func in this instance of this pipeline. */
    struct halide_profiler_func_stats *funcs;

    /** The id of the current running Func. Set by the pipeline, read
     * periodically by the profiler thread. */
    int current_func;

    /** The number of threads currently doing work on this pipeline instance. */
    int active_threads;

    /** The number of samples taken by this instance. */
    int samples;

    /** The total number of memory allocation of funcs in this instance. */
    int num_allocs;

    /** Whether or not this instance should count towards pipeline
     * statistics. */
    int should_collect_statistics;
};

/** The global state of the profiler. */
struct halide_profiler_state {
    /** Guards access to the fields below. If not locked, the sampling
     * profiler thread is free to modify things below (including
     * reordering the linked list of pipeline stats). */
    struct halide_mutex lock;

    /** A linked list of stats gathered for each pipeline. */
    struct halide_profiler_pipeline_stats *pipelines;

    /** Retrieve remote profiler state. Used so that the sampling
     * profiler can follow along with execution that occurs elsewhere,
     * e.g. on a DSP. If null, it reads from the int above instead. */

    /** Sampling thread reference to be joined at shutdown. */
    struct halide_thread *sampling_thread;

    /** The running instances of Halide pipelines. */
    struct halide_profiler_instance_state *instances;

    /** If this callback is defined, the profiler asserts that there is a single
     * live instance, and then uses it to get the current func and number of
     * active threads insted of reading the fields in the instance. This is used
     * so that the profiler can follow along with execution that occurs
     * elsewhere (e.g. on an accelerator). */
    void (*get_remote_profiler_state)(int *func, int *active_workers);

    /** The amount of time the profiler thread sleeps between samples in
     * microseconds. Defaults to 1000. To change it call
     * halide_profiler_get_state and mutate this field. */
    int sleep_time;

    /** Set to 1 when you want the profiler to wait for all running instances to
     * finish and then stop gracefully. */
    int shutdown;
};

/** Get a pointer to the global profiler state for programmatic
 * inspection. Lock it before using to pause the profiler. */
extern struct halide_profiler_state *halide_profiler_get_state(void);

/** Get a pointer to the pipeline state associated with pipeline_name.
 * This function grabs the global profiler state's lock on entry. */
extern struct halide_profiler_pipeline_stats *halide_profiler_get_pipeline_state(const char *pipeline_name);

/** Collects profiling information. Intended to be called from a timer
 * interrupt handler if timer based profiling is being used.
 *  State argument is acquired via halide_profiler_get_pipeline_state.
 * prev_t argument is the previous time and can be used to set a more
 * accurate time interval if desired. */
extern int halide_profiler_sample(struct halide_profiler_state *s, uint64_t *prev_t);

/** Reset profiler state cheaply. May leave threads running or some memory
 * allocated but all accumulated statistics are reset. Blocks until all running
 * profiled Halide pipelines exit. */
extern void halide_profiler_reset(void);

/** Reset all profiler state. Blocks until all running profiled Halide
 * pipelines exit. */
extern void halide_profiler_shutdown(void);

/** Print out timing statistics for everything run since the last
 * reset. Also happens at process exit. */
extern void halide_profiler_report(void *user_context);

/** These routines are called to temporarily disable and then reenable
 * the profiler. */
//@{
extern void halide_profiler_lock(struct halide_profiler_state *);
extern void halide_profiler_unlock(struct halide_profiler_state *);
//@}

/// \name "Float16" functions
/// These functions operate of bits (``uint16_t``) representing a half
/// precision floating point number (IEEE-754 2008 binary16).
//{@

/** Read bits representing a half precision floating point number and return
 *  the float that represents the same value */
extern float halide_float16_bits_to_float(uint16_t);

/** Read bits representing a half precision floating point number and return
 *  the double that represents the same value */
extern double halide_float16_bits_to_double(uint16_t);

// TODO: Conversion functions to half

//@}

// Allocating and freeing device memory is often very slow. The
// methods below give Halide's runtime permission to hold onto device
// memory to service future requests instead of returning it to the
// underlying device API. The API does not manage an allocation pool,
// all it does is provide access to a shared counter that acts as a
// limit on the unused memory not yet returned to the underlying
// device API. It makes callbacks to participants when memory needs to
// be released because the limit is about to be exceeded (either
// because the limit has been reduced, or because the memory owned by
// some participant becomes unused).

/** Tell Halide whether or not it is permitted to hold onto device
 * allocations to service future requests instead of returning them
 * eagerly to the underlying device API. Many device allocators are
 * quite slow, so it can be beneficial to set this to true. The
 * default value for now is false.
 *
 * Note that if enabled, the eviction policy is very simplistic. The
 * 32 most-recently used allocations are preserved, regardless of
 * their size. Additionally, if a call to cuMalloc results in an
 * out-of-memory error, the entire cache is flushed and the allocation
 * is retried. See https://github.com/halide/Halide/issues/4093
 *
 * If set to false, releases all unused device allocations back to the
 * underlying device APIs. For finer-grained control, see specific
 * methods in each device api runtime.
 *
 * Note that if the flag is set to true, this call *must* succeed and return
 * a value of halide_error_code_success (i.e., zero); if you replace
 * the implementation of this call in the runtime, you must honor this contract.
 * */
extern int halide_reuse_device_allocations(void *user_context, bool);

/** Determines whether on device_free the memory is returned
 * immediately to the device API, or placed on a free list for future
 * use. Override and switch based on the user_context for
 * finer-grained control. By default just returns the value most
 * recently set by the method above. */
extern bool halide_can_reuse_device_allocations(void *user_context);

struct halide_device_allocation_pool {
    int (*release_unused)(void *user_context);
    struct halide_device_allocation_pool *next;
};

/** Register a callback to be informed when
 * halide_reuse_device_allocations(false) is called, and all unused
 * device allocations must be released. The object passed should have
 * global lifetime, and its next field will be clobbered. */
extern void halide_register_device_allocation_pool(struct halide_device_allocation_pool *);

#ifdef __cplusplus
}  // End extern "C"
#endif

#if (__cplusplus >= 201103L || _MSVC_LANG >= 201103L)

namespace {

template<typename T>
struct check_is_pointer {
    static constexpr bool value = false;
};

template<typename T>
struct check_is_pointer<T *> {
    static constexpr bool value = true;
};

}  // namespace

/** Construct the halide equivalent of a C type */
template<typename T>
HALIDE_ALWAYS_INLINE constexpr halide_type_t halide_type_of() {
    // Create a compile-time error if T is not a pointer (without
    // using any includes - this code goes into the runtime).
    // (Note that we can't have uninitialized variables in constexpr functions,
    // even if those variables aren't used.)
    static_assert(check_is_pointer<T>::value, "Expected a pointer type here");
    return halide_type_t(halide_type_handle, 64);
}

#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16
template<>
HALIDE_ALWAYS_INLINE constexpr halide_type_t halide_type_of<_Float16>() {
    return halide_type_t(halide_type_float, 16);
}
#endif

template<>
HALIDE_ALWAYS_INLINE constexpr halide_type_t halide_type_of<float>() {
    return halide_type_t(halide_type_float, 32);
}

template<>
HALIDE_ALWAYS_INLINE constexpr halide_type_t halide_type_of<double>() {
    return halide_type_t(halide_type_float, 64);
}

template<>
HALIDE_ALWAYS_INLINE constexpr halide_type_t halide_type_of<bool>() {
    return halide_type_t(halide_type_uint, 1);
}

template<>
HALIDE_ALWAYS_INLINE constexpr halide_type_t halide_type_of<uint8_t>() {
    return halide_type_t(halide_type_uint, 8);
}

template<>
HALIDE_ALWAYS_INLINE constexpr halide_type_t halide_type_of<uint16_t>() {
    return halide_type_t(halide_type_uint, 16);
}

template<>
HALIDE_ALWAYS_INLINE constexpr halide_type_t halide_type_of<uint32_t>() {
    return halide_type_t(halide_type_uint, 32);
}

template<>
HALIDE_ALWAYS_INLINE constexpr halide_type_t halide_type_of<uint64_t>() {
    return halide_type_t(halide_type_uint, 64);
}

template<>
HALIDE_ALWAYS_INLINE constexpr halide_type_t halide_type_of<int8_t>() {
    return halide_type_t(halide_type_int, 8);
}

template<>
HALIDE_ALWAYS_INLINE constexpr halide_type_t halide_type_of<int16_t>() {
    return halide_type_t(halide_type_int, 16);
}

template<>
HALIDE_ALWAYS_INLINE constexpr halide_type_t halide_type_of<int32_t>() {
    return halide_type_t(halide_type_int, 32);
}

template<>
HALIDE_ALWAYS_INLINE constexpr halide_type_t halide_type_of<int64_t>() {
    return halide_type_t(halide_type_int, 64);
}

#ifndef COMPILING_HALIDE_RUNTIME

// These structures are used by `function_info_header` files
// (generated by passing `-e function_info_header` to a Generator).
// The generated files contain documentation on the proper usage.
namespace HalideFunctionInfo {

enum ArgumentKind { InputScalar = 0,
                    InputBuffer = 1,
                    OutputBuffer = 2 };

struct ArgumentInfo {
    std::string_view name;
    ArgumentKind kind;
    int32_t dimensions;  // always zero for scalar arguments
    halide_type_t type;
};

}  // namespace HalideFunctionInfo

#endif  // COMPILING_HALIDE_RUNTIME

#endif  // (__cplusplus >= 201103L || _MSVC_LANG >= 201103L)

#endif  // HALIDE_HALIDERUNTIME_H

namespace Halide {
namespace Internal {

/** A class representing a reference count to be used with IntrusivePtr */
class RefCount {
    std::atomic<int> count;

public:
    RefCount() noexcept
        : count(0) {
    }
    int increment() {
        return ++count;
    }  // Increment and return new value
    int decrement() {
        return --count;
    }  // Decrement and return new value
    bool is_const_zero() const {
        return count == 0;
    }
    int atomic_get() const {
        return count;
    }
};

/**
 * Because in this header we don't yet know how client classes store
 * their RefCount (and we don't want to depend on the declarations of
 * the client classes), any class that you want to hold onto via one
 * of these must provide implementations of ref_count and destroy,
 * which we forward-declare here.
 *
 * E.g. if you want to use IntrusivePtr<MyClass>, then you should
 * define something like this in MyClass.cpp (assuming MyClass has
 * a field: mutable RefCount ref_count):
 *
 * template<> RefCount &ref_count<MyClass>(const MyClass *c) noexcept {return c->ref_count;}
 * template<> void destroy<MyClass>(const MyClass *c) {delete c;}
 */
// @{
template<typename T>
RefCount &ref_count(const T *t) noexcept;
template<typename T>
void destroy(const T *t);
// @}

/** Intrusive shared pointers have a reference count (a
 * RefCount object) stored in the class itself. This is perhaps more
 * efficient than storing it externally, but more importantly, it
 * means it's possible to recover a reference-counted handle from the
 * raw pointer, and it's impossible to have two different reference
 * counts attached to the same raw object. Seeing as we pass around
 * raw pointers to concrete IRNodes and Expr's interchangeably, this
 * is a useful property.
 */
template<typename T>
struct IntrusivePtr {
private:
    void incref(T *p) {
        if (p) {
            ref_count(p).increment();
        }
    }

    void decref(T *p) {
        if (p) {
            // Note that if the refcount is already zero, then we're
            // in a recursive destructor due to a self-reference (a
            // cycle), where the ref_count has been adjusted to remove
            // the counts due to the cycle. The next line then makes
            // the ref_count negative, which prevents actually
            // entering the destructor recursively.
            if (ref_count(p).decrement() == 0) {
                destroy(p);
            }
        }
    }

protected:
    T *ptr = nullptr;

public:
    /** Access the raw pointer in a variety of ways.
     * Note that a "const IntrusivePtr<T>" is not the same thing as an
     * IntrusivePtr<const T>. So the methods that return the ptr are
     * const, despite not adding an extra const to T. */
    // @{
    T *get() const {
        return ptr;
    }

    T &operator*() const {
        return *ptr;
    }

    T *operator->() const {
        return ptr;
    }
    // @}

    ~IntrusivePtr() {
        decref(ptr);
    }

    HALIDE_ALWAYS_INLINE
    IntrusivePtr() = default;

    HALIDE_ALWAYS_INLINE
    IntrusivePtr(T *p)
        : ptr(p) {
        incref(ptr);
    }

    HALIDE_ALWAYS_INLINE
    IntrusivePtr(const IntrusivePtr<T> &other) noexcept
        : ptr(other.ptr) {
        incref(ptr);
    }

    HALIDE_ALWAYS_INLINE
    IntrusivePtr(IntrusivePtr<T> &&other) noexcept
        : ptr(other.ptr) {
        other.ptr = nullptr;
    }

    // NOLINTNEXTLINE(bugprone-unhandled-self-assignment)
    IntrusivePtr<T> &operator=(const IntrusivePtr<T> &other) {
        // Same-ptr but different-this happens frequently enough
        // to check for (see https://github.com/halide/Halide/pull/5412)
        if (other.ptr == ptr) {
            return *this;
        }
        // Other can be inside of something owned by this, so we
        // should be careful to incref other before we decref
        // ourselves.
        T *temp = other.ptr;
        incref(temp);
        decref(ptr);
        ptr = temp;
        return *this;
    }

    IntrusivePtr<T> &operator=(IntrusivePtr<T> &&other) noexcept {
        std::swap(ptr, other.ptr);
        return *this;
    }

    /* Handles can be null. This checks that. */
    HALIDE_ALWAYS_INLINE
    bool defined() const {
        return ptr != nullptr;
    }

    /* Check if two handles point to the same ptr. This is
     * equality of reference, not equality of value. */
    HALIDE_ALWAYS_INLINE
    bool same_as(const IntrusivePtr &other) const {
        return ptr == other.ptr;
    }

    HALIDE_ALWAYS_INLINE
    bool operator<(const IntrusivePtr<T> &other) const {
        return ptr < other.ptr;
    }

    HALIDE_ALWAYS_INLINE
    bool is_sole_reference() const {
        return ptr && ref_count(ptr).atomic_get() == 1;
    }
};

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_TYPE_H
#define HALIDE_TYPE_H

#ifndef HALIDE_ERROR_H
#define HALIDE_ERROR_H

#include <memory>
#include <sstream>
#include <stdexcept>

#ifndef HALIDE_DEBUG_H
#define HALIDE_DEBUG_H

/** \file
 * Defines functions for debug logging during code generation.
 */

#include <cstdlib>
#include <iostream>
#include <string>

namespace Halide {

struct Expr;
struct Type;
// Forward declare some things from IRPrinter, which we can't include yet.
std::ostream &operator<<(std::ostream &stream, const Expr &);
std::ostream &operator<<(std::ostream &stream, const Type &);

class Module;
std::ostream &operator<<(std::ostream &stream, const Module &);

struct Target;
/** Emit a halide Target in a human readable form */
std::ostream &operator<<(std::ostream &stream, const Target &);

namespace Internal {

struct Stmt;
std::ostream &operator<<(std::ostream &stream, const Stmt &);

struct LoweredFunc;
std::ostream &operator<<(std::ostream &, const LoweredFunc &);

bool debug_is_active_impl(int verbosity, const char *file, const char *function, int line);
#define debug_is_active(n) (::Halide::Internal::debug_is_active_impl((n), __FILE__, __FUNCTION__, __LINE__))

/** For optional debugging during codegen, use the debug macro as
 * follows:
 *
 * \code
 * debug(verbosity) << "The expression is " << expr << "\n";
 * \endcode
 *
 * verbosity of 0 always prints, 1 should print after every major
 * stage, 2 should be used for more detail, and 3 should be used for
 * tracing everything that occurs. The verbosity with which to print
 * is determined by the value of the environment variable
 * HL_DEBUG_CODEGEN
 */

#define debug(n)                                     \
    /* NOLINTNEXTLINE(bugprone-macro-parentheses) */ \
    if (debug_is_active((n))) std::cerr

/** Allow easily printing the contents of containers, or std::vector-like containers,
 *  in debug output. Used like so:
 *        std::vector<Type> arg_types;
 *        debug(4) << "arg_types: " << PrintSpan(arg_types) << "\n";
 * Which results in output like "arg_types: { uint8x8, uint8x8 }" on one line. */
template<typename T>
struct PrintSpan {
    const T &span;
    PrintSpan(const T &span)
        : span(span) {
    }
};
// Class template argument deduction (CTAD) guide to prevent warnings.
template<typename T>
PrintSpan(const T &) -> PrintSpan<T>;

template<typename StreamT, typename T>
inline StreamT &operator<<(StreamT &stream, const PrintSpan<T> &wrapper) {
    stream << "{ ";
    const char *sep = "";
    for (const auto &e : wrapper.span) {
        stream << sep << e;
        sep = ", ";
    }
    stream << " }";
    return stream;
}

/** Allow easily printing the contents of spans, or std::vector-like spans,
 *  in debug output. Used like so:
 *        std::vector<Type> arg_types;
 *        debug(4) << "arg_types: " << PrintSpan(arg_types) << "\n";
 * Which results in output like:
 *     arg_types:
 *     {
 *             uint8x8,
 *             uint8x8,
 *     }
 * Indentation uses a tab character. */
template<typename T>
struct PrintSpanLn {
    const T &span;
    PrintSpanLn(const T &span)
        : span(span) {
    }
};
// Class template argument deduction (CTAD) guide to prevent warnings.
template<typename T>
PrintSpanLn(const T &) -> PrintSpanLn<T>;

template<typename StreamT, typename T>
inline StreamT &operator<<(StreamT &stream, const PrintSpanLn<T> &wrapper) {
    stream << "\n{\n";
    for (const auto &e : wrapper.span) {
        stream << "\t" << e << ",\n";
    }
    stream << "}\n";
    return stream;
}

}  // namespace Internal
}  // namespace Halide

#endif

namespace Halide {

/** Query whether Halide was compiled with exceptions. */
bool exceptions_enabled();

/** A base class for Halide errors.
 *
 * Note that this deliberately does *not* descend from std::runtime_error, or
 * even std::exception; unfortunately, std::runtime_error is not marked as
 * DLLEXPORT on Windows, but Error needs to be marked as such, and mismatching
 * DLLEXPORT annotations in a class inheritance hierarchy in this way can lead
 * to ODR violations. Instead, we just attempt to replicate the API of
 * runtime_error here. */
struct HALIDE_EXPORT_SYMBOL Error {
    Error() = delete;

    // Give each class a non-inlined constructor so that the type
    // doesn't get separately instantiated in each compilation unit.
    explicit Error(const char *msg);
    explicit Error(const std::string &msg);

    Error(const Error &);
    Error &operator=(const Error &);
    Error(Error &&) noexcept;
    Error &operator=(Error &&) noexcept;

    virtual ~Error();

    virtual const char *what() const noexcept;

private:
    // Using a std::string here will cause MSVC to complain about the fact
    // that class std::string isn't declared DLLEXPORT, even though the
    // field is private; rather than suppress the warning, we'll just use
    // an old-fashioned new-and-delete to keep it nice and clean.
    char *what_;
};

/** An error that occurs while running a JIT-compiled Halide pipeline. */
struct HALIDE_EXPORT_SYMBOL RuntimeError final : Error {
    static constexpr auto verbose_name = "Runtime error";
    static constexpr int verbose_debug_level = 1;

    explicit RuntimeError(const char *msg);
    explicit RuntimeError(const std::string &msg);
};

/** An error that occurs while compiling a Halide pipeline that Halide
 * attributes to a user error. */
struct HALIDE_EXPORT_SYMBOL CompileError final : Error {
    static constexpr auto verbose_name = "User error";
    static constexpr int verbose_debug_level = 1;

    explicit CompileError(const char *msg);
    explicit CompileError(const std::string &msg);
};

/** An error that occurs while compiling a Halide pipeline that Halide
 * attributes to an internal compiler bug, or to an invalid use of
 * Halide's internals. */
struct HALIDE_EXPORT_SYMBOL InternalError final : Error {
    static constexpr auto verbose_name = "Internal error";
    static constexpr int verbose_debug_level = 0;  // Always print location/condition info

    explicit InternalError(const char *msg);
    explicit InternalError(const std::string &msg);
};

/** CompileTimeErrorReporter is used at compile time (*not* runtime) when
 * an error or warning is generated by Halide. Note that error() is called
 * a fatal error has occurred, and returning to Halide may cause a crash;
 * implementations of CompileTimeErrorReporter::error() should never return.
 * (Implementations of CompileTimeErrorReporter::warning() may return but
 * may also abort(), exit(), etc.)
 */
class CompileTimeErrorReporter {
public:
    virtual ~CompileTimeErrorReporter() = default;
    virtual void warning(const char *msg) = 0;
    [[noreturn]] virtual void error(const char *msg) = 0;
};

/** The default error reporter logs to stderr, then throws an exception
 * (if HALIDE_WITH_EXCEPTIONS) or calls abort (if not). This allows customization
 * of that behavior if a more gentle response to error reporting is desired.
 * Note that error_reporter is expected to remain valid across all Halide usage;
 * it is up to the caller to ensure that this is the case (and to do any
 * cleanup necessary).
 */
void set_custom_compile_time_error_reporter(CompileTimeErrorReporter *error_reporter);

namespace Internal {

/**
 * If a custom error reporter is configured, notifies the reporter by calling
 * its error() function with the value of \p e.what()
 *
 * Otherwise, if Halide was built with exceptions, throw \p e unless an
 * existing exception is in flight. On the other hand, if Halide was built
 * without exceptions, print the error message to stderr and abort().
 *
 * @param e The error to throw or report
 */
/// @{
[[noreturn]] void throw_error(const RuntimeError &e);
[[noreturn]] void throw_error(const CompileError &e);
[[noreturn]] void throw_error(const InternalError &e);
/// @}

/**
 * If a custom error reporter is configured, notifies the reporter by calling
 * its warning() function. Otherwise, prints the warning to stderr.
 *
 * @param warning The warning to issue
 */
void issue_warning(const char *warning);

template<typename T>
class ReportBase {
    struct Contents {
        std::ostringstream msg{};
        bool finalized{false};
    };
    std::unique_ptr<Contents> contents = std::make_unique<Contents>();

public:
    template<typename S>
    HALIDE_ALWAYS_INLINE T &operator<<(const S &x) {
        contents->msg << x;
        return *static_cast<T *>(this);
    }

    HALIDE_ALWAYS_INLINE operator bool() const {
        return !contents->finalized;
    }

protected:
    // This function is called as part of issue() below. We can't use a
    // virtual function because issue() needs to be marked [[noreturn]]
    // for errors and be left alone for warnings (i.e., they have
    // different signatures).
    std::string finalize_message() {
        if (!contents->msg.str().empty() && contents->msg.str().back() != '\n') {
            contents->msg << "\n";
        }
        contents->finalized = true;
        return contents->msg.str();
    }

    T &init(const char *file, const char *function, const int line, const char *condition_string) {
        if (debug_is_active_impl(T::verbose_debug_level, file, function, line)) {
            contents->msg << T::verbose_name << " at " << file << ":" << line << '\n';
            if (condition_string) {
                contents->msg << "Condition failed: " << condition_string << '\n';
            }
        }
        return *static_cast<T *>(this);
    }
};

template<typename Exception>
struct ErrorReport final : ReportBase<ErrorReport<Exception>> {
    static constexpr auto verbose_name = Exception::verbose_name;
    static constexpr int verbose_debug_level = Exception::verbose_debug_level;

    ErrorReport &init(const char *file, const char *function, const int line, const char *condition_string) {
        return ReportBase<ErrorReport>::init(file, function, line, condition_string) << "Error: ";
    }

    [[noreturn]] void issue() noexcept(false) {
        throw_error(Exception(this->finalize_message()));
    }
};

struct WarningReport final : ReportBase<WarningReport> {
    static constexpr auto verbose_name = "Warning";
    static constexpr int verbose_debug_level = 1;

    WarningReport &init(const char *file, const char *function, const int line, const char *condition_string) {
        return ReportBase::init(file, function, line, condition_string) << "Warning: ";
    }

    void issue() {
        issue_warning(this->finalize_message().c_str());
    }
};

/**
 * The following three diagnostic macros are implemented such that the
 * message is evaluated only if the assertion's value is false.
 *
 * This (regrettably) requires a macro to work, but has the highly desirable
 * effect that all assertion parameters are totally skipped (not ever evaluated)
 * when the assertion is true.
 *
 * The macros work by deferring the call to issue() until after the stream
 * has been evaluated. This previously used a trick where ErrorReport would
 * throw in the destructor, but throwing in a destructor is UB in a lot of
 * scenarios, and it was easy to break things by mistake.
 */
/// @{
#define _halide_error_impl(type)                                    \
    for (Halide::Internal::ErrorReport<type> _err; 1; _err.issue()) \
    /**/ _err.init(__FILE__, __FUNCTION__, __LINE__, nullptr)

#define _halide_assert_impl(condition, type)                            \
    if (!(condition))                                                   \
        for (Halide::Internal::ErrorReport<type> _err; 1; _err.issue()) \
    /*****/ _err.init(__FILE__, __FUNCTION__, __LINE__, #condition)

#define _halide_user_warning                                       \
    for (Halide::Internal::WarningReport _err; _err; _err.issue()) \
    /**/ _err.init(__FILE__, __FUNCTION__, __LINE__, nullptr)
/// @}

#define user_warning _halide_user_warning

#define user_error _halide_error_impl(Halide::CompileError)
#define internal_error _halide_error_impl(Halide::InternalError)
#define halide_runtime_error _halide_error_impl(Halide::RuntimeError)

#define internal_assert(c) _halide_assert_impl(c, Halide::InternalError)
#define user_assert(c) _halide_assert_impl(c, Halide::CompileError)

// The nicely named versions get cleaned up at the end of Halide.h,
// but user code might want to do halide-style user_asserts (e.g. the
// Extern macros introduce calls to user_assert), so for that purpose
// we define an equivalent macro that can be used outside of Halide.h
#define _halide_user_error _halide_error_impl(Halide::CompileError)
#define _halide_internal_error _halide_error_impl(Halide::InternalError)
#define _halide_runtime_error _halide_error_impl(Halide::RuntimeError)
#define _halide_internal_assert(c) _halide_assert_impl(c, Halide::InternalError)
#define _halide_user_assert(c) _halide_assert_impl(c, Halide::CompileError)

// N.B. Any function that might throw a user_assert or user_error may
// not be inlined into the user's code, or the line number will be
// misattributed to Halide.h. Either make such functions internal to
// libHalide, or mark them as HALIDE_NO_USER_CODE_INLINE.

// handler suitable for use with std::terminate; it will catch unhandled exceptions
// and log the `what()` to stderr, then abort. Exposed as a function to minimize
// the need for external code to need to know the definition of Halide::Error.
HALIDE_EXPORT_SYMBOL void unhandled_exception_handler();

}  // namespace Internal

}  // namespace Halide

#endif
#ifndef HALIDE_FLOAT16_H
#define HALIDE_FLOAT16_H

#include <cstdint>
#include <string>

namespace Halide {

/** Class that provides a type that implements half precision
 *  floating point (IEEE754 2008 binary16) in software.
 *
 *  This type is enforced to be 16-bits wide and maintains no state
 *  other than the raw IEEE754 binary16 bits so that it can passed
 *  to code that checks a type's size and used for halide_buffer_t allocation.
 * */
struct float16_t {

    static const int mantissa_bits = 10;
    static const uint16_t sign_mask = 0x8000;
    static const uint16_t exponent_mask = 0x7c00;
    static const uint16_t mantissa_mask = 0x03ff;

    /// \name Constructors
    /// @{

    /** Construct from a float, double, or int using
     * round-to-nearest-ties-to-even. Out-of-range values become +/-
     * infinity.
     */
    // @{
    explicit float16_t(float value);
    explicit float16_t(double value);
    explicit float16_t(int value);
    // @}

    /** Construct a float16_t with the bits initialised to 0. This represents
     * positive zero.*/
    float16_t() = default;

#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16
    /** Construct a float16_t from compiler's built-in _Float16 type. */
    explicit float16_t(_Float16 value) {
        memcpy(&data, &value, sizeof(_Float16));
    }
#endif

    /// @}

    // Use explicit to avoid accidently raising the precision
    /** Cast to float */
    explicit operator float() const;
    /** Cast to double */
    explicit operator double() const;
    /** Cast to int */
    explicit operator int() const;

#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16
    /** Cast to compiler's built-in _Float16 type. */
    explicit operator _Float16() const {
        _Float16 result;
        memcpy(&result, &data, sizeof(_Float16));
        return result;
    }
#endif

    /** Get a new float16_t that represents a special value */
    // @{
    static float16_t make_zero();
    static float16_t make_negative_zero();
    static float16_t make_infinity();
    static float16_t make_negative_infinity();
    static float16_t make_nan();
    // @}

    /** Get a new float16_t with the given raw bits
     *
     * \param bits The bits conformant to IEEE754 binary16
     */
    static float16_t make_from_bits(uint16_t bits);

    /** Return a new float16_t with a negated sign bit*/
    float16_t operator-() const;

    /** Arithmetic operators. */
    // @{
    float16_t operator+(float16_t rhs) const;
    float16_t operator-(float16_t rhs) const;
    float16_t operator*(float16_t rhs) const;
    float16_t operator/(float16_t rhs) const;
    float16_t operator+=(float16_t rhs) {
        return (*this = *this + rhs);
    }
    float16_t operator-=(float16_t rhs) {
        return (*this = *this - rhs);
    }
    float16_t operator*=(float16_t rhs) {
        return (*this = *this * rhs);
    }
    float16_t operator/=(float16_t rhs) {
        return (*this = *this / rhs);
    }
    // @}

    /** Comparison operators */
    // @{
    bool operator==(float16_t rhs) const;
    bool operator!=(float16_t rhs) const {
        return !(*this == rhs);
    }
    bool operator>(float16_t rhs) const;
    bool operator<(float16_t rhs) const;
    bool operator>=(float16_t rhs) const {
        return (*this > rhs) || (*this == rhs);
    }
    bool operator<=(float16_t rhs) const {
        return (*this < rhs) || (*this == rhs);
    }
    // @}

    /** Properties */
    // @{
    bool is_nan() const;
    bool is_infinity() const;
    bool is_negative() const;
    bool is_zero() const;
    // @}

    /** Returns the bits that represent this float16_t.
     *
     *  An alternative method to access the bits is to cast a pointer
     *  to this instance as a pointer to a uint16_t.
     **/
    uint16_t to_bits() const;

private:
    // The raw bits.
    uint16_t data = 0;
};

static_assert(sizeof(float16_t) == 2, "float16_t should occupy two bytes");

}  // namespace Halide

template<>
HALIDE_ALWAYS_INLINE constexpr halide_type_t halide_type_of<Halide::float16_t>() {
    return halide_type_t(halide_type_float, 16);
}

namespace Halide {

/** Class that provides a type that implements half precision
 *  floating point using the bfloat16 format.
 *
 *  This type is enforced to be 16-bits wide and maintains no state
 *  other than the raw bits so that it can passed to code that checks
 *  a type's size and used for halide_buffer_t allocation. */
struct bfloat16_t {

    static const int mantissa_bits = 7;
    static const uint16_t sign_mask = 0x8000;
    static const uint16_t exponent_mask = 0x7f80;
    static const uint16_t mantissa_mask = 0x007f;

    static const bfloat16_t zero, negative_zero, infinity, negative_infinity, nan;

    /// \name Constructors
    /// @{

    /** Construct from a float, double, or int using
     * round-to-nearest-ties-to-even. Out-of-range values become +/-
     * infinity.
     */
    // @{
    explicit bfloat16_t(float value);
    explicit bfloat16_t(double value);
    explicit bfloat16_t(int value);
    // @}

    /** Construct a bfloat16_t with the bits initialised to 0. This represents
     * positive zero.*/
    bfloat16_t() = default;

    /// @}

    // Use explicit to avoid accidently raising the precision
    /** Cast to float */
    explicit operator float() const;
    /** Cast to double */
    explicit operator double() const;
    /** Cast to int */
    explicit operator int() const;

    /** Get a new bfloat16_t that represents a special value */
    // @{
    static bfloat16_t make_zero();
    static bfloat16_t make_negative_zero();
    static bfloat16_t make_infinity();
    static bfloat16_t make_negative_infinity();
    static bfloat16_t make_nan();
    // @}

    /** Get a new bfloat16_t with the given raw bits
     *
     * \param bits The bits conformant to IEEE754 binary16
     */
    static bfloat16_t make_from_bits(uint16_t bits);

    /** Return a new bfloat16_t with a negated sign bit*/
    bfloat16_t operator-() const;

    /** Arithmetic operators. */
    // @{
    bfloat16_t operator+(bfloat16_t rhs) const;
    bfloat16_t operator-(bfloat16_t rhs) const;
    bfloat16_t operator*(bfloat16_t rhs) const;
    bfloat16_t operator/(bfloat16_t rhs) const;
    bfloat16_t operator+=(bfloat16_t rhs) {
        return (*this = *this + rhs);
    }
    bfloat16_t operator-=(bfloat16_t rhs) {
        return (*this = *this - rhs);
    }
    bfloat16_t operator*=(bfloat16_t rhs) {
        return (*this = *this * rhs);
    }
    bfloat16_t operator/=(bfloat16_t rhs) {
        return (*this = *this / rhs);
    }
    // @}

    /** Comparison operators */
    // @{
    bool operator==(bfloat16_t rhs) const;
    bool operator!=(bfloat16_t rhs) const {
        return !(*this == rhs);
    }
    bool operator>(bfloat16_t rhs) const;
    bool operator<(bfloat16_t rhs) const;
    bool operator>=(bfloat16_t rhs) const {
        return (*this > rhs) || (*this == rhs);
    }
    bool operator<=(bfloat16_t rhs) const {
        return (*this < rhs) || (*this == rhs);
    }
    // @}

    /** Properties */
    // @{
    bool is_nan() const;
    bool is_infinity() const;
    bool is_negative() const;
    bool is_zero() const;
    // @}

    /** Returns the bits that represent this bfloat16_t.
     *
     *  An alternative method to access the bits is to cast a pointer
     *  to this instance as a pointer to a uint16_t.
     **/
    uint16_t to_bits() const;

private:
    // The raw bits.
    uint16_t data = 0;
};

static_assert(sizeof(bfloat16_t) == 2, "bfloat16_t should occupy two bytes");

}  // namespace Halide

template<>
HALIDE_ALWAYS_INLINE constexpr halide_type_t halide_type_of<Halide::bfloat16_t>() {
    return halide_type_t(halide_type_bfloat, 16);
}

#endif
// Always use assert, even if llvm-config defines NDEBUG
#ifdef NDEBUG
#undef NDEBUG
#include <assert.h>
#define NDEBUG
#else
#include <cassert>
#endif

#ifndef HALIDE_UTIL_H
#define HALIDE_UTIL_H

/** \file
 * Various utility functions used internally Halide. */

#include <cmath>
#include <cstdint>
#include <cstring>
#include <functional>
#include <limits>
#include <sstream>
#include <string>
#include <utility>
#include <vector>


#ifdef Halide_STATIC_DEFINE
#define HALIDE_EXPORT
#else
#if defined(_MSC_VER)
// Halide_EXPORTS is quietly defined by CMake when building a shared library
#ifdef Halide_EXPORTS
#define HALIDE_EXPORT __declspec(dllexport)
#else
#define HALIDE_EXPORT __declspec(dllimport)
#endif
#else
#define HALIDE_EXPORT __attribute__((visibility("default")))
#endif
#endif

// If we're in user code, we don't want certain functions to be inlined.
#if defined(COMPILING_HALIDE) || defined(BUILDING_PYTHON)
#define HALIDE_NO_USER_CODE_INLINE
#else
#define HALIDE_NO_USER_CODE_INLINE HALIDE_NEVER_INLINE
#endif

// Clang uses __has_feature() for sanitizers...
#if defined(__has_feature)
#if __has_feature(address_sanitizer)
#define HALIDE_INTERNAL_USING_ASAN
#endif
#if __has_feature(memory_sanitizer)
#define HALIDE_INTERNAL_USING_MSAN
#endif
#if __has_feature(thread_sanitizer)
#define HALIDE_INTERNAL_USING_TSAN
#endif
#if __has_feature(coverage_sanitizer)
#define HALIDE_INTERNAL_USING_COVSAN
#endif
#if __has_feature(undefined_behavior_sanitizer)
#define HALIDE_INTERNAL_USING_UBSAN
#endif
#endif

// ...but GCC/MSVC don't like __has_feature, so handle them separately.
// (Only AddressSanitizer for now, not sure if any others are well-supported
// outside of Clang.
#if defined(__SANITIZE_ADDRESS__) && !defined(HALIDE_INTERNAL_USING_ASAN)
#define HALIDE_INTERNAL_USING_ASAN
#endif

namespace Halide {

/** Load a plugin in the form of a dynamic library (e.g. for custom autoschedulers).
 * If the string doesn't contain any . characters, the proper prefix and/or suffix
 * for the platform will be added:
 *
 *   foo -> libfoo.so (Linux/OSX/etc -- note that .dylib is not supported)
 *   foo -> foo.dll (Windows)
 *
 * otherwise, it is assumed to be an appropriate pathname.
 *
 * Any error in loading will assert-fail. */
void load_plugin(const std::string &lib_name);

namespace Internal {

/** Some numeric conversions are UB if the value won't fit in the result;
 * safe_numeric_cast<>() is meant as a drop-in replacement for a C/C++ cast
 * that adds well-defined behavior for the UB cases, attempting to mimic
 * common implementation behavior as much as possible.
 */
template<typename DST, typename SRC,
         std::enable_if_t<std::is_floating_point_v<SRC>> * = nullptr>
DST safe_numeric_cast(SRC s) {
    if (std::is_integral_v<DST>) {
        // Treat float -> int as a saturating cast; this is handled
        // in different ways by different compilers, so an arbitrary but safe
        // choice like this is reasonable.
        if (s < (SRC)std::numeric_limits<DST>::min()) {
            return std::numeric_limits<DST>::min();
        }
        if (s > (SRC)std::numeric_limits<DST>::max()) {
            return std::numeric_limits<DST>::max();
        }
    }
    return (DST)s;
}

template<typename DST, typename SRC,
         std::enable_if_t<std::is_integral_v<SRC>> * = nullptr>
DST safe_numeric_cast(SRC s) {
    if (std::is_integral_v<DST>) {
        // any-int -> signed-int is technically UB if value won't fit;
        // in practice, common compilers implement such conversions as done below
        // (as verified by exhaustive testing on Clang for x86-64). We could
        // probably continue to rely on that behavior, but making it explicit
        // avoids possible wrather of UBSan and similar debug helpers.
        // (Yes, using sizeof for this comparison is a little odd for the uint->int
        // case, but the intent is to match existing common behavior, which this does.)
        if (std::is_integral_v<SRC> && std::is_signed_v<DST> && sizeof(DST) < sizeof(SRC)) {
            using UnsignedSrc = std::make_unsigned_t<SRC>;
            return (DST)(s & (UnsignedSrc)(-1));
        }
    }
    return (DST)s;
}

/** An aggressive form of reinterpret cast used for correct type-punning. */
template<typename DstType, typename SrcType>
DstType reinterpret_bits(const SrcType &src) {
    static_assert(sizeof(SrcType) == sizeof(DstType), "Types must be same size");
    DstType dst;
    memcpy(&dst, &src, sizeof(SrcType));
    return dst;
}

/** Get value of an environment variable. Returns its value
 * is defined in the environment. If the var is not defined, an empty string
 * is returned.
 */
std::string get_env_variable(char const *env_var_name);

/** Get the name of the currently running executable. Platform-specific.
 * If program name cannot be retrieved, function returns an empty string. */
std::string running_program_name();

/** Generate a unique name starting with the given prefix. It's unique
 * relative to all other strings returned by unique_name in this
 * process.
 *
 * The single-character version always appends a numeric suffix to the
 * character.
 *
 * The string version will either return the input as-is (with high
 * probability on the first time it is called with that input), or
 * replace any existing '$' characters with underscores, then add a
 * '$' sign and a numeric suffix to it.
 *
 * Note that unique_name('f') therefore differs from
 * unique_name("f"). The former returns something like f123, and the
 * latter returns either f or f$123.
 */
// @{
std::string unique_name(char prefix);
std::string unique_name(const std::string &prefix);
// @}

/** Test if the first string starts with the second string */
bool starts_with(const std::string &str, const std::string &prefix);

/** Test if the first string ends with the second string */
bool ends_with(const std::string &str, const std::string &suffix);

/** Replace all matches of the second string in the first string with the last string.
 * The string to search-and-replace in is passed by value, offering the ability to
 * std::move() a string in if you're not interested in keeping the original string.
 * This is useful when the original string does not contain the find-string, causing
 * this function to return the same string without any copies being made. */
std::string replace_all(std::string str, const std::string &find, const std::string &replace);

/** Split the source string using 'delim' as the divider. */
std::vector<std::string> split_string(const std::string &source, const std::string &delim);

/** Join the source vector using 'delim' as the divider. */
template<typename T>
std::string join_strings(const std::vector<T> &sources, const std::string &delim) {
    size_t sz = 0;
    if (!sources.empty()) {
        sz += delim.size() * (sources.size() - 1);
    }
    for (const auto &s : sources) {
        sz += s.size();
    }
    std::string result;
    result.reserve(sz);
    bool need_delim = false;
    for (const auto &s : sources) {
        if (need_delim) {
            result += delim;
        }
        result += s;
        need_delim = true;
    }
    return result;
}

template<typename... Args>
std::string concat_strings(Args &&...args) {
    std::stringstream ss;
    (ss << ... << args);
    return ss.str();
}

/** Perform a left fold of a vector. Returns a default-constructed
 * vector element if the vector is empty. Similar to std::accumulate
 * but with a less clunky syntax. */
template<typename T, typename Fn>
T fold_left(const std::vector<T> &vec, Fn f) {
    T result;
    if (vec.empty()) {
        return result;
    }
    result = vec[0];
    for (size_t i = 1; i < vec.size(); i++) {
        result = f(result, vec[i]);
    }
    return result;
}

/** Returns a right fold of a vector. Returns a default-constructed
 * vector element if the vector is empty. */
template<typename T, typename Fn>
T fold_right(const std::vector<T> &vec, Fn f) {
    T result;
    if (vec.empty()) {
        return result;
    }
    result = vec.back();
    for (size_t i = vec.size() - 1; i > 0; i--) {
        result = f(vec[i - 1], result);
    }
    return result;
}

template<typename... T>
struct meta_and : std::true_type {};

template<typename T1, typename... Args>
struct meta_and<T1, Args...> : std::integral_constant<bool, T1::value && meta_and<Args...>::value> {};

template<typename... T>
struct meta_or : std::false_type {};

template<typename T1, typename... Args>
struct meta_or<T1, Args...> : std::integral_constant<bool, T1::value || meta_or<Args...>::value> {};

template<typename To, typename... Args>
struct all_are_convertible : meta_and<std::is_convertible<Args, To>...> {};

/** Returns base name and fills in namespaces, outermost one first in vector. */
std::string extract_namespaces(const std::string &name, std::vector<std::string> &namespaces);

/** Like extract_namespaces(), but strip and discard the namespaces, returning base name only */
std::string strip_namespaces(const std::string &name);

struct FileStat {
    uint64_t file_size;
    uint32_t mod_time;  // Unix epoch time
    uint32_t uid;
    uint32_t gid;
    uint32_t mode;
};

/** Create a unique file with a name of the form prefixXXXXXsuffix in an arbitrary
 * (but writable) directory; this is typically /tmp, but the specific
 * location is not guaranteed. (Note that the exact form of the file name
 * may vary; in particular, the suffix may be ignored on Windows.)
 * The file is created (but not opened), thus this can be called from
 * different threads (or processes, e.g. when building with parallel make)
 * without risking collision. Note that if this file is used as a temporary
 * file, the caller is responsibly for deleting it. Neither the prefix nor suffix
 * may contain a directory separator.
 */
std::string file_make_temp(const std::string &prefix, const std::string &suffix);

/** Create a unique directory in an arbitrary (but writable) directory; this is
 * typically somewhere inside /tmp, but the specific location is not guaranteed.
 * The directory will be empty (i.e., this will never return /tmp itself,
 * but rather a new directory inside /tmp). The caller is responsible for removing the
 * directory after use.
 */
std::string dir_make_temp();

/** Wrapper for access(). Quietly ignores errors. */
bool file_exists(const std::string &name);

/** assert-fail if the file doesn't exist. useful primarily for testing purposes. */
void assert_file_exists(const std::string &name);

/** assert-fail if the file DOES exist. useful primarily for testing purposes. */
void assert_no_file_exists(const std::string &name);

/** Wrapper for unlink(). Asserts upon error. */
void file_unlink(const std::string &name);

/** Wrapper for unlink(). Quietly ignores errors. */
void file_unlink(const std::string &name);

/** Ensure that no file with this path exists. If such a file
 * exists and cannot be removed, assert-fail. */
void ensure_no_file_exists(const std::string &name);

/** Wrapper for rmdir(). Asserts upon error. */
void dir_rmdir(const std::string &name);

/** Wrapper for stat(). Asserts upon error. */
FileStat file_stat(const std::string &name);

/** Read the entire contents of a file into a vector<char>. The file
 * is read in binary mode. Errors trigger an assertion failure. */
std::vector<char> read_entire_file(const std::string &pathname);

/** Create or replace the contents of a file with a given pointer-and-length
 * of memory. If the file doesn't exist, it is created; if it does exist, it
 * is completely overwritten. Any error triggers an assertion failure. */
void write_entire_file(const std::string &pathname, const void *source, size_t source_len);

inline void write_entire_file(const std::string &pathname, const std::vector<char> &source) {
    write_entire_file(pathname, source.data(), source.size());
}

/** A simple utility class that creates a temporary file in its ctor and
 * deletes that file in its dtor; this is useful for temporary files that you
 * want to ensure are deleted when exiting a certain scope. Since this is essentially
 * just an RAII wrapper around file_make_temp() and file_unlink(), it has the same
 * failure modes (i.e.: assertion upon error).
 */
class TemporaryFile final {
public:
    TemporaryFile(const std::string &prefix, const std::string &suffix)
        : temp_path(file_make_temp(prefix, suffix)) {
    }
    const std::string &pathname() const {
        return temp_path;
    }
    ~TemporaryFile() {
        if (do_unlink) {
            file_unlink(temp_path);
        }
    }
    // You can call this if you want to defeat the automatic deletion;
    // this is rarely what you want to do (since it defeats the purpose
    // of this class), but can be quite handy for debugging purposes.
    void detach() {
        do_unlink = false;
    }

private:
    const std::string temp_path;
    bool do_unlink = true;

public:
    TemporaryFile(const TemporaryFile &) = delete;
    TemporaryFile &operator=(const TemporaryFile &) = delete;
    TemporaryFile(TemporaryFile &&) = delete;
    TemporaryFile &operator=(TemporaryFile &&) = delete;
};

/** Routines to test if math would overflow for signed integers with
 * the given number of bits. */
// @{
bool add_would_overflow(int bits, int64_t a, int64_t b);
bool sub_would_overflow(int bits, int64_t a, int64_t b);
bool mul_would_overflow(int bits, int64_t a, int64_t b);
// @}

/** Routines to perform arithmetic on signed types without triggering signed
 * overflow. If overflow would occur, sets result to zero, and returns
 * false. Otherwise set result to the correct value, and returns true. */
// @{
HALIDE_MUST_USE_RESULT bool add_with_overflow(int bits, int64_t a, int64_t b, int64_t *result);
HALIDE_MUST_USE_RESULT bool sub_with_overflow(int bits, int64_t a, int64_t b, int64_t *result);
HALIDE_MUST_USE_RESULT bool mul_with_overflow(int bits, int64_t a, int64_t b, int64_t *result);
// @}

/** Helper class for saving/restoring variable values on the stack, to allow
 * for early-exit that preserves correctness */
template<typename T>
struct ScopedValue {
    T &var;
    T old_value;
    /** Preserve the old value, restored at dtor time */
    ScopedValue(T &var)
        : var(var), old_value(var) {
    }
    /** Preserve the old value, then set the var to a new value. */
    ScopedValue(T &var, const T &new_value)
        : var(var), old_value(var) {
        var = new_value;
    }
    ~ScopedValue() {
        var = old_value;
    }
    operator T() const {
        return old_value;
    }
    // allow move but not copy
    ScopedValue(const ScopedValue &that) = delete;
    ScopedValue(ScopedValue &&that) noexcept = default;
};

// Helpers for timing blocks of code. Put 'TIC;' at the start and
// 'TOC;' at the end. Timing is reported at the toc via
// debug(0). The calls can be nested and will pretty-print
// appropriately. Took this idea from matlab via Jon Barron.
//
// Note that this uses global state internally, and is not thread-safe
// at all. Only use it for single-threaded debugging sessions.

void halide_tic_impl(const char *file, int line);
void halide_toc_impl(const char *file, int line);
#define HALIDE_TIC Halide::Internal::halide_tic_impl(__FILE__, __LINE__)
#define HALIDE_TOC Halide::Internal::halide_toc_impl(__FILE__, __LINE__)
#ifdef COMPILING_HALIDE
#define TIC HALIDE_TIC
#define TOC HALIDE_TOC
#endif

// statically cast a value from one type to another: this is really just
// some syntactic sugar around static_cast<>() to avoid compiler warnings
// regarding 'bool' in some compliation configurations.
template<typename TO>
struct StaticCast {
    template<typename FROM>
    constexpr static TO value(const FROM &from) {
        if constexpr (std::is_same_v<TO, bool>) {
            return from != 0;
        } else {
            return static_cast<TO>(from);
        }
    }
};

// Like std::is_convertible, but with additional tests for arithmetic types:
// ensure that the value will roundtrip losslessly (e.g., no integer truncation
// or dropping of fractional parts).
template<typename TO>
struct IsRoundtrippable {
    template<typename FROM>
    constexpr static bool value(const FROM &from) {
        if constexpr (std::is_convertible_v<FROM, TO>) {
            if constexpr (std::is_arithmetic_v<TO> &&
                          std::is_arithmetic_v<FROM> &&
                          !std::is_same_v<TO, FROM>) {
                const TO to = static_cast<TO>(from);
                const FROM roundtripped = static_cast<FROM>(to);
                return roundtripped == from;
            } else {
                return true;
            }
        } else {
            return false;
        }
    }
};

template<typename T>
struct reverse_adaptor {
    T &range;
};

template<typename T>
auto begin(reverse_adaptor<T> i) {
    return std::rbegin(i.range);
}

template<typename T>
auto end(reverse_adaptor<T> i) {
    return std::rend(i.range);
}

/**
 * Reverse-order adaptor for range-based for-loops.
 * TODO: Replace with std::ranges::reverse_view when upgrading to C++20.
 */
template<typename T>
reverse_adaptor<T> reverse_view(T &&range) {
    return {range};
}

/** Emit a version of a string that is a valid identifier in C (. is replaced with _)
 * If prefix_underscore is true (the default), an underscore will be prepended if the
 * input starts with an alphabetic character to avoid reserved word clashes.
 */
std::string c_print_name(const std::string &name, bool prefix_underscore = true);

/** Return the LLVM_VERSION against which this libHalide is compiled. This is provided
 * only for internal tests which need to verify behavior; please don't use this outside
 * of Halide tests. */
int get_llvm_version();

}  // namespace Internal

/** Set how much stack the compiler should use for compilation in
 * bytes. This can also be set through the environment variable
 * HL_COMPILER_STACK_SIZE, though this function takes precedence. A
 * value of zero causes the compiler to just use the calling stack for
 * all compilation tasks.
 *
 * Calling this or setting the environment variable should not be
 * necessary. It is provided for three kinds of testing:
 *
 * First, Halide uses it in our internal tests to make sure
 * we're not using a silly amount of stack size on some
 * canary programs to avoid stack usage regressions.
 *
 * Second, if you have a mysterious crash inside a generator, you can
 * set a larger stack size as a way to test if it's a stack
 * overflow. Perhaps our default stack size is not large enough for
 * your program and schedule. Use this call or the environment var as
 * a workaround, and then open a bug with a reproducer at
 * github.com/halide/Halide/issues so that we can determine what's
 * going wrong that is causing your code to use so much stack.
 *
 * Third, perhaps using a side-stack is causing problems with
 * sanitizing, debugging, or profiling tools. If this is a problem,
 * you can set HL_COMPILER_STACK_SIZE to zero to make Halide stay on
 * the main thread's stack.
 */
void set_compiler_stack_size(size_t);

/** The default amount of stack used for lowering and codegen. 32 MB
 * ought to be enough for anyone. */
constexpr size_t default_compiler_stack_size = 32 * 1024 * 1024;

/** Return how much stack size the compiler should use for calls that
 * go through run_with_large_stack below. Currently that's lowering
 * and codegen. If no call to set_compiler_stack_size has been made,
 * this checks the value of the environment variable
 * HL_COMPILER_STACK_SIZE. If that's unset, it returns
 * default_compiler_stack_size, defined above. */
size_t get_compiler_stack_size();

namespace Internal {

/** Call the given action in a platform-specific context that
 * provides at least the stack space returned by
 * get_compiler_stack_size. If that value is zero, just calls the
 * function on the calling thread. Otherwise on Windows this
 * uses a Fiber, and on other platforms it uses swapcontext. */
void run_with_large_stack(const std::function<void()> &action);

/** Portable versions of popcount, count-leading-zeros, and
    count-trailing-zeros. */
// @{
int popcount64(uint64_t x);
int clz64(uint64_t x);
int ctz64(uint64_t x);
// @}

/** Return an integer 2^n, for some n,  which is >= x. Argument x must be > 0. */
inline int64_t next_power_of_two(int64_t x) {
    return static_cast<int64_t>(1) << static_cast<int64_t>(std::ceil(std::log2(x)));
}

template<typename T>
inline T align_up(T x, int n) {
    return (x + n - 1) / n * n;
}

}  // namespace Internal
}  // namespace Halide

#endif
#include <cstdint>

/** \file
 * Defines halide types
 */

/** A set of types to represent a C++ function signature. This allows
 * two things.  First, proper prototypes can be provided for Halide
 * generated functions, giving better compile time type
 * checking. Second, C++ name mangling can be done to provide link
 * time type checking for both Halide generated functions and calls
 * from Halide to external functions.
 *
 * These are intended to be constexpr producable.
 *
 * halide_handle_traits has to go outside the Halide namespace due to template
 * resolution rules. TODO(zalman): Do all types need to be in global namespace?
 */
//@{

/** A structure to represent the (unscoped) name of a C++ composite type for use
 * as a single argument (or return value) in a function signature.
 *
 * Currently does not support the restrict qualifier, references, or
 * r-value references.  These features cannot be used in extern
 * function calls from Halide or in the generated function from
 * Halide, but their applicability seems limited anyway.
 *
 * Although this is in the global namespace, it should be considered "Halide Internal"
 * and subject to change; code outside Halide should avoid referencing it.
 */
struct halide_cplusplus_type_name {
    /// An enum to indicate whether a C++ type is non-composite, a struct, class, or union
    enum CPPTypeType {
        Simple,       ///< "int"
        Struct,       ///< "struct Foo"
        Class,        ///< "class Foo"
        Union,        ///< "union Foo"
        Enum,         ///< "enum Foo"
    } cpp_type_type;  // Note: order is reflected in map_to_name table in CPlusPlusMangle.cpp

    std::string name;

    halide_cplusplus_type_name(CPPTypeType cpp_type_type, const std::string &name)
        : cpp_type_type(cpp_type_type), name(name) {
    }

    bool operator==(const halide_cplusplus_type_name &rhs) const {
        return cpp_type_type == rhs.cpp_type_type &&
               name == rhs.name;
    }

    bool operator!=(const halide_cplusplus_type_name &rhs) const {
        return !(*this == rhs);
    }

    bool operator<(const halide_cplusplus_type_name &rhs) const {
        return cpp_type_type < rhs.cpp_type_type ||
               (cpp_type_type == rhs.cpp_type_type &&
                name < rhs.name);
    }
};

/** A structure to represent the fully scoped name of a C++ composite
 * type for use in generating function signatures that use that type.
 *
 * This is intended to be a constexpr usable type.
 *
 * Although this is in the global namespace, it should be considered "Halide Internal"
 * and subject to change; code outside Halide should avoid referencing it.
 */
struct halide_handle_cplusplus_type {
    halide_cplusplus_type_name inner_name;
    std::vector<std::string> namespaces;
    std::vector<halide_cplusplus_type_name> enclosing_types;

    /// One set of modifiers on a type.
    /// The const/volatile/restrict properties are "inside" the pointer property.
    enum Modifier : uint8_t {
        Const = 1 << 0,            ///< Bitmask flag for "const"
        Volatile = 1 << 1,         ///< Bitmask flag for "volatile"
        Restrict = 1 << 2,         ///< Bitmask flag for "restrict"
        Pointer = 1 << 3,          ///< Bitmask flag for a pointer "*"
        FunctionTypedef = 1 << 4,  ///< Bitmask flag for a function typedef; when this is set, Pointer should also always be set
    };

    /// Qualifiers and indirections on type. 0 is innermost.
    std::vector<uint8_t> cpp_type_modifiers;

    /// References are separate because they only occur at the outermost level.
    /// No modifiers are needed for references as they are not allowed to apply
    /// to the reference itself. (This isn't true for restrict, but that is a C++
    /// extension anyway.) If modifiers are needed, the last entry in the above
    /// array would be the modifers for the reference.
    enum ReferenceType : uint8_t {
        NotReference = 0,
        LValueReference = 1,  // "&"
        RValueReference = 2,  // "&&"
    };
    ReferenceType reference_type;

    halide_handle_cplusplus_type(const halide_cplusplus_type_name &inner_name,
                                 const std::vector<std::string> &namespaces = {},
                                 const std::vector<halide_cplusplus_type_name> &enclosing_types = {},
                                 const std::vector<uint8_t> &modifiers = {},
                                 ReferenceType reference_type = NotReference)
        : inner_name(inner_name),
          namespaces(namespaces),
          enclosing_types(enclosing_types),
          cpp_type_modifiers(modifiers),
          reference_type(reference_type) {
    }

    template<typename T>
    static halide_handle_cplusplus_type make();
};
//@}

/** halide_c_type_to_name is a utility class used to provide a user-extensible
 * way of naming Handle types.
 *
 * Although this is in the global namespace, it should be considered "Halide Internal"
 * and subject to change; code outside Halide should avoid referencing it
 * directly (use the HALIDE_DECLARE_EXTERN_xxx macros instead).
 */
template<typename T>
struct halide_c_type_to_name {
    static constexpr bool known_type = false;
    static halide_cplusplus_type_name name() {
        return {halide_cplusplus_type_name::Simple, "void"};
    }
};

#define HALIDE_DECLARE_EXTERN_TYPE(TypeType, Type)                \
    template<>                                                    \
    struct halide_c_type_to_name<Type> {                          \
        static constexpr bool known_type = true;                  \
        static halide_cplusplus_type_name name() {                \
            return {halide_cplusplus_type_name::TypeType, #Type}; \
        }                                                         \
    }

#define HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(T) HALIDE_DECLARE_EXTERN_TYPE(Simple, T)
#define HALIDE_DECLARE_EXTERN_STRUCT_TYPE(T) HALIDE_DECLARE_EXTERN_TYPE(Struct, T)
#define HALIDE_DECLARE_EXTERN_CLASS_TYPE(T) HALIDE_DECLARE_EXTERN_TYPE(Class, T)
#define HALIDE_DECLARE_EXTERN_UNION_TYPE(T) HALIDE_DECLARE_EXTERN_TYPE(Union, T)

HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(char);
HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(bool);
HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(int8_t);
HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(uint8_t);
HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(int16_t);
HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(uint16_t);
HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(int32_t);
HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(uint32_t);
HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(int64_t);
HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(uint64_t);
HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(Halide::float16_t);
HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(Halide::bfloat16_t);
HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(halide_task_t);
HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(halide_loop_task_t);
#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16
HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(_Float16);
#endif
HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(float);
HALIDE_DECLARE_EXTERN_SIMPLE_TYPE(double);
HALIDE_DECLARE_EXTERN_STRUCT_TYPE(halide_buffer_t);
HALIDE_DECLARE_EXTERN_STRUCT_TYPE(halide_dimension_t);
HALIDE_DECLARE_EXTERN_STRUCT_TYPE(halide_device_interface_t);
HALIDE_DECLARE_EXTERN_STRUCT_TYPE(halide_filter_metadata_t);
HALIDE_DECLARE_EXTERN_STRUCT_TYPE(halide_semaphore_t);
HALIDE_DECLARE_EXTERN_STRUCT_TYPE(halide_semaphore_acquire_t);
HALIDE_DECLARE_EXTERN_STRUCT_TYPE(halide_parallel_task_t);

// You can make arbitrary user-defined types be "Known" using the
// macro above. This is useful for making Param<> arguments for
// Generators type safe. e.g.,
//
//    struct MyFunStruct { ... };
//
//    ...
//
//    HALIDE_DECLARE_EXTERN_STRUCT_TYPE(MyFunStruct);
//
//    ...
//
//    class MyGenerator : public Generator<MyGenerator> {
//       Param<const MyFunStruct *> my_struct_ptr;
//       ...
//    };

template<typename T>
/*static*/ halide_handle_cplusplus_type halide_handle_cplusplus_type::make() {
    constexpr bool is_ptr = std::is_pointer_v<T>;
    constexpr bool is_lvalue_reference = std::is_lvalue_reference_v<T>;
    constexpr bool is_rvalue_reference = std::is_rvalue_reference_v<T>;

    using TNoRef = std::remove_reference_t<T>;
    using TNoRefNoPtr = std::remove_pointer_t<TNoRef>;
    constexpr bool is_function_pointer = std::is_pointer_v<TNoRef> &&
                                         std::is_function_v<TNoRefNoPtr>;

    // Don't remove the pointer-ness from a function pointer.
    using TBase = std::conditional_t<is_function_pointer, TNoRef, TNoRefNoPtr>;
    constexpr bool is_const = std::is_const_v<TBase>;
    constexpr bool is_volatile = std::is_volatile_v<TBase>;

    constexpr uint8_t modifiers = static_cast<uint8_t>(
        (is_function_pointer ? FunctionTypedef : 0) |
        (is_ptr ? Pointer : 0) |
        (is_const ? Const : 0) |
        (is_volatile ? Volatile : 0));

    constexpr ReferenceType ref_type =
        (is_lvalue_reference ? LValueReference :
         is_rvalue_reference ? RValueReference :
                               NotReference);

    using TNonCVBase = std::remove_cv_t<TBase>;
    constexpr bool known_type = halide_c_type_to_name<TNonCVBase>::known_type;
    static_assert(!(!known_type && !is_ptr), "Unknown types must be pointers");

    halide_handle_cplusplus_type info = {
        halide_c_type_to_name<TNonCVBase>::name(),
        {},
        {},
        {modifiers},
        ref_type};
    // Pull off any namespaces
    info.inner_name.name = Halide::Internal::extract_namespaces(info.inner_name.name, info.namespaces);
    return info;
}

/** A type traits template to provide a halide_handle_cplusplus_type
 * value from a C++ type.
 *
 * Note the type represented is implicitly a pointer.
 *
 * A NULL pointer of type halide_handle_traits represents "void *".
 * This is chosen for compactness or representation as Type is a very
 * widely used data structure.
 *
 * Although this is in the global namespace, it should be considered "Halide Internal"
 * and subject to change; code outside Halide should avoid referencing it directly.
 */
template<typename T>
struct halide_handle_traits {
    // This trait must return a pointer to a global structure. I.e. it should never be freed.
    // A return value of nullptr here means "void *".
    HALIDE_ALWAYS_INLINE static const halide_handle_cplusplus_type *type_info() {
        if (std::is_pointer_v<T> ||
            std::is_lvalue_reference_v<T> ||
            std::is_rvalue_reference_v<T>) {
            static const halide_handle_cplusplus_type the_info = halide_handle_cplusplus_type::make<T>();
            return &the_info;
        }
        return nullptr;
    }
};

namespace Halide {

namespace Internal {
struct ConstantInterval;
}

struct Expr;

/** Types in the halide type system. They can be ints, unsigned ints,
 * or floats of various bit-widths (the 'bits' field). They can also
 * be vectors of the same (by setting the 'lanes' field to something
 * larger than one). Front-end code shouldn't use vector
 * types. Instead vectorize a function. */
struct Type {
private:
    halide_type_t type;

public:
    /** Aliases for halide_type_code_t values for legacy compatibility
     * and to match the Halide internal C++ style. */
    // @{
    static constexpr halide_type_code_t Int = halide_type_int;
    static constexpr halide_type_code_t UInt = halide_type_uint;
    static constexpr halide_type_code_t Float = halide_type_float;
    static constexpr halide_type_code_t BFloat = halide_type_bfloat;
    static constexpr halide_type_code_t Handle = halide_type_handle;
    // @}

    /** The number of bytes required to store a single scalar value of this type. Ignores vector lanes. */
    int bytes() const {
        return (bits() + 7) / 8;
    }

    // Default ctor initializes everything to predictable-but-unlikely values
    Type()
        : type(Handle, 0, 0) {
    }

    /** Construct a runtime representation of a Halide type from:
     * code: The fundamental type from an enum.
     * bits: The bit size of one element.
     * lanes: The number of vector elements in the type. */
    Type(halide_type_code_t code, int bits, int lanes, const halide_handle_cplusplus_type *handle_type = nullptr)
        : type(code, (uint8_t)bits, (uint16_t)lanes), handle_type(handle_type) {
        user_assert(lanes == type.lanes)
            << "Halide only supports vector types with up to 65535 lanes. " << lanes << " lanes requested.";
        user_assert(bits == type.bits)
            << "Halide only supports types with up to 255 bits. " << bits << " bits requested.";
    }

    /** Trivial copy constructor. */
    Type(const Type &that) = default;

    /** Trivial copy assignment operator. */
    Type &operator=(const Type &that) = default;

    /** Type is a wrapper around halide_type_t with more methods for use
     * inside the compiler. This simply constructs the wrapper around
     * the runtime value. */
    HALIDE_ALWAYS_INLINE
    Type(const halide_type_t &that, const halide_handle_cplusplus_type *handle_type = nullptr)
        : type(that), handle_type(handle_type) {
    }

    /** Unwrap the runtime halide_type_t for use in runtime calls, etc.
     * Representation is exactly equivalent. */
    HALIDE_ALWAYS_INLINE
    operator halide_type_t() const {
        return type;
    }

    /** Return the underlying data type of an element as an enum value. */
    HALIDE_ALWAYS_INLINE
    halide_type_code_t code() const {
        return (halide_type_code_t)type.code;
    }

    /** Return the bit size of a single element of this type. */
    HALIDE_ALWAYS_INLINE
    int bits() const {
        return type.bits;
    }

    /** Return the number of vector elements in this type. */
    HALIDE_ALWAYS_INLINE
    int lanes() const {
        return type.lanes;
    }

    /** Return Type with same number of bits and lanes, but new_code for a type code. */
    Type with_code(halide_type_code_t new_code) const {
        return Type(new_code, bits(), lanes(),
                    (new_code == code()) ? handle_type : nullptr);
    }

    /** Return Type with same type code and lanes, but new_bits for the number of bits. */
    Type with_bits(int new_bits) const {
        return Type(code(), new_bits, lanes(),
                    (new_bits == bits()) ? handle_type : nullptr);
    }

    /** Return Type with same type code and number of bits,
     * but new_lanes for the number of vector lanes. */
    Type with_lanes(int new_lanes) const {
        return Type(code(), bits(), new_lanes, handle_type);
    }

    /** Return Type with the same type code and number of lanes, but with at least twice as many bits. */
    Type widen() const {
        if (bits() == 1) {
            // Widening a 1-bit type should produce an 8-bit type.
            return with_bits(8);
        } else {
            return with_bits(bits() * 2);
        }
    }

    /** Return Type with the same type code and number of lanes, but with at most half as many bits. */
    Type narrow() const {
        internal_assert(bits() != 1) << "Attempting to narrow a 1-bit type\n";
        if (bits() == 8) {
            // Narrowing an 8-bit type should produce a 1-bit type.
            return with_bits(1);
        } else {
            return with_bits(bits() / 2);
        }
    }

    /** Type to be printed when declaring handles of this type. */
    const halide_handle_cplusplus_type *handle_type = nullptr;

    /** Is this type boolean (represented as UInt(1))? */
    HALIDE_ALWAYS_INLINE
    bool is_bool() const {
        return code() == UInt && bits() == 1;
    }

    /** Is this type a vector type? (lanes() != 1).
     * TODO(abadams): Decide what to do for lanes() == 0. */
    HALIDE_ALWAYS_INLINE
    bool is_vector() const {
        return lanes() != 1;
    }

    /** Is this type a scalar type? (lanes() == 1).
     * TODO(abadams): Decide what to do for lanes() == 0. */
    HALIDE_ALWAYS_INLINE
    bool is_scalar() const {
        return lanes() == 1;
    }

    /** Is this type a floating point type (float or double). */
    HALIDE_ALWAYS_INLINE
    bool is_float() const {
        return code() == Float || code() == BFloat;
    }

    /** Is this type a floating point type (float or double). */
    HALIDE_ALWAYS_INLINE
    bool is_bfloat() const {
        return code() == BFloat;
    }

    /** Is this type a signed integer type? */
    HALIDE_ALWAYS_INLINE
    bool is_int() const {
        return code() == Int;
    }

    /** Is this type an unsigned integer type? */
    HALIDE_ALWAYS_INLINE
    bool is_uint() const {
        return code() == UInt;
    }

    /** Is this type an integer type of any sort? */
    HALIDE_ALWAYS_INLINE
    bool is_int_or_uint() const {
        return code() == Int || code() == UInt;
    }

    /** Is this type an opaque handle type (void *) */
    HALIDE_ALWAYS_INLINE
    bool is_handle() const {
        return code() == Handle;
    }

    // Returns true iff type is a signed integral type where overflow is defined.
    HALIDE_ALWAYS_INLINE
    bool can_overflow_int() const {
        return is_int() && bits() <= 16;
    }

    // Returns true iff type does have a well-defined overflow behavior.
    HALIDE_ALWAYS_INLINE
    bool can_overflow() const {
        return is_uint() || can_overflow_int();
    }

    /** Check that the type name of two handles matches. */
    bool same_handle_type(const Type &other) const;

    /** Compare two types for equality */
    bool operator==(const Type &other) const {
        return type == other.type && (code() != Handle || same_handle_type(other));
    }

    /** Compare two types for inequality */
    bool operator!=(const Type &other) const {
        return type != other.type || (code() == Handle && !same_handle_type(other));
    }

    /** Compare two types for equality */
    bool operator==(const halide_type_t &other) const {
        return type == other;
    }

    /** Compare two types for inequality */
    bool operator!=(const halide_type_t &other) const {
        return type != other;
    }

    /** Compare ordering of two types so they can be used in certain containers and algorithms */
    bool operator<(const Type &other) const {
        if (type < other.type) {
            return true;
        }
        if (code() == Handle) {
            return handle_type < other.handle_type;
        }
        return false;
    }

    /** Produce the scalar type (that of a single element) of this vector type */
    Type element_of() const {
        return with_lanes(1);
    }

    /** Can this type represent all values of another type? */
    bool can_represent(Type other) const;

    /** Can this type represent exactly all integer values of some constant
     * integer range? */
    bool can_represent(const Internal::ConstantInterval &in) const;

    /** Can this type represent a particular constant? */
    // @{
    bool can_represent(double x) const;
    bool can_represent(int64_t x) const;
    bool can_represent(uint64_t x) const;
    // @}

    /** Check if an integer constant value is the maximum or minimum
     * representable value for this type. */
    // @{
    bool is_max(uint64_t) const;
    bool is_max(int64_t) const;
    bool is_min(uint64_t) const;
    bool is_min(int64_t) const;
    // @}

    /** Return an expression which is the maximum value of this type.
     * Returns infinity for types which can represent it. */
    Expr max() const;

    /** Return an expression which is the minimum value of this type.
     * Returns -infinity for types which can represent it. */
    Expr min() const;
};

/** Constructing a signed integer type */
inline Type Int(int bits, int lanes = 1) {
    return Type(Type::Int, bits, lanes);
}

/** Constructing an unsigned integer type */
inline Type UInt(int bits, int lanes = 1) {
    return Type(Type::UInt, bits, lanes);
}

/** Construct a floating-point type */
inline Type Float(int bits, int lanes = 1) {
    return Type(Type::Float, bits, lanes);
}

/** Construct a floating-point type in the bfloat format. Only 16-bit currently supported. */
inline Type BFloat(int bits, int lanes = 1) {
    return Type(Type::BFloat, bits, lanes);
}

/** Construct a boolean type */
inline Type Bool(int lanes = 1) {
    return UInt(1, lanes);
}

/** Construct a handle type */
inline Type Handle(int lanes = 1, const halide_handle_cplusplus_type *handle_type = nullptr) {
    return Type(Type::Handle, 64, lanes, handle_type);
}

/** Construct the halide equivalent of a C type */
template<typename T>
inline Type type_of() {
    return Type(halide_type_of<T>(), halide_handle_traits<T>::type_info());
}

/** Halide type to a C++ type */
std::string type_to_c_type(Type type, bool include_space, bool c_plus_plus = true);

}  // namespace Halide

#endif

namespace Halide {

struct bfloat16_t;
struct float16_t;

namespace Internal {

class IRMutator;
class IRVisitor;

/** All our IR node types get unique IDs for the purposes of RTTI */
enum class IRNodeType {
    // Exprs, in order of strength. Code in IRMatch.h and the
    // simplifier relies on this order for canonicalization of
    // expressions, so you may need to update those modules if you
    // change this list.
    IntImm,
    UIntImm,
    FloatImm,
    StringImm,
    Broadcast,
    Cast,
    Reinterpret,
    Variable,
    Add,
    Sub,
    Mod,
    Mul,
    Div,
    Min,
    Max,
    EQ,
    NE,
    LT,
    LE,
    GT,
    GE,
    And,
    Or,
    Not,
    Select,
    Load,
    Ramp,
    Call,
    Let,
    Shuffle,
    VectorReduce,
    // Stmts
    LetStmt,
    AssertStmt,
    ProducerConsumer,
    For,
    Acquire,
    Store,
    Provide,
    Allocate,
    Free,
    Realize,
    Block,
    Fork,
    IfThenElse,
    Evaluate,
    Prefetch,
    Atomic,
    HoistedStorage
};

constexpr IRNodeType StrongestExprNodeType = IRNodeType::VectorReduce;

/** The abstract base classes for a node in the Halide IR. */
struct IRNode {

    /** We use the visitor pattern to traverse IR nodes throughout the
     * compiler, so we have a virtual accept method which accepts
     * visitors.
     */
    virtual void accept(IRVisitor *v) const = 0;
    IRNode(IRNodeType t)
        : node_type(t) {
    }
    virtual ~IRNode() = default;

    /** These classes are all managed with intrusive reference
     * counting, so we also track a reference count. It's mutable
     * so that we can do reference counting even through const
     * references to IR nodes.
     */
    mutable RefCount ref_count;

    /** Each IR node subclass has a unique identifier. We can compare
     * these values to do runtime type identification. We don't
     * compile with rtti because that injects run-time type
     * identification stuff everywhere (and often breaks when linking
     * external libraries compiled without it), and we only want it
     * for IR nodes. One might want to put this value in the vtable,
     * but that adds another level of indirection, and for Exprs we
     * have 32 free bits in between the ref count and the Type
     * anyway, so this doesn't increase the memory footprint of an IR node.
     */
    IRNodeType node_type;
};

template<>
inline RefCount &ref_count<IRNode>(const IRNode *t) noexcept {
    return t->ref_count;
}

template<>
inline void destroy<IRNode>(const IRNode *t) {
    delete t;
}

/** IR nodes are split into expressions and statements. These are
   similar to expressions and statements in C - expressions
   represent some value and have some type (e.g. x + 3), and
   statements are side-effecting pieces of code that do not
   represent a value (e.g. assert(x > 3)) */

/** A base class for statement nodes. They have no properties or
   methods beyond base IR nodes for now. */
struct BaseStmtNode : public IRNode {
    BaseStmtNode(IRNodeType t)
        : IRNode(t) {
    }
    virtual Stmt mutate_stmt(IRMutator *v) const = 0;
};

/** A base class for expression nodes. They all contain their types
 * (e.g. Int(32), Float(32)) */
struct BaseExprNode : public IRNode {
    BaseExprNode(IRNodeType t)
        : IRNode(t) {
    }
    virtual Expr mutate_expr(IRMutator *v) const = 0;
    Type type;
};

/** We use the "curiously recurring template pattern" to avoid
   duplicated code in the IR Nodes. These classes live between the
   abstract base classes and the actual IR Nodes in the
   inheritance hierarchy. It provides an implementation of the
   accept function necessary for the visitor pattern to work, and
   a concrete instantiation of a unique IRNodeType per class. */
template<typename T>
struct ExprNode : public BaseExprNode {
    void accept(IRVisitor *v) const override;
    Expr mutate_expr(IRMutator *v) const override;
    ExprNode()
        : BaseExprNode(T::_node_type) {
    }
    ~ExprNode() override = default;
};

template<typename T>
struct StmtNode : public BaseStmtNode {
    void accept(IRVisitor *v) const override;
    Stmt mutate_stmt(IRMutator *v) const override;
    StmtNode()
        : BaseStmtNode(T::_node_type) {
    }
    ~StmtNode() override = default;
};

/** IR nodes are passed around opaque handles to them. This is a
   base class for those handles. It manages the reference count,
   and dispatches visitors. */
struct IRHandle : public IntrusivePtr<const IRNode> {
    HALIDE_ALWAYS_INLINE
    IRHandle() = default;

    HALIDE_ALWAYS_INLINE
    IRHandle(const IRNode *p)
        : IntrusivePtr<const IRNode>(p) {
    }

    /** Dispatch to the correct visitor method for this node. E.g. if
     * this node is actually an Add node, then this will call
     * IRVisitor::visit(const Add *) */
    void accept(IRVisitor *v) const {
        ptr->accept(v);
    }

    /** Downcast this ir node to its actual type (e.g. Add, or
     * Select). This returns nullptr if the node is not of the requested
     * type. Example usage:
     *
     * if (const Add *add = node->as<Add>()) {
     *   // This is an add node
     * }
     */
    template<typename T>
    const T *as() const {
        if (ptr && ptr->node_type == T::_node_type) {
            return (const T *)ptr;
        }
        return nullptr;
    }

    IRNodeType node_type() const {
        return ptr->node_type;
    }
};

/** Integer constants */
struct IntImm : public ExprNode<IntImm> {
    int64_t value;

    static const IntImm *make(Type t, int64_t value);

    static const IRNodeType _node_type = IRNodeType::IntImm;
};

/** Unsigned integer constants */
struct UIntImm : public ExprNode<UIntImm> {
    uint64_t value;

    static const UIntImm *make(Type t, uint64_t value);

    static const IRNodeType _node_type = IRNodeType::UIntImm;
};

/** Floating point constants */
struct FloatImm : public ExprNode<FloatImm> {
    double value;

    static const FloatImm *make(Type t, double value);

    static const IRNodeType _node_type = IRNodeType::FloatImm;
};

/** String constants */
struct StringImm : public ExprNode<StringImm> {
    std::string value;

    static const StringImm *make(const std::string &val);

    static const IRNodeType _node_type = IRNodeType::StringImm;
};

}  // namespace Internal

/** A fragment of Halide syntax. It's implemented as reference-counted
 * handle to a concrete expression node, but it's immutable, so you
 * can treat it as a value type. */
struct Expr : public Internal::IRHandle {
    /** Make an undefined expression */
    HALIDE_ALWAYS_INLINE
    Expr() = default;

    /** Make an expression from a concrete expression node pointer (e.g. Add) */
    HALIDE_ALWAYS_INLINE
    Expr(const Internal::BaseExprNode *n)
        : IRHandle(n) {
    }

    /** Make an expression representing numeric constants of various types. */
    // @{
    explicit Expr(int8_t x)
        : IRHandle(Internal::IntImm::make(Int(8), x)) {
    }
    explicit Expr(int16_t x)
        : IRHandle(Internal::IntImm::make(Int(16), x)) {
    }
    Expr(int32_t x)
        : IRHandle(Internal::IntImm::make(Int(32), x)) {
    }
    explicit Expr(int64_t x)
        : IRHandle(Internal::IntImm::make(Int(64), x)) {
    }
    explicit Expr(uint8_t x)
        : IRHandle(Internal::UIntImm::make(UInt(8), x)) {
    }
    explicit Expr(uint16_t x)
        : IRHandle(Internal::UIntImm::make(UInt(16), x)) {
    }
    explicit Expr(uint32_t x)
        : IRHandle(Internal::UIntImm::make(UInt(32), x)) {
    }
    explicit Expr(uint64_t x)
        : IRHandle(Internal::UIntImm::make(UInt(64), x)) {
    }
    Expr(float16_t x)
        : IRHandle(Internal::FloatImm::make(Float(16), (double)x)) {
    }
    Expr(bfloat16_t x)
        : IRHandle(Internal::FloatImm::make(BFloat(16), (double)x)) {
    }
#ifdef HALIDE_CPP_COMPILER_HAS_FLOAT16
    explicit Expr(_Float16 x)
        : IRHandle(Internal::FloatImm::make(Float(16), (double)x)) {
    }
#endif
    Expr(float x)
        : IRHandle(Internal::FloatImm::make(Float(32), x)) {
    }
    explicit Expr(double x)
        : IRHandle(Internal::FloatImm::make(Float(64), x)) {
    }
    // @}

    /** Make an expression representing a const string (i.e. a StringImm) */
    Expr(const std::string &s)
        : IRHandle(Internal::StringImm::make(s)) {
    }

    /** Override get() to return a BaseExprNode * instead of an IRNode * */
    HALIDE_ALWAYS_INLINE
    const Internal::BaseExprNode *get() const {
        return (const Internal::BaseExprNode *)ptr;
    }

    /** Get the type of this expression node */
    HALIDE_ALWAYS_INLINE
    Type type() const {
        return get()->type;
    }
};

/** This lets you use an Expr as a key in a map of the form
 * map<Expr, Foo, ExprCompare> */
struct ExprCompare {
    bool operator()(const Expr &a, const Expr &b) const {
        return a.get() < b.get();
    }
};

/** A single-dimensional span. Includes all numbers between min and
 * (min + extent - 1). */
struct Range {
    Expr min, extent;

    Range() = default;
    Range(const Expr &min_in, const Expr &extent_in);
};

/** A multi-dimensional box. The outer product of the elements */
typedef std::vector<Range> Region;

/** An enum describing different address spaces to be used with Func::store_in. */
enum class MemoryType {
    /** Let Halide select a storage type automatically */
    Auto,

    /** Heap/global memory. Allocated using halide_malloc, or
     * halide_device_malloc */
    Heap,

    /** Stack memory. Allocated using alloca. Requires a constant
     * size. Corresponds to per-thread local memory on the GPU. If all
     * accesses are at constant coordinates, may be promoted into the
     * register file at the discretion of the register allocator. */
    Stack,

    /** Register memory. The allocation should be promoted into the
     * register file. All stores must be at constant coordinates. May
     * be spilled to the stack at the discretion of the register
     * allocator. */
    Register,

    /** Allocation is stored in GPU shared memory. Also known as
     * "local" in OpenCL, and "threadgroup" in metal. Can be shared
     * across GPU threads within the same block. */
    GPUShared,

    /** Allocation is stored in GPU texture memory and accessed through
     * hardware sampler */
    GPUTexture,

    /** Allocate Locked Cache Memory to act as local memory */
    LockedCache,
    /** Vector Tightly Coupled Memory. HVX (Hexagon) local memory available on
     * v65+. This memory has higher performance and lower power. Ideal for
     * intermediate buffers. Necessary for vgather-vscatter instructions
     * on Hexagon */
    VTCM,

    /** AMX Tile register for X86. Any data that would be used in an AMX matrix
     * multiplication must first be loaded into an AMX tile register. */
    AMXTile,
};

namespace Internal {

/** An enum describing a type of loop traversal. Used in schedules,
 * and in the For loop IR node. Serial is a conventional ordered for
 * loop. Iterations occur in increasing order, and each iteration must
 * appear to have finished before the next begins. Parallel, GPUBlock,
 * and GPUThread are parallel and unordered: iterations may occur in
 * any order, and multiple iterations may occur
 * simultaneously. Vectorized and GPULane are parallel and
 * synchronous: they act as if all iterations occur at the same time
 * in lockstep. */
enum class ForType {
    Serial,
    Parallel,
    Vectorized,
    Unrolled,
    Extern,
    GPUBlock,
    GPUThread,
    GPULane,
};

/** Check if for_type executes for loop iterations in parallel and unordered. */
bool is_unordered_parallel(ForType for_type);

/** Returns true if for_type executes for loop iterations in parallel. */
bool is_parallel(ForType for_type);

/** Returns true if for_type is GPUBlock, GPUThread, or GPULane. */
bool is_gpu(ForType for_type);

/** A reference-counted handle to a statement node. */
struct Stmt : public IRHandle {
    Stmt() = default;
    Stmt(const BaseStmtNode *n)
        : IRHandle(n) {
    }

    /** Override get() to return a BaseStmtNode * instead of an IRNode * */
    HALIDE_ALWAYS_INLINE
    const BaseStmtNode *get() const {
        return (const Internal::BaseStmtNode *)ptr;
    }

    /** This lets you use a Stmt as a key in a map of the form
     * map<Stmt, Foo, Stmt::Compare> */
    struct Compare {
        bool operator()(const Stmt &a, const Stmt &b) const {
            return a.ptr < b.ptr;
        }
    };
};

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_TARGET_H
#define HALIDE_TARGET_H

/** \file
 * Defines the structure that describes a Halide target.
 */

#include <bitset>
#include <cstdint>
#include <string>

#ifndef HALIDE_DEVICEAPI_H
#define HALIDE_DEVICEAPI_H

/** \file
 * Defines DeviceAPI.
 */

#include <string>
#include <vector>

namespace Halide {

/** An enum describing a type of device API. Used by schedules, and in
 * the For loop IR node. */
enum class DeviceAPI {
    None,  /// Used to denote for loops that run on the same device as the containing code.
    Host,
    Default_GPU,
    CUDA,
    OpenCL,
    Metal,
    Hexagon,
    HexagonDma,
    D3D12Compute,
    Vulkan,
    WebGPU,
};

/** An array containing all the device apis. Useful for iterating
 * through them. */
const DeviceAPI all_device_apis[] = {DeviceAPI::None,
                                     DeviceAPI::Host,
                                     DeviceAPI::Default_GPU,
                                     DeviceAPI::CUDA,
                                     DeviceAPI::OpenCL,
                                     DeviceAPI::Metal,
                                     DeviceAPI::Hexagon,
                                     DeviceAPI::HexagonDma,
                                     DeviceAPI::D3D12Compute,
                                     DeviceAPI::Vulkan,
                                     DeviceAPI::WebGPU};

}  // namespace Halide

#endif  // HALIDE_DEVICEAPI_H

namespace Halide {

/** A struct representing a target machine and os to generate code for. */
struct Target {
    /** The operating system used by the target. Determines which
     * system calls to generate.
     * Corresponds to os_name_map in Target.cpp. */
    enum OS {
        OSUnknown = 0,
        Linux,
        Windows,
        OSX,
        Android,
        IOS,
        QuRT,
        NoOS,
        Fuchsia,
        WebAssemblyRuntime
    } os = OSUnknown;

    /** The architecture used by the target. Determines the
     * instruction set to use.
     * Corresponds to arch_name_map in Target.cpp. */
    enum Arch {
        ArchUnknown = 0,
        X86,
        ARM,
        Hexagon,
        POWERPC,
        WebAssembly,
        RISCV
    } arch = ArchUnknown;

    /** The bit-width of the target machine. Must be 0 for unknown, or 32 or 64. */
    int bits = 0;

    /** The bit-width of a vector register for targets where this is configurable and
     * targeting a fixed size is desired. The default of 0 indicates no assumption of
     * fixed size is allowed. */
    int vector_bits = 0;

    /** The specific processor to be targeted, tuned for.
     * Corresponds to processor_name_map in Target.cpp.
     *
     * New entries should be added to the end. */
    enum Processor {
        /// Do not tune for any specific CPU. In practice, this means that halide will decide the tune CPU based on the enabled features.
        ProcessorGeneric = 0,
        K8,        /// Tune for AMD K8 Hammer CPU (AMD Family 0Fh, launched 2003).
        K8_SSE3,   /// Tune for later versions of AMD K8 CPU, with SSE3 support.
        AMDFam10,  /// Tune for AMD K10 "Barcelona" CPU (AMD Family 10h, launched 2007).
        BtVer1,    /// Tune for AMD Bobcat CPU (AMD Family 14h, launched 2011).
        BdVer1,    /// Tune for AMD Bulldozer CPU (AMD Family 15h, launched 2011).
        BdVer2,    /// Tune for AMD Piledriver CPU (AMD Family 15h (2nd-gen), launched 2012).
        BdVer3,    /// Tune for AMD Steamroller CPU (AMD Family 15h (3nd-gen), launched 2014).
        BdVer4,    /// Tune for AMD Excavator CPU (AMD Family 15h (4th-gen), launched 2015).
        BtVer2,    /// Tune for AMD Jaguar CPU (AMD Family 16h, launched 2013).
        ZnVer1,    /// Tune for AMD Zen   CPU (AMD Family 17h, launched 2017).
        ZnVer2,    /// Tune for AMD Zen 2 CPU (AMD Family 17h, launched 2019).
        ZnVer3,    /// Tune for AMD Zen 3 CPU (AMD Family 19h, launched 2020).
        ZnVer4,    /// Tune for AMD Zen 4 CPU (AMD Family 19h, launched 2022).
        ZnVer5,    /// Tune for AMD Zen 5 CPU (AMD Family 1Ah, launched 2024).
    } processor_tune = ProcessorGeneric;

    /** Optional features a target can have.
     * Corresponds to feature_name_map in Target.cpp.
     * See definitions in HalideRuntime.h for full information.
     */
    enum Feature {
        JIT = halide_target_feature_jit,
        Debug = halide_target_feature_debug,
        EnableBacktraces = halide_target_feature_enable_backtraces,
        NoAsserts = halide_target_feature_no_asserts,
        NoBoundsQuery = halide_target_feature_no_bounds_query,
        SSE41 = halide_target_feature_sse41,
        AVX = halide_target_feature_avx,
        AVX2 = halide_target_feature_avx2,
        AVXVNNI = halide_target_feature_avxvnni,
        FMA = halide_target_feature_fma,
        FMA4 = halide_target_feature_fma4,
        F16C = halide_target_feature_f16c,
        ARMv7s = halide_target_feature_armv7s,
        NoNEON = halide_target_feature_no_neon,
        VSX = halide_target_feature_vsx,
        POWER_ARCH_2_07 = halide_target_feature_power_arch_2_07,
        CUDA = halide_target_feature_cuda,
        CUDACapability30 = halide_target_feature_cuda_capability30,
        CUDACapability32 = halide_target_feature_cuda_capability32,
        CUDACapability35 = halide_target_feature_cuda_capability35,
        CUDACapability50 = halide_target_feature_cuda_capability50,
        CUDACapability61 = halide_target_feature_cuda_capability61,
        CUDACapability70 = halide_target_feature_cuda_capability70,
        CUDACapability75 = halide_target_feature_cuda_capability75,
        CUDACapability80 = halide_target_feature_cuda_capability80,
        CUDACapability86 = halide_target_feature_cuda_capability86,
        OpenCL = halide_target_feature_opencl,
        CLDoubles = halide_target_feature_cl_doubles,
        CLHalf = halide_target_feature_cl_half,
        CLAtomics64 = halide_target_feature_cl_atomic64,
        EGL = halide_target_feature_egl,
        UserContext = halide_target_feature_user_context,
        Profile = halide_target_feature_profile,
        NoRuntime = halide_target_feature_no_runtime,
        Metal = halide_target_feature_metal,
        CPlusPlusMangling = halide_target_feature_c_plus_plus_mangling,
        LargeBuffers = halide_target_feature_large_buffers,
        HexagonDma = halide_target_feature_hexagon_dma,
        HVX_128 = halide_target_feature_hvx_128,
        HVX = HVX_128,
        HVX_v62 = halide_target_feature_hvx_v62,
        HVX_v65 = halide_target_feature_hvx_v65,
        HVX_v66 = halide_target_feature_hvx_v66,
        HVX_v68 = halide_target_feature_hvx_v68,
        FuzzFloatStores = halide_target_feature_fuzz_float_stores,
        SoftFloatABI = halide_target_feature_soft_float_abi,
        MSAN = halide_target_feature_msan,
        AVX512 = halide_target_feature_avx512,
        AVX512_KNL = halide_target_feature_avx512_knl,
        AVX512_Skylake = halide_target_feature_avx512_skylake,
        AVX512_Cannonlake = halide_target_feature_avx512_cannonlake,
        AVX512_SapphireRapids = halide_target_feature_avx512_sapphirerapids,
        AVX512_Zen4 = halide_target_feature_avx512_zen4,
        AVX512_Zen5 = halide_target_feature_avx512_zen5,
        TraceLoads = halide_target_feature_trace_loads,
        TraceStores = halide_target_feature_trace_stores,
        TraceRealizations = halide_target_feature_trace_realizations,
        TracePipeline = halide_target_feature_trace_pipeline,
        D3D12Compute = halide_target_feature_d3d12compute,
        StrictFloat = halide_target_feature_strict_float,
        TSAN = halide_target_feature_tsan,
        ASAN = halide_target_feature_asan,
        CheckUnsafePromises = halide_target_feature_check_unsafe_promises,
        EmbedBitcode = halide_target_feature_embed_bitcode,
        EnableLLVMLoopOpt = halide_target_feature_enable_llvm_loop_opt,
        WasmMvpOnly = halide_target_feature_wasm_mvponly,
        WasmSimd128 = halide_target_feature_wasm_simd128,
        WasmThreads = halide_target_feature_wasm_threads,
        WasmBulkMemory = halide_target_feature_wasm_bulk_memory,
        WebGPU = halide_target_feature_webgpu,
        SVE = halide_target_feature_sve,
        SVE2 = halide_target_feature_sve2,
        ARMDotProd = halide_target_feature_arm_dot_prod,
        ARMFp16 = halide_target_feature_arm_fp16,
        LLVMLargeCodeModel = halide_llvm_large_code_model,
        RVV = halide_target_feature_rvv,
        ARMv8a = halide_target_feature_armv8a,
        ARMv81a = halide_target_feature_armv81a,
        ARMv82a = halide_target_feature_armv82a,
        ARMv83a = halide_target_feature_armv83a,
        ARMv84a = halide_target_feature_armv84a,
        ARMv85a = halide_target_feature_armv85a,
        ARMv86a = halide_target_feature_armv86a,
        ARMv87a = halide_target_feature_armv87a,
        ARMv88a = halide_target_feature_armv88a,
        ARMv89a = halide_target_feature_armv89a,
        SanitizerCoverage = halide_target_feature_sanitizer_coverage,
        ProfileByTimer = halide_target_feature_profile_by_timer,
        SPIRV = halide_target_feature_spirv,
        Vulkan = halide_target_feature_vulkan,
        VulkanInt8 = halide_target_feature_vulkan_int8,
        VulkanInt16 = halide_target_feature_vulkan_int16,
        VulkanInt64 = halide_target_feature_vulkan_int64,
        VulkanFloat16 = halide_target_feature_vulkan_float16,
        VulkanFloat64 = halide_target_feature_vulkan_float64,
        VulkanV10 = halide_target_feature_vulkan_version10,
        VulkanV12 = halide_target_feature_vulkan_version12,
        VulkanV13 = halide_target_feature_vulkan_version13,
        Semihosting = halide_target_feature_semihosting,
        AVX10_1 = halide_target_feature_avx10_1,
        X86APX = halide_target_feature_x86_apx,
        Simulator = halide_target_feature_simulator,
        FeatureEnd = halide_target_feature_end
    };
    Target() = default;
    Target(OS o, Arch a, int b, Processor pt, const std::vector<Feature> &initial_features = std::vector<Feature>(),
           int vb = 0)
        : os(o), arch(a), bits(b), vector_bits(vb), processor_tune(pt) {
        for (const auto &f : initial_features) {
            set_feature(f);
        }
        validate_features();
    }

    Target(OS o, Arch a, int b, const std::vector<Feature> &initial_features = std::vector<Feature>())
        : Target(o, a, b, ProcessorGeneric, initial_features) {
    }

    /** Given a string of the form used in HL_TARGET
     * (e.g. "x86-64-avx"), construct the Target it specifies. Note
     * that this always starts with the result of get_host_target(),
     * replacing only the parts found in the target string, so if you
     * omit (say) an OS specification, the host OS will be used
     * instead. An empty string is exactly equivalent to
     * get_host_target().
     *
     * Invalid target strings will fail with a user_error.
     */
    // @{
    explicit Target(const std::string &s);
    explicit Target(const char *s);
    // @}

    /** Check if a target string is valid. */
    static bool validate_target_string(const std::string &s);

    /** Return true if any of the arch/bits/os fields are "unknown"/0;
        return false otherwise. */
    bool has_unknowns() const;

    void set_feature(Feature f, bool value = true);

    void set_features(const std::vector<Feature> &features_to_set, bool value = true);

    bool has_feature(Feature f) const;

    bool has_feature(halide_target_feature_t f) const {
        return has_feature((Feature)f);
    }

    bool features_any_of(const std::vector<Feature> &test_features) const;

    bool features_all_of(const std::vector<Feature> &test_features) const;

    /** Return a copy of the target with the given feature set.
     * This is convenient when enabling certain features (e.g. NoBoundsQuery)
     * in an initialization list, where the target to be mutated may be
     * a const reference. */
    Target with_feature(Feature f) const;

    /** Return a copy of the target with the given feature cleared.
     * This is convenient when disabling certain features (e.g. NoBoundsQuery)
     * in an initialization list, where the target to be mutated may be
     * a const reference. */
    Target without_feature(Feature f) const;

    /** Is a fully feature GPU compute runtime enabled? I.e. is
     * Func::gpu_tile and similar going to work? Currently includes
     * CUDA, OpenCL, Metal and D3D12Compute. */
    bool has_gpu_feature() const;

    /** Does this target allow using a certain type. Generally all
     * types except 64-bit float and int/uint should be supported by
     * all backends.
     *
     * It is likely better to call the version below which takes a DeviceAPI.
     */
    bool supports_type(const Type &t) const;

    /** Does this target allow using a certain type on a certain device.
     * This is the prefered version of this routine.
     */
    bool supports_type(const Type &t, DeviceAPI device) const;

    /** Returns whether a particular device API can be used with this
     * Target. */
    bool supports_device_api(DeviceAPI api) const;

    /** If this Target (including all Features) requires a specific DeviceAPI,
     * return it. If it doesn't, return DeviceAPI::None.  If the Target has
     * features with multiple (different) DeviceAPI requirements, the result
     * will be an arbitrary DeviceAPI. */
    DeviceAPI get_required_device_api() const;

    bool operator==(const Target &other) const {
        return os == other.os &&
               arch == other.arch &&
               bits == other.bits &&
               processor_tune == other.processor_tune &&
               features == other.features;
    }

    bool operator!=(const Target &other) const {
        return !(*this == other);
    }

    /**
     * Create a "greatest common denominator" runtime target that is compatible with
     * both this target and \p other. Used by generators to conveniently select a suitable
     * runtime when linking together multiple functions.
     *
     * @param other The other target from which we compute the gcd target.
     * @param[out] result The gcd target if we return true, otherwise unmodified. Can be the same as *this.
     * @return Whether it was possible to find a compatible target (true) or not.
     */
    bool get_runtime_compatible_target(const Target &other, Target &result);

    /** Convert the Target into a string form that can be reconstituted
     * by merge_string(), which will always be of the form
     *
     *   arch-bits-os-processor-feature1-feature2...featureN.
     *
     * Note that is guaranteed that Target(t1.to_string()) == t1,
     * but not that Target(s).to_string() == s (since there can be
     * multiple strings that parse to the same Target)...
     * *unless* t1 contains 'unknown' fields (in which case you'll get a string
     * that can't be parsed, which is intentional).
     */
    std::string to_string() const;

    /** Given a data type, return an estimate of the "natural" vector size
     * for that data type when compiling for this Target. */
    int natural_vector_size(const Halide::Type &t) const;

    /** Given a data type, return an estimate of the "natural" vector size
     * for that data type when compiling for this Target. */
    template<typename data_t>
    int natural_vector_size() const {
        return natural_vector_size(type_of<data_t>());
    }

    /** Return true iff 64 bits and has_feature(LargeBuffers). */
    bool has_large_buffers() const {
        return bits == 64 && has_feature(LargeBuffers);
    }

    /** Return the maximum buffer size in bytes supported on this
     * Target. This is 2^31 - 1 except on 64-bit targets when the LargeBuffers
     * feature is enabled, which expands the maximum to 2^63 - 1. */
    int64_t maximum_buffer_size() const {
        if (has_large_buffers()) {
            return (((uint64_t)1) << 63) - 1;
        } else {
            return (((uint64_t)1) << 31) - 1;
        }
    }

    /** Get the minimum cuda capability found as an integer. Returns
     * 20 (our minimum supported cuda compute capability) if no cuda
     * features are set. */
    int get_cuda_capability_lower_bound() const;

    /** Get the minimum Vulkan capability found as an integer. Returns
     * 10 (our minimum supported Vulkan compute capability) if no Vulkan
     * features are set. */
    int get_vulkan_capability_lower_bound() const;

    /** Get the minimum ARM v8.x capability found as an integer. Returns
     * -1 if no ARM v8.x features are set. */
    int get_arm_v8_lower_bound() const;

    /** Was libHalide compiled with support for this target? */
    bool supported() const;

    /** Return a bitset of the Featuress set in this Target (set = 1).
     * Note that while this happens to be the current internal representation,
     * that might not always be the case. */
    const std::bitset<FeatureEnd> &get_features_bitset() const {
        return features;
    }

    /** Return the name corresponding to a given Feature, in the form
     * used to construct Target strings (e.g., Feature::Debug is "debug" and not "Debug"). */
    static std::string feature_to_name(Target::Feature feature);

    /** Return the feature corresponding to a given name, in the form
     * used to construct Target strings (e.g., Feature::Debug is "debug" and not "Debug").
     * If the string is not a known feature name, return FeatureEnd. */
    static Target::Feature feature_from_name(const std::string &name);

private:
    /** A bitmask that stores the active features. */
    std::bitset<FeatureEnd> features;

    /** Attempt to validate that all features set are sensible for the base Target.
     * This is *not* guaranteed to get all invalid combinations, but is intended
     * to catch at least the most common (e.g., setting arm-specific features on x86). */
    void validate_features() const;
};

/** Return the target corresponding to the host machine. */
Target get_host_target();

/** Return the target that Halide will use. If HL_TARGET is set it
 * uses that. Otherwise calls \ref get_host_target */
Target get_target_from_environment();

/** Return the target that Halide will use for jit-compilation. If
 * HL_JIT_TARGET is set it uses that. Otherwise calls \ref
 * get_host_target. Throws an error if the architecture, bit width,
 * and OS of the target do not match the host target, so this is only
 * useful for controlling the feature set. */
Target get_jit_target_from_environment();

/** Get the Target feature corresponding to a DeviceAPI. For device
 * apis that do not correspond to any single target feature, returns
 * Target::FeatureEnd */
Target::Feature target_feature_for_device_api(DeviceAPI api);

namespace Internal {

void target_test();
}

}  // namespace Halide

#endif

namespace Halide {

/** Gets the appropriate halide_device_interface_t * for a
 * DeviceAPI. If error_site is non-null, e.g. the name of the routine
 * calling get_device_interface_for_device_api, a user_error is
 * reported if the requested device API is not enabled in or supported
 * by the target, Halide has been compiled without this device API, or
 * the device API is None or Host or a bad value. The error_site
 * argument is printed in the error message. If error_site is null,
 * this routine returns nullptr instead of calling user_error. */
const halide_device_interface_t *get_device_interface_for_device_api(DeviceAPI d,
                                                                     const Target &t = get_jit_target_from_environment(),
                                                                     const char *error_site = nullptr);

/** Get the specific DeviceAPI that Halide would select when presented
 * with DeviceAPI::Default_GPU for a given target. If no suitable api
 * is enabled in the target, returns DeviceAPI::Host. */
DeviceAPI get_default_device_api_for_target(const Target &t);

/** This attempts to sniff whether a given Target (and its implied DeviceAPI) is usable on
 * the current host. If it appears to be usable, return true; if not, return false.
 * Note that a return value of true does *not* guarantee that future usage of
 * that device will succeed; it is intended mainly as a simple diagnostic
 * to allow early-exit when a desired device is definitely not usable.
 * Also note that this call is *NOT* threadsafe, as it temporarily redirect various
 * global error-handling hooks in Halide. */
bool host_supports_target_device(const Target &t);

namespace Internal {
/** Get an Expr which evaluates to the device interface for the given device api at runtime. */
Expr make_device_interface_call(DeviceAPI device_api, MemoryType memory_type = MemoryType::Auto);
}  // namespace Internal

}  // namespace Halide

#endif
/** \file
 * Defines a Buffer type that wraps from halide_buffer_t and adds
 * functionality, and methods for more conveniently iterating over the
 * samples in a halide_buffer_t outside of Halide code. */

#ifndef HALIDE_RUNTIME_BUFFER_H
#define HALIDE_RUNTIME_BUFFER_H

#include <algorithm>
#include <atomic>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <memory>
#include <type_traits>
#include <vector>

#ifdef __APPLE__
#include <AvailabilityVersions.h>
#include <TargetConditionals.h>
#endif

#if defined(__has_feature)
#if __has_feature(memory_sanitizer)
#include <sanitizer/msan_interface.h>
#endif
#endif


#ifdef _MSC_VER
#include <malloc.h>
#define HALIDE_ALLOCA _alloca
#else
#define HALIDE_ALLOCA __builtin_alloca
#endif

// gcc 5.1 has a false positive warning on this code
#if __GNUC__ == 5 && __GNUC_MINOR__ == 1
#pragma GCC diagnostic ignored "-Warray-bounds"
#endif

#ifndef HALIDE_RUNTIME_BUFFER_CHECK_INDICES
#define HALIDE_RUNTIME_BUFFER_CHECK_INDICES 0
#endif

#ifndef HALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT
// Conservatively align buffer allocations to 128 bytes by default.
// This is enough alignment for all the platforms currently in use.
// Redefine this in your compiler settings if you desire more/less alignment.
#define HALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT 128
#endif

static_assert(((HALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT & (HALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT - 1)) == 0),
              "HALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT must be a power of 2.");

// Unfortunately, not all C++17 runtimes support aligned_alloc
// (it may depends on OS/SDK version); this is provided as an opt-out
// if you are compiling on a platform that doesn't provide a (good)
// implementation. (Note that we actually use the C11 `::aligned_alloc()`
// rather than the C++17 `std::aligned_alloc()` because at least one platform
// we found supports the former but not the latter.)
#ifndef HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC

#ifdef _WIN32

// Windows (regardless of which compiler) doesn't implement aligned_alloc(),
// even in C++17 mode, and has stated they probably never will, as the issue
// is in the incompatibility that free() needs to be able to free both pointers
// returned by malloc() and aligned_alloc(). So, always default it off here.
#define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 0

#elif defined(__ANDROID_API__) && __ANDROID_API__ < 28

// Android doesn't provide aligned_alloc until API 28
#define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 0

#elif defined(__APPLE__)

#if TARGET_OS_OSX && (__MAC_OS_X_VERSION_MIN_REQUIRED < __MAC_10_15)

// macOS doesn't provide aligned_alloc until 10.15
#define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 0

#elif TARGET_OS_IPHONE && (__IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_14_0)

// iOS doesn't provide aligned_alloc until 14.0
#define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 0

#else

// Assume it's ok on all other Apple targets
#define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 1

#endif

#else

#if defined(__GLIBCXX__) && !defined(_GLIBCXX_HAVE_ALIGNED_ALLOC)

// ARM GNU-A baremetal compiler doesn't provide aligned_alloc as of 12.2
#define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 0

#else

// Not Windows, Android, or Apple: just assume it's ok
#define HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC 1

#endif

#endif

#endif  // HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC

namespace Halide {
namespace Runtime {

// Forward-declare our Buffer class
template<typename T, int Dims, int InClassDimStorage>
class Buffer;

// A helper to check if a parameter pack is entirely implicitly
// int-convertible to use with std::enable_if
template<typename... Args>
struct AllInts : std::false_type {};

template<>
struct AllInts<> : std::true_type {};

template<typename T, typename... Args>
struct AllInts<T, Args...> {
    static const bool value = std::is_convertible_v<T, int> && AllInts<Args...>::value;
};

// Floats and doubles are technically implicitly int-convertible, but
// doing so produces a warning we treat as an error, so just disallow
// it here.
template<typename... Args>
struct AllInts<float, Args...> : std::false_type {};

template<typename... Args>
struct AllInts<double, Args...> : std::false_type {};

namespace Internal {
// A helper to detect if there are any zeros in a container
template<typename Container>
bool any_zero(const Container &c) {
    for (int i : c) {
        if (i == 0) {
            return true;
        }
    }
    return false;
}

struct DefaultAllocatorFns {
    static inline void *(*default_allocate_fn)(size_t) = nullptr;
    static inline void (*default_deallocate_fn)(void *) = nullptr;
};
}  // namespace Internal

/** A struct acting as a header for allocations owned by the Buffer
 * class itself. */
struct AllocationHeader {
    void (*deallocate_fn)(void *);
    std::atomic<int> ref_count;

    // Note that ref_count always starts at 1
    explicit AllocationHeader(void (*deallocate_fn)(void *))
        : deallocate_fn(deallocate_fn), ref_count(1) {
    }
};

/** This indicates how to deallocate the device for a Halide::Runtime::Buffer. */
enum struct BufferDeviceOwnership : int {
    Allocated,               ///> halide_device_free will be called when device ref count goes to zero
    WrappedNative,           ///> halide_device_detach_native will be called when device ref count goes to zero
    Unmanaged,               ///> No free routine will be called when device ref count goes to zero
    AllocatedDeviceAndHost,  ///> Call device_and_host_free when DevRefCount goes to zero.
    Cropped,                 ///> Call halide_device_release_crop when DevRefCount goes to zero.
};

/** A similar struct for managing device allocations. */
struct DeviceRefCount {
    // This is only ever constructed when there's something to manage,
    // so start at one.
    std::atomic<int> count{1};
    BufferDeviceOwnership ownership{BufferDeviceOwnership::Allocated};
};

constexpr int AnyDims = -1;

/** A templated Buffer class that wraps halide_buffer_t and adds
 * functionality. When using Halide from C++, this is the preferred
 * way to create input and output buffers. The overhead of using this
 * class relative to a naked halide_buffer_t is minimal - it uses another
 * ~16 bytes on the stack, and does no dynamic allocations when using
 * it to represent existing memory of a known maximum dimensionality.
 *
 * The template parameter T is the element type. For buffers where the
 * element type is unknown, or may vary, use void or const void.
 *
 * The template parameter Dims is the number of dimensions. For buffers where
 * the dimensionality type is unknown at, or may vary, use AnyDims.
 *
 * InClassDimStorage is the maximum number of dimensions that can be represented
 * using space inside the class itself. Set it to the maximum dimensionality
 * you expect this buffer to be. If the actual dimensionality exceeds
 * this, heap storage is allocated to track the shape of the buffer.
 * InClassDimStorage defaults to 4, which should cover nearly all usage.
 *
 * The class optionally allocates and owns memory for the image using
 * a shared pointer allocated with the provided allocator. If they are
 * null, malloc and free are used.  Any device-side allocation is
 * considered as owned if and only if the host-side allocation is
 * owned. */
template<typename T = void,
         int Dims = AnyDims,
         int InClassDimStorage = (Dims == AnyDims ? 4 : std::max(Dims, 1))>
class Buffer {
    /** The underlying halide_buffer_t */
    halide_buffer_t buf = {};

    /** Some in-class storage for shape of the dimensions. */
    halide_dimension_t shape[InClassDimStorage];

    /** The allocation owned by this Buffer. NULL if the Buffer does not
     * own the memory. */
    AllocationHeader *alloc = nullptr;

    /** A reference count for the device allocation owned by this
     * buffer. */
    mutable DeviceRefCount *dev_ref_count = nullptr;

    /** True if T is of type void or const void */
    static const bool T_is_void = std::is_same_v<std::remove_const_t<T>, void>;

    /** A type function that adds a const qualifier if T is a const type. */
    template<typename T2>
    using add_const_if_T_is_const = std::conditional_t<std::is_const_v<T>, const T2, T2>;

    /** T unless T is (const) void, in which case (const)
     * uint8_t. Useful for providing return types for operator() */
    using not_void_T = std::conditional_t<T_is_void,
                                          add_const_if_T_is_const<uint8_t>,
                                          T>;

    /** T with constness removed. Useful for return type of copy(). */
    using not_const_T = std::remove_const_t<T>;

    /** The type the elements are stored as. Equal to not_void_T
     * unless T is a pointer, in which case uint64_t. Halide stores
     * all pointer types as uint64s internally, even on 32-bit
     * systems. */
    using storage_T = std::conditional_t<std::is_pointer_v<T>, uint64_t, not_void_T>;

public:
    /** True if the Halide type is not void (or const void). */
    static constexpr bool has_static_halide_type = !T_is_void;

    /** Get the Halide type of T. Callers should not use the result if
     * has_static_halide_type is false. */
    static constexpr halide_type_t static_halide_type() {
        return halide_type_of<std::remove_cv_t<not_void_T>>();
    }

    /** Does this Buffer own the host memory it refers to? */
    bool owns_host_memory() const {
        return alloc != nullptr;
    }

    static constexpr bool has_static_dimensions = (Dims != AnyDims);

    /** Callers should not use the result if
     * has_static_dimensions is false. */
    static constexpr int static_dimensions() {
        return Dims;
    }

    static_assert(!has_static_dimensions || static_dimensions() >= 0);

private:
    /** Increment the reference count of any owned allocation */
    void incref() const {
        if (owns_host_memory()) {
            alloc->ref_count++;
        }
        if (buf.device) {
            if (!dev_ref_count) {
                // I seem to have a non-zero dev field but no
                // reference count for it. I must have been given a
                // device allocation by a Halide pipeline, and have
                // never been copied from since. Take sole ownership
                // of it.
                dev_ref_count = new DeviceRefCount;
            }
            dev_ref_count->count++;
        }
    }

    // Note that this is called "cropped" but can also encompass a slice/embed
    // operation as well.
    struct DevRefCountCropped : DeviceRefCount {
        // We will only store Buffers that have a dynamic number of dimensions.
        // Buffers that cropped or sliced from need to be first converted to
        // one with variable size. This is required because we cannot possibly
        // know what the actual dimensionality is of the buffer this is a
        // crop or slice from. Since cropping a sliced buffer is also possible,
        // no optimizations can be made for cropped buffers either.
        Buffer<T, AnyDims> cropped_from;
        explicit DevRefCountCropped(const Buffer<T, AnyDims> &cropped_from)
            : cropped_from(cropped_from) {
            ownership = BufferDeviceOwnership::Cropped;
        }
    };

    /** Setup the device ref count for a buffer to indicate it is a crop (or slice, embed, etc) of cropped_from */
    void crop_from(const Buffer<T, AnyDims> &cropped_from) {
        assert(dev_ref_count == nullptr);
        dev_ref_count = new DevRefCountCropped(cropped_from);
    }

    /** Decrement the reference count of any owned allocation and free host
     * and device memory if it hits zero. Sets alloc to nullptr. */
    void decref(bool device_only = false) {
        if (owns_host_memory() && !device_only) {
            int new_count = --(alloc->ref_count);
            if (new_count == 0) {
                void (*fn)(void *) = alloc->deallocate_fn;
                alloc->~AllocationHeader();
                fn(alloc);
            }
            buf.host = nullptr;
            alloc = nullptr;
            set_host_dirty(false);
        }
        int new_count = 0;
        if (dev_ref_count) {
            new_count = --(dev_ref_count->count);
        }
        if (new_count == 0) {
            if (buf.device) {
                assert(!(alloc && device_dirty()) &&
                       "Implicitly freeing a dirty device allocation while a host allocation still lives. "
                       "Call device_free explicitly if you want to drop dirty device-side data. "
                       "Call copy_to_host explicitly if you want the data copied to the host allocation "
                       "before the device allocation is freed.");
                int result = halide_error_code_success;
                if (dev_ref_count && dev_ref_count->ownership == BufferDeviceOwnership::WrappedNative) {
                    result = buf.device_interface->detach_native(nullptr, &buf);
                } else if (dev_ref_count && dev_ref_count->ownership == BufferDeviceOwnership::AllocatedDeviceAndHost) {
                    result = buf.device_interface->device_and_host_free(nullptr, &buf);
                } else if (dev_ref_count && dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
                    result = buf.device_interface->device_release_crop(nullptr, &buf);
                } else if (dev_ref_count == nullptr || dev_ref_count->ownership == BufferDeviceOwnership::Allocated) {
                    result = buf.device_interface->device_free(nullptr, &buf);
                }
                // No reasonable way to return the error, but we can at least assert-fail in debug builds.
                assert((result == halide_error_code_success) && "device_interface call returned a nonzero result in Buffer::decref()");
                (void)result;
            }
            if (dev_ref_count) {
                if (dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
                    delete (DevRefCountCropped *)dev_ref_count;
                } else {
                    delete dev_ref_count;
                }
            }
        }
        dev_ref_count = nullptr;
        buf.device = 0;
        buf.device_interface = nullptr;
    }

    void free_shape_storage() {
        if (buf.dim != shape) {
            delete[] buf.dim;
            buf.dim = nullptr;
        }
    }

    template<int DimsSpecified>
    void make_static_shape_storage() {
        static_assert(Dims == AnyDims || Dims == DimsSpecified,
                      "Number of arguments to Buffer() does not match static dimensionality");
        buf.dimensions = DimsSpecified;
        if constexpr (Dims == AnyDims) {
            if constexpr (DimsSpecified <= InClassDimStorage) {
                buf.dim = shape;
            } else {
                static_assert(DimsSpecified >= 1);
                buf.dim = new halide_dimension_t[DimsSpecified];
            }
        } else {
            static_assert(InClassDimStorage >= Dims);
            buf.dim = shape;
        }
    }

    void make_shape_storage(const int dimensions) {
        if (Dims != AnyDims && Dims != dimensions) {
            assert(false && "Number of arguments to Buffer() does not match static dimensionality");
        }
        // This should usually be inlined, so if dimensions is statically known,
        // we can skip the call to new
        buf.dimensions = dimensions;
        buf.dim = (dimensions <= InClassDimStorage) ? shape : new halide_dimension_t[dimensions];
    }

    void copy_shape_from(const halide_buffer_t &other) {
        // All callers of this ensure that buf.dimensions == other.dimensions.
        make_shape_storage(other.dimensions);
        std::copy(other.dim, other.dim + other.dimensions, buf.dim);
    }

    template<typename T2, int D2, int S2>
    void move_shape_from(Buffer<T2, D2, S2> &&other) {
        if (other.shape == other.buf.dim) {
            copy_shape_from(other.buf);
        } else {
            buf.dim = other.buf.dim;
            other.buf.dim = nullptr;
        }
        other.buf = halide_buffer_t();
    }

    /** Initialize the shape from a halide_buffer_t. */
    void initialize_from_buffer(const halide_buffer_t &b,
                                BufferDeviceOwnership ownership) {
        memcpy(&buf, &b, sizeof(halide_buffer_t));
        copy_shape_from(b);
        if (b.device) {
            dev_ref_count = new DeviceRefCount;
            dev_ref_count->ownership = ownership;
        }
    }

    /** Initialize the shape from an array of ints */
    void initialize_shape(const int *sizes) {
        for (int i = 0; i < buf.dimensions; i++) {
            buf.dim[i].min = 0;
            buf.dim[i].extent = sizes[i];
            if (i == 0) {
                buf.dim[i].stride = 1;
            } else {
                buf.dim[i].stride = buf.dim[i - 1].stride * buf.dim[i - 1].extent;
            }
        }
    }

    /** Initialize the shape from a vector of extents */
    void initialize_shape(const std::vector<int> &sizes) {
        assert(buf.dimensions == (int)sizes.size());
        initialize_shape(sizes.data());
    }

    /** Initialize the shape from the static shape of an array */
    template<typename Array, size_t N>
    void initialize_shape_from_array_shape(int next, Array (&vals)[N]) {
        buf.dim[next].min = 0;
        buf.dim[next].extent = (int)N;
        if (next == 0) {
            buf.dim[next].stride = 1;
        } else {
            initialize_shape_from_array_shape(next - 1, vals[0]);
            buf.dim[next].stride = buf.dim[next - 1].stride * buf.dim[next - 1].extent;
        }
    }

    /** Base case for the template recursion above. */
    template<typename T2>
    void initialize_shape_from_array_shape(int, const T2 &) {
    }

    /** Get the dimensionality of a multi-dimensional C array */
    template<typename Array, size_t N>
    static int dimensionality_of_array(Array (&vals)[N]) {
        return dimensionality_of_array(vals[0]) + 1;
    }

    template<typename T2>
    static int dimensionality_of_array(const T2 &) {
        return 0;
    }

    /** Get the underlying halide_type_t of an array's element type. */
    template<typename Array, size_t N>
    static halide_type_t scalar_type_of_array(Array (&vals)[N]) {
        return scalar_type_of_array(vals[0]);
    }

    template<typename T2>
    static halide_type_t scalar_type_of_array(const T2 &) {
        return halide_type_of<std::remove_cv_t<T2>>();
    }

    /** Crop a single dimension without handling device allocation. */
    void crop_host(int d, int min, int extent) {
        assert(dim(d).min() <= min);
        assert(dim(d).max() >= min + extent - 1);
        ptrdiff_t shift = min - dim(d).min();
        if (buf.host != nullptr) {
            buf.host += (shift * dim(d).stride()) * type().bytes();
        }
        buf.dim[d].min = min;
        buf.dim[d].extent = extent;
    }

    /** Crop as many dimensions as are in rect, without handling device allocation. */
    void crop_host(const std::vector<std::pair<int, int>> &rect) {
        assert(rect.size() <= static_cast<decltype(rect.size())>(std::numeric_limits<int>::max()));
        int limit = (int)rect.size();
        assert(limit <= dimensions());
        for (int i = 0; i < limit; i++) {
            crop_host(i, rect[i].first, rect[i].second);
        }
    }

    void complete_device_crop(Buffer<T, Dims, InClassDimStorage> &result_host_cropped) const {
        assert(buf.device_interface != nullptr);
        if (buf.device_interface->device_crop(nullptr, &this->buf, &result_host_cropped.buf) == halide_error_code_success) {
            // TODO: Figure out what to do if dev_ref_count is nullptr. Should incref logic run here?
            // is it possible to get to this point without incref having run at least once since
            // the device field was set? (I.e. in the internal logic of crop. incref might have been
            // called.)
            if (dev_ref_count != nullptr && dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
                result_host_cropped.crop_from(((DevRefCountCropped *)dev_ref_count)->cropped_from);
            } else {
                result_host_cropped.crop_from(*this);
            }
        }
    }

    /** slice a single dimension without handling device allocation. */
    void slice_host(int d, int pos) {
        static_assert(Dims == AnyDims);
        assert(dimensions() > 0);
        assert(d >= 0 && d < dimensions());
        assert(pos >= dim(d).min() && pos <= dim(d).max());
        buf.dimensions--;
        ptrdiff_t shift = pos - buf.dim[d].min;
        if (buf.host != nullptr) {
            buf.host += (shift * buf.dim[d].stride) * type().bytes();
        }
        for (int i = d; i < buf.dimensions; i++) {
            buf.dim[i] = buf.dim[i + 1];
        }
        buf.dim[buf.dimensions] = {0, 0, 0};
    }

    void complete_device_slice(Buffer<T, AnyDims, InClassDimStorage> &result_host_sliced, int d, int pos) const {
        assert(buf.device_interface != nullptr);
        if (buf.device_interface->device_slice(nullptr, &this->buf, d, pos, &result_host_sliced.buf) == halide_error_code_success) {
            // TODO: Figure out what to do if dev_ref_count is nullptr. Should incref logic run here?
            // is it possible to get to this point without incref having run at least once since
            // the device field was set? (I.e. in the internal logic of slice. incref might have been
            // called.)
            if (dev_ref_count != nullptr && dev_ref_count->ownership == BufferDeviceOwnership::Cropped) {
                // crop_from() is correct here, despite the fact that we are slicing.
                result_host_sliced.crop_from(((DevRefCountCropped *)dev_ref_count)->cropped_from);
            } else {
                // crop_from() is correct here, despite the fact that we are slicing.
                result_host_sliced.crop_from(*this);
            }
        }
    }

public:
    typedef T ElemType;

    /** Read-only access to the shape */
    class Dimension {
        const halide_dimension_t &d;

    public:
        /** The lowest coordinate in this dimension */
        HALIDE_ALWAYS_INLINE int min() const {
            return d.min;
        }

        /** The number of elements in memory you have to step over to
         * increment this coordinate by one. */
        HALIDE_ALWAYS_INLINE int stride() const {
            return d.stride;
        }

        /** The extent of the image along this dimension */
        HALIDE_ALWAYS_INLINE int extent() const {
            return d.extent;
        }

        /** The highest coordinate in this dimension */
        HALIDE_ALWAYS_INLINE int max() const {
            return min() + extent() - 1;
        }

        /** An iterator class, so that you can iterate over
         * coordinates in a dimensions using a range-based for loop. */
        struct iterator {
            int val;
            int operator*() const {
                return val;
            }
            bool operator!=(const iterator &other) const {
                return val != other.val;
            }
            iterator &operator++() {
                val++;
                return *this;
            }
        };

        /** An iterator that points to the min coordinate */
        HALIDE_ALWAYS_INLINE iterator begin() const {
            return {min()};
        }

        /** An iterator that points to one past the max coordinate */
        HALIDE_ALWAYS_INLINE iterator end() const {
            return {min() + extent()};
        }

        explicit Dimension(const halide_dimension_t &dim)
            : d(dim) {
        }
    };

    /** Access the shape of the buffer */
    HALIDE_ALWAYS_INLINE Dimension dim(int i) const {
        assert(i >= 0 && i < this->dimensions());
        return Dimension(buf.dim[i]);
    }

    /** Access to the mins, strides, extents. Will be deprecated. Do not use. */
    // @{
    int min(int i) const {
        return dim(i).min();
    }
    int extent(int i) const {
        return dim(i).extent();
    }
    int stride(int i) const {
        return dim(i).stride();
    }
    // @}

    /** The total number of elements this buffer represents. Equal to
     * the product of the extents */
    size_t number_of_elements() const {
        return buf.number_of_elements();
    }

    /** Get the dimensionality of the buffer. */
    int dimensions() const {
        if constexpr (has_static_dimensions) {
            return Dims;
        } else {
            return buf.dimensions;
        }
    }

    /** Get the type of the elements. */
    halide_type_t type() const {
        return buf.type;
    }

    /** A pointer to the element with the lowest address. If all
     * strides are positive, equal to the host pointer. */
    T *begin() const {
        assert(buf.host != nullptr);  // Cannot call begin() on an unallocated Buffer.
        return (T *)buf.begin();
    }

    /** A pointer to one beyond the element with the highest address. */
    T *end() const {
        assert(buf.host != nullptr);  // Cannot call end() on an unallocated Buffer.
        return (T *)buf.end();
    }

    /** The total number of bytes spanned by the data in memory. */
    size_t size_in_bytes() const {
        return buf.size_in_bytes();
    }

    /** Reset the Buffer to be equivalent to a default-constructed Buffer
     * of the same static type (if any); Buffer<void> will have its runtime
     * type reset to uint8. */
    void reset() {
        *this = Buffer();
    }

    Buffer()
        : shape() {
        buf.type = static_halide_type();
        // If Dims are statically known, must create storage that many.
        // otherwise, make a zero-dimensional buffer.
        constexpr int buf_dimensions = (Dims == AnyDims) ? 0 : Dims;
        make_static_shape_storage<buf_dimensions>();
    }

    /** Make a Buffer from a halide_buffer_t */
    explicit Buffer(const halide_buffer_t &buf,
                    BufferDeviceOwnership ownership = BufferDeviceOwnership::Unmanaged) {
        assert(T_is_void || buf.type == static_halide_type());
        initialize_from_buffer(buf, ownership);
    }

    /** Give Buffers access to the members of Buffers of different dimensionalities and types. */
    template<typename T2, int D2, int S2>
    friend class Buffer;

private:
    template<typename T2, int D2, int S2>
    static void static_assert_can_convert_from() {
        static_assert((!std::is_const_v<T2> || std::is_const_v<T>),
                      "Can't convert from a Buffer<const T> to a Buffer<T>");
        static_assert(std::is_same_v<std::remove_const_t<T>, std::remove_const_t<T2>> ||
                          T_is_void || Buffer<T2, D2, S2>::T_is_void,
                      "type mismatch constructing Buffer");
        static_assert(Dims == AnyDims || D2 == AnyDims || Dims == D2,
                      "Can't convert from a Buffer with static dimensionality to a Buffer with different static dimensionality");
    }

public:
    static void set_default_allocate_fn(void *(*allocate_fn)(size_t)) {
        Internal::DefaultAllocatorFns::default_allocate_fn = allocate_fn;
    }
    static void set_default_deallocate_fn(void (*deallocate_fn)(void *)) {
        Internal::DefaultAllocatorFns::default_deallocate_fn = deallocate_fn;
    }

    /** Determine if a Buffer<T, Dims, InClassDimStorage> can be constructed from some other Buffer type.
     * If this can be determined at compile time, fail with a static assert; otherwise
     * return a boolean based on runtime typing. */
    template<typename T2, int D2, int S2>
    static bool can_convert_from(const Buffer<T2, D2, S2> &other) {
        static_assert_can_convert_from<T2, D2, S2>();
        if (Buffer<T2, D2, S2>::T_is_void && !T_is_void) {
            if (other.type() != static_halide_type()) {
                return false;
            }
        }
        if (Dims != AnyDims) {
            if (other.dimensions() != Dims) {
                return false;
            }
        }
        return true;
    }

    /** Fail an assertion at runtime or compile-time if an Buffer<T, Dims, InClassDimStorage>
     * cannot be constructed from some other Buffer type. */
    template<typename T2, int D2, int S2>
    static void assert_can_convert_from(const Buffer<T2, D2, S2> &other) {
        // Explicitly call static_assert_can_convert_from() here so
        // that we always get compile-time checking, even if compiling with
        // assertions disabled.
        static_assert_can_convert_from<T2, D2, S2>();
        assert(can_convert_from(other));
    }

    /** Copy constructor. Does not copy underlying data. */
    Buffer(const Buffer<T, Dims, InClassDimStorage> &other)
        : buf(other.buf),
          alloc(other.alloc) {
        other.incref();
        dev_ref_count = other.dev_ref_count;
        copy_shape_from(other.buf);
    }

    /** Construct a Buffer from a Buffer of different dimensionality
     * and type. Asserts that the type and dimensionality matches (at runtime,
     * if one of the types is void). Note that this constructor is
     * implicit. This, for example, lets you pass things like
     * Buffer<T> or Buffer<const void> to functions expected
     * Buffer<const T>. */
    template<typename T2, int D2, int S2>
    Buffer(const Buffer<T2, D2, S2> &other)
        : buf(other.buf),
          alloc(other.alloc) {
        assert_can_convert_from(other);
        other.incref();
        dev_ref_count = other.dev_ref_count;
        copy_shape_from(other.buf);
    }

    /** Move constructor */
    Buffer(Buffer<T, Dims, InClassDimStorage> &&other) noexcept
        : buf(other.buf),
          alloc(other.alloc),
          dev_ref_count(other.dev_ref_count) {
        other.dev_ref_count = nullptr;
        other.alloc = nullptr;
        move_shape_from(std::forward<Buffer<T, Dims, InClassDimStorage>>(other));
    }

    /** Move-construct a Buffer from a Buffer of different
     * dimensionality and type. Asserts that the types match (at
     * runtime if one of the types is void). */
    template<typename T2, int D2, int S2>
    Buffer(Buffer<T2, D2, S2> &&other)
        : buf(other.buf),
          alloc(other.alloc),
          dev_ref_count(other.dev_ref_count) {
        assert_can_convert_from(other);
        other.dev_ref_count = nullptr;
        other.alloc = nullptr;
        move_shape_from(std::forward<Buffer<T2, D2, S2>>(other));
    }

    /** Assign from another Buffer of possibly-different
     * dimensionality and type. Asserts that the types match (at
     * runtime if one of the types is void). */
    template<typename T2, int D2, int S2>
    Buffer<T, Dims, InClassDimStorage> &operator=(const Buffer<T2, D2, S2> &other) {
        if ((const void *)this == (const void *)&other) {
            return *this;
        }
        assert_can_convert_from(other);
        other.incref();
        decref();
        dev_ref_count = other.dev_ref_count;
        alloc = other.alloc;
        free_shape_storage();
        buf = other.buf;
        copy_shape_from(other.buf);
        return *this;
    }

    /** Standard assignment operator */
    Buffer<T, Dims, InClassDimStorage> &operator=(const Buffer<T, Dims, InClassDimStorage> &other) {
        // The cast to void* here is just to satisfy clang-tidy
        if ((const void *)this == (const void *)&other) {
            return *this;
        }
        other.incref();
        decref();
        dev_ref_count = other.dev_ref_count;
        alloc = other.alloc;
        free_shape_storage();
        buf = other.buf;
        copy_shape_from(other.buf);
        return *this;
    }

    /** Move from another Buffer of possibly-different
     * dimensionality and type. Asserts that the types match (at
     * runtime if one of the types is void). */
    template<typename T2, int D2, int S2>
    Buffer<T, Dims, InClassDimStorage> &operator=(Buffer<T2, D2, S2> &&other) {
        assert_can_convert_from(other);
        decref();
        alloc = other.alloc;
        other.alloc = nullptr;
        dev_ref_count = other.dev_ref_count;
        other.dev_ref_count = nullptr;
        free_shape_storage();
        buf = other.buf;
        move_shape_from(std::forward<Buffer<T2, D2, S2>>(other));
        return *this;
    }

    /** Standard move-assignment operator */
    Buffer<T, Dims, InClassDimStorage> &operator=(Buffer<T, Dims, InClassDimStorage> &&other) noexcept {
        decref();
        alloc = other.alloc;
        other.alloc = nullptr;
        dev_ref_count = other.dev_ref_count;
        other.dev_ref_count = nullptr;
        free_shape_storage();
        buf = other.buf;
        move_shape_from(std::forward<Buffer<T, Dims, InClassDimStorage>>(other));
        return *this;
    }

    /** Check the product of the extents fits in memory. */
    void check_overflow() {
        size_t size = type().bytes();
        for (int i = 0; i < dimensions(); i++) {
            size *= dim(i).extent();
        }
        // We allow 2^31 or 2^63 bytes, so drop the top bit.
        size = (size << 1) >> 1;
        for (int i = 0; i < dimensions(); i++) {
            size /= dim(i).extent();
        }
        assert(size == (size_t)type().bytes() && "Error: Overflow computing total size of buffer.");
    }

    /** Allocate memory for this Buffer. Drops the reference to any
     * owned memory. */
    void allocate(void *(*allocate_fn)(size_t) = nullptr,
                  void (*deallocate_fn)(void *) = nullptr) {
        // Drop any existing allocation
        deallocate();

        // Conservatively align images to (usually) 128 bytes. This is enough
        // alignment for all the platforms we might use. Also ensure that the allocation
        // is such that the logical size is an integral multiple of 128 bytes (or a bit more).
        constexpr size_t alignment = HALIDE_RUNTIME_BUFFER_ALLOCATION_ALIGNMENT;

        const auto align_up = [=](size_t value) -> size_t {
            return (value + alignment - 1) & ~(alignment - 1);
        };

        size_t size = size_in_bytes();

#if HALIDE_RUNTIME_BUFFER_USE_ALIGNED_ALLOC
        // Only use aligned_alloc() if no custom allocators are specified.
        if (!allocate_fn && !deallocate_fn && !Internal::DefaultAllocatorFns::default_allocate_fn && !Internal::DefaultAllocatorFns::default_deallocate_fn) {
            // As a practical matter, sizeof(AllocationHeader) is going to be no more than 16 bytes
            // on any supported platform, so we will just overallocate by 'alignment'
            // so that the user storage also starts at an aligned point. This is a bit
            // wasteful, but probably not a big deal.
            static_assert(sizeof(AllocationHeader) <= alignment);
            void *alloc_storage = ::aligned_alloc(alignment, align_up(size) + alignment);
            assert((uintptr_t)alloc_storage == align_up((uintptr_t)alloc_storage));
            alloc = new (alloc_storage) AllocationHeader(free);
            buf.host = (uint8_t *)((uintptr_t)alloc_storage + alignment);
            return;
        }
        // else fall thru
#endif
        if (!allocate_fn) {
            allocate_fn = Internal::DefaultAllocatorFns::default_allocate_fn;
            if (!allocate_fn) {
                allocate_fn = malloc;
            }
        }
        if (!deallocate_fn) {
            deallocate_fn = Internal::DefaultAllocatorFns::default_deallocate_fn;
            if (!deallocate_fn) {
                deallocate_fn = free;
            }
        }

        static_assert(sizeof(AllocationHeader) <= alignment);

        // malloc() and friends must return a pointer aligned to at least alignof(std::max_align_t);
        // make sure this is OK for AllocationHeader, since it always goes at the start
        static_assert(alignof(AllocationHeader) <= alignof(std::max_align_t));

        const size_t requested_size = align_up(size + alignment +
                                               std::max(0, (int)sizeof(AllocationHeader) -
                                                               (int)sizeof(std::max_align_t)));
        void *alloc_storage = allocate_fn(requested_size);
        alloc = new (alloc_storage) AllocationHeader(deallocate_fn);
        uint8_t *unaligned_ptr = ((uint8_t *)alloc) + sizeof(AllocationHeader);
        buf.host = (uint8_t *)align_up((uintptr_t)unaligned_ptr);
    }

    /** Drop reference to any owned host or device memory, possibly
     * freeing it, if this buffer held the last reference to
     * it. Retains the shape of the buffer. Does nothing if this
     * buffer did not allocate its own memory. */
    void deallocate() {
        decref();
    }

    /** Drop reference to any owned device memory, possibly freeing it
     * if this buffer held the last reference to it. Asserts that
     * device_dirty is false. */
    void device_deallocate() {
        decref(true);
    }

    /** Allocate a new image of the given size with a runtime
     * type. Only used when you do know what size you want but you
     * don't know statically what type the elements are. Pass zeros
     * to make a buffer suitable for bounds query calls. */
    template<typename... Args,
             typename = std::enable_if_t<AllInts<Args...>::value>>
    Buffer(halide_type_t t, int first, Args... rest) {
        if (!T_is_void) {
            assert(static_halide_type() == t);
        }
        int extents[] = {first, (int)rest...};
        buf.type = t;
        constexpr int buf_dimensions = 1 + (int)(sizeof...(rest));
        make_static_shape_storage<buf_dimensions>();
        initialize_shape(extents);
        if (!Internal::any_zero(extents)) {
            check_overflow();
            allocate();
        }
    }

    /** Allocate a new image of the given size. Pass zeros to make a
     * buffer suitable for bounds query calls. */
    // @{

    // The overload with one argument is 'explicit', so that
    // (say) int is not implicitly convertible to Buffer<int>
    explicit Buffer(int first) {
        static_assert(!T_is_void,
                      "To construct an Buffer<void>, pass a halide_type_t as the first argument to the constructor");
        int extents[] = {first};
        buf.type = static_halide_type();
        constexpr int buf_dimensions = 1;
        make_static_shape_storage<buf_dimensions>();
        initialize_shape(extents);
        if (first != 0) {
            check_overflow();
            allocate();
        }
    }

    template<typename... Args,
             typename = std::enable_if_t<AllInts<Args...>::value>>
    Buffer(int first, int second, Args... rest) {
        static_assert(!T_is_void,
                      "To construct an Buffer<void>, pass a halide_type_t as the first argument to the constructor");
        int extents[] = {first, second, (int)rest...};
        buf.type = static_halide_type();
        constexpr int buf_dimensions = 2 + (int)(sizeof...(rest));
        make_static_shape_storage<buf_dimensions>();
        initialize_shape(extents);
        if (!Internal::any_zero(extents)) {
            check_overflow();
            allocate();
        }
    }
    // @}

    /** Allocate a new image of unknown type using a vector of ints as the size. */
    Buffer(halide_type_t t, const std::vector<int> &sizes) {
        if (!T_is_void) {
            assert(static_halide_type() == t);
        }
        buf.type = t;
        // make_shape_storage() will do a runtime check that dimensionality matches.
        make_shape_storage((int)sizes.size());
        initialize_shape(sizes);
        if (!Internal::any_zero(sizes)) {
            check_overflow();
            allocate();
        }
    }

    /** Allocate a new image of known type using a vector of ints as the size. */
    explicit Buffer(const std::vector<int> &sizes)
        : Buffer(static_halide_type(), sizes) {
    }

private:
    // Create a copy of the sizes vector, ordered as specified by order.
    static std::vector<int> make_ordered_sizes(const std::vector<int> &sizes, const std::vector<int> &order) {
        assert(order.size() == sizes.size());
        std::vector<int> ordered_sizes(sizes.size());
        for (size_t i = 0; i < sizes.size(); ++i) {
            ordered_sizes[i] = sizes.at(order[i]);
        }
        return ordered_sizes;
    }

public:
    /** Allocate a new image of unknown type using a vector of ints as the size and
     * a vector of indices indicating the storage order for each dimension. The
     * length of the sizes vector and the storage-order vector must match. For instance,
     * to allocate an interleaved RGB buffer, you would pass {2, 0, 1} for storage_order. */
    Buffer(halide_type_t t, const std::vector<int> &sizes, const std::vector<int> &storage_order)
        : Buffer(t, make_ordered_sizes(sizes, storage_order)) {
        transpose(storage_order);
    }

    Buffer(const std::vector<int> &sizes, const std::vector<int> &storage_order)
        : Buffer(static_halide_type(), sizes, storage_order) {
    }

    /** Make an Buffer that refers to a statically sized array. Does not
     * take ownership of the data, and does not set the host_dirty flag. */
    template<typename Array, size_t N>
    explicit Buffer(Array (&vals)[N]) {
        const int buf_dimensions = dimensionality_of_array(vals);
        buf.type = scalar_type_of_array(vals);
        buf.host = (uint8_t *)vals;
        make_shape_storage(buf_dimensions);
        initialize_shape_from_array_shape(buf.dimensions - 1, vals);
    }

    /** Initialize an Buffer of runtime type from a pointer and some
     * sizes. Assumes dense row-major packing and a min coordinate of
     * zero. Does not take ownership of the data and does not set the
     * host_dirty flag. */
    template<typename... Args,
             typename = std::enable_if_t<AllInts<Args...>::value>>
    explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data, int first, Args &&...rest) {
        if (!T_is_void) {
            assert(static_halide_type() == t);
        }
        int extents[] = {first, (int)rest...};
        buf.type = t;
        buf.host = (uint8_t *)const_cast<void *>(data);
        constexpr int buf_dimensions = 1 + (int)(sizeof...(rest));
        make_static_shape_storage<buf_dimensions>();
        initialize_shape(extents);
    }

    /** Initialize an Buffer from a pointer and some sizes. Assumes
     * dense row-major packing and a min coordinate of zero. Does not
     * take ownership of the data and does not set the host_dirty flag. */
    template<typename... Args,
             typename = std::enable_if_t<AllInts<Args...>::value>>
    explicit Buffer(T *data, int first, Args &&...rest) {
        int extents[] = {first, (int)rest...};
        buf.type = static_halide_type();
        buf.host = (uint8_t *)const_cast<std::remove_const_t<T> *>(data);
        constexpr int buf_dimensions = 1 + (int)(sizeof...(rest));
        make_static_shape_storage<buf_dimensions>();
        initialize_shape(extents);
    }

    /** Initialize an Buffer from a pointer and a vector of
     * sizes. Assumes dense row-major packing and a min coordinate of
     * zero. Does not take ownership of the data and does not set the
     * host_dirty flag. */
    explicit Buffer(T *data, const std::vector<int> &sizes) {
        buf.type = static_halide_type();
        buf.host = (uint8_t *)const_cast<std::remove_const_t<T> *>(data);
        make_shape_storage((int)sizes.size());
        initialize_shape(sizes);
    }

    /** Initialize an Buffer of runtime type from a pointer and a
     * vector of sizes. Assumes dense row-major packing and a min
     * coordinate of zero. Does not take ownership of the data and
     * does not set the host_dirty flag. */
    explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data, const std::vector<int> &sizes) {
        if (!T_is_void) {
            assert(static_halide_type() == t);
        }
        buf.type = t;
        buf.host = (uint8_t *)const_cast<void *>(data);
        make_shape_storage((int)sizes.size());
        initialize_shape(sizes);
    }

    /** Initialize an Buffer from a pointer to the min coordinate and
     * an array describing the shape.  Does not take ownership of the
     * data, and does not set the host_dirty flag. */
    explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data, int d, const halide_dimension_t *shape) {
        if (!T_is_void) {
            assert(static_halide_type() == t);
        }
        buf.type = t;
        buf.host = (uint8_t *)const_cast<void *>(data);
        make_shape_storage(d);
        for (int i = 0; i < d; i++) {
            buf.dim[i] = shape[i];
        }
    }

    /** Initialize a Buffer from a pointer to the min coordinate and
     * a vector describing the shape.  Does not take ownership of the
     * data, and does not set the host_dirty flag. */
    explicit Buffer(halide_type_t t, add_const_if_T_is_const<void> *data,
                    const std::vector<halide_dimension_t> &shape)
        : Buffer(t, data, (int)shape.size(), shape.data()) {
    }

    /** Initialize an Buffer from a pointer to the min coordinate and
     * an array describing the shape.  Does not take ownership of the
     * data and does not set the host_dirty flag. */
    explicit Buffer(T *data, int d, const halide_dimension_t *shape) {
        buf.type = static_halide_type();
        buf.host = (uint8_t *)const_cast<std::remove_const_t<T> *>(data);
        make_shape_storage(d);
        for (int i = 0; i < d; i++) {
            buf.dim[i] = shape[i];
        }
    }

    /** Initialize a Buffer from a pointer to the min coordinate and
     * a vector describing the shape.  Does not take ownership of the
     * data, and does not set the host_dirty flag. */
    explicit Buffer(T *data, const std::vector<halide_dimension_t> &shape)
        : Buffer(data, (int)shape.size(), shape.data()) {
    }

    /** Destructor. Will release any underlying owned allocation if
     * this is the last reference to it. Will assert fail if there are
     * weak references to this Buffer outstanding. */
    ~Buffer() {
        decref();
        free_shape_storage();
    }

    /** Get a pointer to the raw halide_buffer_t this wraps. */
    // @{
    halide_buffer_t *raw_buffer() {
        return &buf;
    }

    const halide_buffer_t *raw_buffer() const {
        return &buf;
    }
    // @}

    /** Provide a cast operator to halide_buffer_t *, so that
     * instances can be passed directly to Halide filters. */
    operator halide_buffer_t *() {
        return &buf;
    }

    /** Return a typed reference to this Buffer. Useful for converting
     * a reference to a Buffer<void> to a reference to, for example, a
     * Buffer<const uint8_t>, or converting a Buffer<T>& to Buffer<const T>&.
     * You can also optionally sspecify a new value for Dims; this is useful
     * mainly for removing the dimensionality constraint on a Buffer with
     * explicit dimensionality. Does a runtime assert if the source buffer type
     * is void or the new dimensionality is incompatible. */
    template<typename T2, int D2 = Dims>
    HALIDE_ALWAYS_INLINE Buffer<T2, D2, InClassDimStorage> &as() & {
        Buffer<T2, D2, InClassDimStorage>::assert_can_convert_from(*this);
        return *((Buffer<T2, D2, InClassDimStorage> *)this);
    }

    /** Return a const typed reference to this Buffer. Useful for converting
     * a reference to a Buffer<void> to a reference to, for example, a
     * Buffer<const uint8_t>, or converting a Buffer<T>& to Buffer<const T>&.
     * You can also optionally sspecify a new value for Dims; this is useful
     * mainly for removing the dimensionality constraint on a Buffer with
     * explicit dimensionality. Does a runtime assert if the source buffer type
     * is void or the new dimensionality is incompatible. */
    template<typename T2, int D2 = Dims>
    HALIDE_ALWAYS_INLINE const Buffer<T2, D2, InClassDimStorage> &as() const & {
        Buffer<T2, D2, InClassDimStorage>::assert_can_convert_from(*this);
        return *((const Buffer<T2, D2, InClassDimStorage> *)this);
    }

    /** Return an rval reference to this Buffer. Useful for converting
     * a reference to a Buffer<void> to a reference to, for example, a
     * Buffer<const uint8_t>, or converting a Buffer<T>& to Buffer<const T>&.
     * You can also optionally sspecify a new value for Dims; this is useful
     * mainly for removing the dimensionality constraint on a Buffer with
     * explicit dimensionality. Does a runtime assert if the source buffer type
     * is void or the new dimensionality is incompatible. */
    template<typename T2, int D2 = Dims>
    HALIDE_ALWAYS_INLINE Buffer<T2, D2, InClassDimStorage> as() && {
        Buffer<T2, D2, InClassDimStorage>::assert_can_convert_from(*this);
        return *((Buffer<T2, D2, InClassDimStorage> *)this);
    }

    /** as_const() is syntactic sugar for .as<const T>(), to avoid the need
     * to recapitulate the type argument. */
    // @{
    HALIDE_ALWAYS_INLINE
    Buffer<std::add_const_t<T>, Dims, InClassDimStorage> &as_const() & {
        // Note that we can skip the assert_can_convert_from(), since T -> const T
        // conversion is always legal.
        return *reinterpret_cast<Buffer<std::add_const_t<T>, Dims, InClassDimStorage> *>(this);
    }

    HALIDE_ALWAYS_INLINE
    const Buffer<std::add_const_t<T>, Dims, InClassDimStorage> &as_const() const & {
        return *reinterpret_cast<const Buffer<std::add_const_t<T>, Dims, InClassDimStorage> *>(this);
    }

    HALIDE_ALWAYS_INLINE
    Buffer<std::add_const_t<T>, Dims, InClassDimStorage> as_const() && {
        return *reinterpret_cast<Buffer<std::add_const_t<T>, Dims, InClassDimStorage> *>(this);
    }
    // @}

    /** Add some syntactic sugar to allow autoconversion from Buffer<T> to Buffer<const T>& when
     * passing arguments */
    template<typename T2 = T, typename = std::enable_if_t<!std::is_const_v<T2>>>
    operator Buffer<std::add_const_t<T2>, Dims, InClassDimStorage> &() & {
        return as_const();
    }

    /** Add some syntactic sugar to allow autoconversion from Buffer<T> to Buffer<void>& when
     * passing arguments */
    template<typename TVoid,
             typename T2 = T,
             typename = std::enable_if_t<std::is_same_v<TVoid, void> &&
                                         !std::is_void_v<T2> &&
                                         !std::is_const_v<T2>>>
    operator Buffer<TVoid, Dims, InClassDimStorage> &() & {
        return as<TVoid, Dims>();
    }

    /** Add some syntactic sugar to allow autoconversion from Buffer<const T> to Buffer<const void>& when
     * passing arguments */
    template<typename TVoid,
             typename T2 = T,
             typename = std::enable_if_t<std::is_same_v<TVoid, void> &&
                                         !std::is_void_v<T2> &&
                                         std::is_const_v<T2>>>
    operator Buffer<const TVoid, Dims, InClassDimStorage> &() & {
        return as<const TVoid, Dims>();
    }

    /** Conventional names for the first three dimensions. */
    // @{
    int width() const {
        return (dimensions() > 0) ? dim(0).extent() : 1;
    }
    int height() const {
        return (dimensions() > 1) ? dim(1).extent() : 1;
    }
    int channels() const {
        return (dimensions() > 2) ? dim(2).extent() : 1;
    }
    // @}

    /** Conventional names for the min and max value of each dimension */
    // @{
    int left() const {
        return dim(0).min();
    }

    int right() const {
        return dim(0).max();
    }

    int top() const {
        return dim(1).min();
    }

    int bottom() const {
        return dim(1).max();
    }
    // @}

    /** Make a new image which is a deep copy of this image. Use crop
     * or slice followed by copy to make a copy of only a portion of
     * the image. The new image has the same nesting order of dimensions
     * (e.g. channels innermost), but resets the strides to the default
     * (each stride is the product of the extents of the inner dimensions).
     * Note that this means any strides of zero get broadcast into a non-zero stride.
     *
     * Note that the returned Buffer is always of a non-const type T (ie:
     *
     *     Buffer<const T>.copy() -> Buffer<T> rather than Buffer<const T>
     *
     * which is always safe, since we are making a deep copy. (The caller
     * can easily cast it back to Buffer<const T> if desired, which is
     * always safe and free.)
     */
    Buffer<not_const_T, Dims, InClassDimStorage> copy(void *(*allocate_fn)(size_t) = nullptr,
                                                      void (*deallocate_fn)(void *) = nullptr) const {
        Buffer<not_const_T, Dims, InClassDimStorage> dst = Buffer<not_const_T, Dims, InClassDimStorage>::make_with_shape_of(*this, allocate_fn, deallocate_fn);
        dst.copy_from(*this);
        return dst;
    }

    /** Like copy(), but the copy is created in interleaved memory layout
     * (vs. keeping the same memory layout as the original). Requires that 'this'
     * has exactly 3 dimensions.
     */
    Buffer<not_const_T, Dims, InClassDimStorage> copy_to_interleaved(void *(*allocate_fn)(size_t) = nullptr,
                                                                     void (*deallocate_fn)(void *) = nullptr) const {
        static_assert(Dims == AnyDims || Dims == 3);
        assert(dimensions() == 3);
        Buffer<not_const_T, Dims, InClassDimStorage> dst = Buffer<not_const_T, Dims, InClassDimStorage>::make_interleaved(nullptr, width(), height(), channels());
        dst.set_min(min(0), min(1), min(2));
        dst.allocate(allocate_fn, deallocate_fn);
        dst.copy_from(*this);
        return dst;
    }

    /** Like copy(), but the copy is created in planar memory layout
     * (vs. keeping the same memory layout as the original).
     */
    Buffer<not_const_T, Dims, InClassDimStorage> copy_to_planar(void *(*allocate_fn)(size_t) = nullptr,
                                                                void (*deallocate_fn)(void *) = nullptr) const {
        std::vector<int> mins, extents;
        const int dims = dimensions();
        mins.reserve(dims);
        extents.reserve(dims);
        for (int d = 0; d < dims; ++d) {
            mins.push_back(dim(d).min());
            extents.push_back(dim(d).extent());
        }
        Buffer<not_const_T, Dims, InClassDimStorage> dst = Buffer<not_const_T, Dims, InClassDimStorage>(nullptr, extents);
        dst.set_min(mins);
        dst.allocate(allocate_fn, deallocate_fn);
        dst.copy_from(*this);
        return dst;
    }

    /** Make a copy of the Buffer which shares the underlying host and/or device
     * allocations as the existing Buffer. This is purely syntactic sugar for
     * cases where you have a const reference to a Buffer but need a temporary
     * non-const copy (e.g. to make a call into AOT-generated Halide code), and want a terse
     * inline way to create a temporary. \code
     * void call_my_func(const Buffer<const uint8_t>& input) {
     *     my_func(input.alias(), output);
     * }\endcode
     */
    Buffer<T, Dims, InClassDimStorage> alias() const {
        return *this;
    }

    /** Fill a Buffer with the values at the same coordinates in
     * another Buffer. Restricts itself to coordinates contained
     * within the intersection of the two buffers. If the two Buffers
     * are not in the same coordinate system, you will need to
     * translate the argument Buffer first. E.g. if you're blitting a
     * sprite onto a framebuffer, you'll want to translate the sprite
     * to the correct location first like so: \code
     * framebuffer.copy_from(sprite.translated({x, y})); \endcode
     */
    template<typename T2, int D2, int S2>
    void copy_from(Buffer<T2, D2, S2> src) {
        static_assert(!std::is_const_v<T>, "Cannot call copy_from() on a Buffer<const T>");
        assert(!device_dirty() && "Cannot call Halide::Runtime::Buffer::copy_from on a device dirty destination.");
        assert(!src.device_dirty() && "Cannot call Halide::Runtime::Buffer::copy_from on a device dirty source.");

        Buffer<T, Dims, InClassDimStorage> dst(*this);

        static_assert(Dims == AnyDims || D2 == AnyDims || Dims == D2);
        assert(src.dimensions() == dst.dimensions());

        // Trim the copy to the region in common
        const int d = dimensions();
        for (int i = 0; i < d; i++) {
            int min_coord = std::max(dst.dim(i).min(), src.dim(i).min());
            int max_coord = std::min(dst.dim(i).max(), src.dim(i).max());
            if (max_coord < min_coord) {
                // The buffers do not overlap.
                return;
            }
            dst.crop(i, min_coord, max_coord - min_coord + 1);
            src.crop(i, min_coord, max_coord - min_coord + 1);
        }

        // If T is void, we need to do runtime dispatch to an
        // appropriately-typed lambda. We're copying, so we only care
        // about the element size. (If not, this should optimize away
        // into a static dispatch to the right-sized copy.)
        if (T_is_void ? (type().bytes() == 1) : (sizeof(not_void_T) == 1)) {
            using MemType = uint8_t;
            auto &typed_dst = reinterpret_cast<Buffer<MemType, Dims, InClassDimStorage> &>(dst);
            auto &typed_src = reinterpret_cast<Buffer<const MemType, D2, S2> &>(src);
            typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
        } else if (T_is_void ? (type().bytes() == 2) : (sizeof(not_void_T) == 2)) {
            using MemType = uint16_t;
            auto &typed_dst = reinterpret_cast<Buffer<MemType, Dims, InClassDimStorage> &>(dst);
            auto &typed_src = reinterpret_cast<Buffer<const MemType, D2, S2> &>(src);
            typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
        } else if (T_is_void ? (type().bytes() == 4) : (sizeof(not_void_T) == 4)) {
            using MemType = uint32_t;
            auto &typed_dst = reinterpret_cast<Buffer<MemType, Dims, InClassDimStorage> &>(dst);
            auto &typed_src = reinterpret_cast<Buffer<const MemType, D2, S2> &>(src);
            typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
        } else if (T_is_void ? (type().bytes() == 8) : (sizeof(not_void_T) == 8)) {
            using MemType = uint64_t;
            auto &typed_dst = reinterpret_cast<Buffer<MemType, Dims, InClassDimStorage> &>(dst);
            auto &typed_src = reinterpret_cast<Buffer<const MemType, D2, S2> &>(src);
            typed_dst.for_each_value([&](MemType &dst, MemType src) { dst = src; }, typed_src);
        } else {
            assert(false && "type().bytes() must be 1, 2, 4, or 8");
        }
        set_host_dirty();
    }

    /** Make an image that refers to a sub-range of this image along
     * the given dimension. Asserts that the crop region is within
     * the existing bounds: you cannot "crop outwards", even if you know there
     * is valid Buffer storage (e.g. because you already cropped inwards). */
    Buffer<T, Dims, InClassDimStorage> cropped(int d, int min, int extent) const {
        // Make a fresh copy of the underlying buffer (but not a fresh
        // copy of the allocation, if there is one).
        Buffer<T, Dims, InClassDimStorage> im = *this;

        // This guarantees the prexisting device ref is dropped if the
        // device_crop call fails and maintains the buffer in a consistent
        // state.
        im.device_deallocate();

        im.crop_host(d, min, extent);
        if (buf.device_interface != nullptr) {
            complete_device_crop(im);
        }
        return im;
    }

    /** Crop an image in-place along the given dimension. This does
     * not move any data around in memory - it just changes the min
     * and extent of the given dimension. */
    void crop(int d, int min, int extent) {
        // An optimization for non-device buffers. For the device case,
        // a temp buffer is required, so reuse the not-in-place version.
        // TODO(zalman|abadams): Are nop crops common enough to special
        // case the device part of the if to do nothing?
        if (buf.device_interface != nullptr) {
            *this = cropped(d, min, extent);
        } else {
            crop_host(d, min, extent);
        }
    }

    /** Make an image that refers to a sub-rectangle of this image along
     * the first N dimensions. Asserts that the crop region is within
     * the existing bounds. The cropped image may drop any device handle
     * if the device_interface cannot accomplish the crop in-place. */
    Buffer<T, Dims, InClassDimStorage> cropped(const std::vector<std::pair<int, int>> &rect) const {
        // Make a fresh copy of the underlying buffer (but not a fresh
        // copy of the allocation, if there is one).
        Buffer<T, Dims, InClassDimStorage> im = *this;

        // This guarantees the prexisting device ref is dropped if the
        // device_crop call fails and maintains the buffer in a consistent
        // state.
        im.device_deallocate();

        im.crop_host(rect);
        if (buf.device_interface != nullptr) {
            complete_device_crop(im);
        }
        return im;
    }

    /** Crop an image in-place along the first N dimensions. This does
     * not move any data around in memory, nor does it free memory. It
     * just rewrites the min/extent of each dimension to refer to a
     * subregion of the same allocation. */
    void crop(const std::vector<std::pair<int, int>> &rect) {
        // An optimization for non-device buffers. For the device case,
        // a temp buffer is required, so reuse the not-in-place version.
        // TODO(zalman|abadams): Are nop crops common enough to special
        // case the device part of the if to do nothing?
        if (buf.device_interface != nullptr) {
            *this = cropped(rect);
        } else {
            crop_host(rect);
        }
    }

    /** Make an image which refers to the same data with using
     * translated coordinates in the given dimension. Positive values
     * move the image data to the right or down relative to the
     * coordinate system. Drops any device handle. */
    Buffer<T, Dims, InClassDimStorage> translated(int d, int dx) const {
        Buffer<T, Dims, InClassDimStorage> im = *this;
        im.translate(d, dx);
        return im;
    }

    /** Translate an image in-place along one dimension by changing
     * how it is indexed. Does not move any data around in memory. */
    void translate(int d, int delta) {
        assert(d >= 0 && d < this->dimensions());
        device_deallocate();
        buf.dim[d].min += delta;
    }

    /** Make an image which refers to the same data translated along
     * the first N dimensions. */
    Buffer<T, Dims, InClassDimStorage> translated(const std::vector<int> &delta) const {
        Buffer<T, Dims, InClassDimStorage> im = *this;
        im.translate(delta);
        return im;
    }

    /** Translate an image along the first N dimensions by changing
     * how it is indexed. Does not move any data around in memory. */
    void translate(const std::vector<int> &delta) {
        device_deallocate();
        assert(delta.size() <= static_cast<decltype(delta.size())>(std::numeric_limits<int>::max()));
        int limit = (int)delta.size();
        assert(limit <= dimensions());
        for (int i = 0; i < limit; i++) {
            translate(i, delta[i]);
        }
    }

    /** Set the min coordinate of an image in the first N dimensions. */
    // @{
    void set_min(const std::vector<int> &mins) {
        assert(mins.size() <= static_cast<decltype(mins.size())>(dimensions()));
        device_deallocate();
        for (size_t i = 0; i < mins.size(); i++) {
            buf.dim[i].min = mins[i];
        }
    }

    template<typename... Args>
    void set_min(Args... args) {
        set_min(std::vector<int>{args...});
    }
    // @}

    /** Test if a given coordinate is within the bounds of an image. */
    // @{
    bool contains(const std::vector<int> &coords) const {
        assert(coords.size() <= static_cast<decltype(coords.size())>(dimensions()));
        for (size_t i = 0; i < coords.size(); i++) {
            if (coords[i] < dim((int)i).min() || coords[i] > dim((int)i).max()) {
                return false;
            }
        }
        return true;
    }

    template<typename... Args>
    bool contains(Args... args) const {
        return contains(std::vector<int>{args...});
    }
    // @}

    /** Make a buffer which refers to the same data in the same layout
     * using a swapped indexing order for the dimensions given. So
     * A = B.transposed(0, 1) means that A(i, j) == B(j, i), and more
     * strongly that A.address_of(i, j) == B.address_of(j, i). */
    Buffer<T, Dims, InClassDimStorage> transposed(int d1, int d2) const {
        Buffer<T, Dims, InClassDimStorage> im = *this;
        im.transpose(d1, d2);
        return im;
    }

    /** Transpose a buffer in-place by changing how it is indexed. For
     * example, transpose(0, 1) on a two-dimensional buffer means that
     * the value referred to by coordinates (i, j) is now reached at
     * the coordinates (j, i), and vice versa. This is done by
     * reordering the per-dimension metadata rather than by moving
     * data around in memory, so other views of the same memory will
     * not see the data as having been transposed. */
    void transpose(int d1, int d2) {
        assert(d1 >= 0 && d1 < this->dimensions());
        assert(d2 >= 0 && d2 < this->dimensions());
        std::swap(buf.dim[d1], buf.dim[d2]);
    }

    /** A generalized transpose: instead of swapping two dimensions,
     * pass a vector that lists each dimension index exactly once, in
     * the desired order. This does not move any data around in memory
     * - it just permutes how it is indexed. */
    void transpose(const std::vector<int> &order) {
        assert((int)order.size() == dimensions());
        if (dimensions() < 2) {
            // My, that was easy
            return;
        }

        std::vector<int> order_sorted = order;
        for (size_t i = 1; i < order_sorted.size(); i++) {
            for (size_t j = i; j > 0 && order_sorted[j - 1] > order_sorted[j]; j--) {
                std::swap(order_sorted[j], order_sorted[j - 1]);
                transpose(j, j - 1);
            }
        }
    }

    /** Make a buffer which refers to the same data in the same
     * layout using a different ordering of the dimensions. */
    Buffer<T, Dims, InClassDimStorage> transposed(const std::vector<int> &order) const {
        Buffer<T, Dims, InClassDimStorage> im = *this;
        im.transpose(order);
        return im;
    }

    /** Make a lower-dimensional buffer that refers to one slice of
     * this buffer. */
    Buffer<T, (Dims == AnyDims ? AnyDims : Dims - 1)>
    sliced(int d, int pos) const {
        static_assert(Dims == AnyDims || Dims > 0, "Cannot slice a 0-dimensional buffer");
        assert(dimensions() > 0);

        Buffer<T, AnyDims, InClassDimStorage> im = *this;

        // This guarantees the prexisting device ref is dropped if the
        // device_slice call fails and maintains the buffer in a consistent
        // state.
        im.device_deallocate();

        im.slice_host(d, pos);
        if (buf.device_interface != nullptr) {
            complete_device_slice(im, d, pos);
        }
        return im;
    }

    /** Make a lower-dimensional buffer that refers to one slice of this
     * buffer at the dimension's minimum. */
    Buffer<T, (Dims == AnyDims ? AnyDims : Dims - 1)>
    sliced(int d) const {
        static_assert(Dims == AnyDims || Dims > 0, "Cannot slice a 0-dimensional buffer");
        assert(dimensions() > 0);

        return sliced(d, dim(d).min());
    }

    /** Rewrite the buffer to refer to a single lower-dimensional
     * slice of itself along the given dimension at the given
     * coordinate. Does not move any data around or free the original
     * memory, so other views of the same data are unaffected. Can
     * only be called on a Buffer with dynamic dimensionality. */
    void slice(int d, int pos) {
        static_assert(Dims == AnyDims, "Cannot call slice() on a Buffer with static dimensionality.");
        assert(dimensions() > 0);

        // An optimization for non-device buffers. For the device case,
        // a temp buffer is required, so reuse the not-in-place version.
        // TODO(zalman|abadams): Are nop slices common enough to special
        // case the device part of the if to do nothing?
        if (buf.device_interface != nullptr) {
            *this = sliced(d, pos);
        } else {
            slice_host(d, pos);
        }
    }

    /** Slice a buffer in-place at the dimension's minimum. */
    void slice(int d) {
        slice(d, dim(d).min());
    }

    /** Make a new buffer that views this buffer as a single slice in a
     * higher-dimensional space. The new dimension has extent one and
     * the given min. This operation is the opposite of slice. As an
     * example, the following condition is true:
     *
     \code
     im2 = im.embedded(1, 17);
     &im(x, y, c) == &im2(x, 17, y, c);
     \endcode
     */
    Buffer<T, (Dims == AnyDims ? AnyDims : Dims + 1)>
    embedded(int d, int pos = 0) const {
        Buffer<T, AnyDims, InClassDimStorage> im(*this);
        im.embed(d, pos);
        return im;
    }

    /** Embed a buffer in-place, increasing the
     * dimensionality. */
    void embed(int d, int pos = 0) {
        static_assert(Dims == AnyDims, "Cannot call embed() on a Buffer with static dimensionality.");
        assert(d >= 0 && d <= dimensions());
        add_dimension();
        translate(dimensions() - 1, pos);
        for (int i = dimensions() - 1; i > d; i--) {
            transpose(i, i - 1);
        }
    }

    /** Add a new dimension with a min of zero and an extent of
     * one. The stride is the extent of the outermost dimension times
     * its stride. The new dimension is the last dimension. This is a
     * special case of embed. */
    void add_dimension() {
        static_assert(Dims == AnyDims, "Cannot call add_dimension() on a Buffer with static dimensionality.");
        const int dims = buf.dimensions;
        buf.dimensions++;
        if (buf.dim != shape) {
            // We're already on the heap. Reallocate.
            halide_dimension_t *new_shape = new halide_dimension_t[buf.dimensions];
            for (int i = 0; i < dims; i++) {
                new_shape[i] = buf.dim[i];
            }
            delete[] buf.dim;
            buf.dim = new_shape;
        } else if (dims == InClassDimStorage) {
            // Transition from the in-class storage to the heap
            make_shape_storage(buf.dimensions);
            for (int i = 0; i < dims; i++) {
                buf.dim[i] = shape[i];
            }
        } else {
            // We still fit in the class
        }
        buf.dim[dims] = {0, 1, 0};
        if (dims == 0) {
            buf.dim[dims].stride = 1;
        } else {
            buf.dim[dims].stride = buf.dim[dims - 1].extent * buf.dim[dims - 1].stride;
        }
    }

    /** Add a new dimension with a min of zero, an extent of one, and
     * the specified stride. The new dimension is the last
     * dimension. This is a special case of embed. */
    void add_dimension_with_stride(int s) {
        add_dimension();
        buf.dim[buf.dimensions - 1].stride = s;
    }

    /** Methods for managing any GPU allocation. */
    // @{
    // Set the host dirty flag. Called by every operator()
    // access. Must be inlined so it can be hoisted out of loops.
    HALIDE_ALWAYS_INLINE
    void set_host_dirty(bool v = true) {
        assert((!v || !device_dirty()) && "Cannot set host dirty when device is already dirty. Call copy_to_host() before accessing the buffer from host.");
        buf.set_host_dirty(v);
    }

    // Check if the device allocation is dirty. Called by
    // set_host_dirty, which is called by every accessor. Must be
    // inlined so it can be hoisted out of loops.
    HALIDE_ALWAYS_INLINE
    bool device_dirty() const {
        return buf.device_dirty();
    }

    bool host_dirty() const {
        return buf.host_dirty();
    }

    void set_device_dirty(bool v = true) {
        assert((!v || !host_dirty()) && "Cannot set device dirty when host is already dirty.");
        buf.set_device_dirty(v);
    }

    int copy_to_host(void *ctx = nullptr) {
        if (device_dirty()) {
            return buf.device_interface->copy_to_host(ctx, &buf);
        }
        return halide_error_code_success;
    }

    int copy_to_device(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
        if (host_dirty()) {
            return device_interface->copy_to_device(ctx, &buf, device_interface);
        }
        return halide_error_code_success;
    }

    int device_malloc(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
        return device_interface->device_malloc(ctx, &buf, device_interface);
    }

    int device_free(void *ctx = nullptr) {
        if (dev_ref_count) {
            assert(dev_ref_count->ownership == BufferDeviceOwnership::Allocated &&
                   "Can't call device_free on an unmanaged or wrapped native device handle. "
                   "Free the source allocation or call device_detach_native instead.");
            // Multiple people may be holding onto this dev field
            assert(dev_ref_count->count == 1 &&
                   "Multiple Halide::Runtime::Buffer objects share this device "
                   "allocation. Freeing it would create dangling references. "
                   "Don't call device_free on Halide buffers that you have copied or "
                   "passed by value.");
        }
        int ret = halide_error_code_success;
        if (buf.device_interface) {
            ret = buf.device_interface->device_free(ctx, &buf);
        }
        if (dev_ref_count) {
            delete dev_ref_count;
            dev_ref_count = nullptr;
        }
        return ret;
    }

    int device_wrap_native(const struct halide_device_interface_t *device_interface,
                           uint64_t handle, void *ctx = nullptr) {
        assert(device_interface);
        dev_ref_count = new DeviceRefCount;
        dev_ref_count->ownership = BufferDeviceOwnership::WrappedNative;
        return device_interface->wrap_native(ctx, &buf, handle, device_interface);
    }

    int device_detach_native(void *ctx = nullptr) {
        assert(dev_ref_count &&
               dev_ref_count->ownership == BufferDeviceOwnership::WrappedNative &&
               "Only call device_detach_native on buffers wrapping a native "
               "device handle via device_wrap_native. This buffer was allocated "
               "using device_malloc, or is unmanaged. "
               "Call device_free or free the original allocation instead.");
        // Multiple people may be holding onto this dev field
        assert(dev_ref_count->count == 1 &&
               "Multiple Halide::Runtime::Buffer objects share this device "
               "allocation. Freeing it could create dangling references. "
               "Don't call device_detach_native on Halide buffers that you "
               "have copied or passed by value.");
        int ret = halide_error_code_success;
        if (buf.device_interface) {
            ret = buf.device_interface->detach_native(ctx, &buf);
        }
        delete dev_ref_count;
        dev_ref_count = nullptr;
        return ret;
    }

    int device_and_host_malloc(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
        return device_interface->device_and_host_malloc(ctx, &buf, device_interface);
    }

    int device_and_host_free(const struct halide_device_interface_t *device_interface, void *ctx = nullptr) {
        if (dev_ref_count) {
            assert(dev_ref_count->ownership == BufferDeviceOwnership::AllocatedDeviceAndHost &&
                   "Can't call device_and_host_free on a device handle not allocated with device_and_host_malloc. "
                   "Free the source allocation or call device_detach_native instead.");
            // Multiple people may be holding onto this dev field
            assert(dev_ref_count->count == 1 &&
                   "Multiple Halide::Runtime::Buffer objects share this device "
                   "allocation. Freeing it would create dangling references. "
                   "Don't call device_and_host_free on Halide buffers that you have copied or "
                   "passed by value.");
        }
        int ret = halide_error_code_success;
        if (buf.device_interface) {
            ret = buf.device_interface->device_and_host_free(ctx, &buf);
        }
        if (dev_ref_count) {
            delete dev_ref_count;
            dev_ref_count = nullptr;
        }
        return ret;
    }

    int device_sync(void *ctx = nullptr) {
        return buf.device_sync(ctx);
    }

    bool has_device_allocation() const {
        return buf.device != 0;
    }

    /** Return the method by which the device field is managed. */
    BufferDeviceOwnership device_ownership() const {
        if (dev_ref_count == nullptr) {
            return BufferDeviceOwnership::Allocated;
        }
        return dev_ref_count->ownership;
    }
    // @}

    /** If you use the (x, y, c) indexing convention, then Halide
     * Buffers are stored planar by default. This function constructs
     * an interleaved RGB or RGBA image that can still be indexed
     * using (x, y, c). Passing it to a generator requires that the
     * generator has been compiled with support for interleaved (also
     * known as packed or chunky) memory layouts. */
    static Buffer<void, Dims, InClassDimStorage> make_interleaved(halide_type_t t, int width, int height, int channels) {
        static_assert(Dims == AnyDims || Dims == 3, "make_interleaved() must be called on a Buffer that can represent 3 dimensions.");
        Buffer<void, Dims, InClassDimStorage> im(t, channels, width, height);
        // Note that this is equivalent to calling transpose({2, 0, 1}),
        // but slightly more efficient.
        im.transpose(0, 1);
        im.transpose(1, 2);
        return im;
    }

    /** If you use the (x, y, c) indexing convention, then Halide
     * Buffers are stored planar by default. This function constructs
     * an interleaved RGB or RGBA image that can still be indexed
     * using (x, y, c). Passing it to a generator requires that the
     * generator has been compiled with support for interleaved (also
     * known as packed or chunky) memory layouts. */
    static Buffer<T, Dims, InClassDimStorage> make_interleaved(int width, int height, int channels) {
        return make_interleaved(static_halide_type(), width, height, channels);
    }

    /** Wrap an existing interleaved image. */
    static Buffer<add_const_if_T_is_const<void>, Dims, InClassDimStorage>
    make_interleaved(halide_type_t t, T *data, int width, int height, int channels) {
        static_assert(Dims == AnyDims || Dims == 3, "make_interleaved() must be called on a Buffer that can represent 3 dimensions.");
        Buffer<add_const_if_T_is_const<void>, Dims, InClassDimStorage> im(t, data, channels, width, height);
        im.transpose(0, 1);
        im.transpose(1, 2);
        return im;
    }

    /** Wrap an existing interleaved image. */
    static Buffer<T, Dims, InClassDimStorage> make_interleaved(T *data, int width, int height, int channels) {
        return make_interleaved(static_halide_type(), data, width, height, channels);
    }

    /** Make a zero-dimensional Buffer */
    static Buffer<add_const_if_T_is_const<void>, Dims, InClassDimStorage> make_scalar(halide_type_t t) {
        static_assert(Dims == AnyDims || Dims == 0, "make_scalar() must be called on a Buffer that can represent 0 dimensions.");
        Buffer<add_const_if_T_is_const<void>, AnyDims, InClassDimStorage> buf(t, 1);
        buf.slice(0, 0);
        return buf;
    }

    /** Make a zero-dimensional Buffer */
    static Buffer<T, Dims, InClassDimStorage> make_scalar() {
        static_assert(Dims == AnyDims || Dims == 0, "make_scalar() must be called on a Buffer that can represent 0 dimensions.");
        Buffer<T, AnyDims, InClassDimStorage> buf(1);
        buf.slice(0, 0);
        return buf;
    }

    /** Make a zero-dimensional Buffer that points to non-owned, existing data */
    static Buffer<T, Dims, InClassDimStorage> make_scalar(T *data) {
        static_assert(Dims == AnyDims || Dims == 0, "make_scalar() must be called on a Buffer that can represent 0 dimensions.");
        Buffer<T, AnyDims, InClassDimStorage> buf(data, 1);
        buf.slice(0, 0);
        return buf;
    }

    /** Make a buffer with the same shape and memory nesting order as
     * another buffer. It may have a different type. */
    template<typename T2, int D2, int S2>
    // NOLINTNEXTLINE(performance-unnecessary-value-param)
    static Buffer<T, Dims, InClassDimStorage> make_with_shape_of(Buffer<T2, D2, S2> src,
                                                                 void *(*allocate_fn)(size_t) = nullptr,
                                                                 void (*deallocate_fn)(void *) = nullptr) {
        // Note that src is taken by value because its dims are mutated
        // in-place by the helper. Do not change to taking it by reference.
        static_assert(Dims == D2 || Dims == AnyDims);
        const halide_type_t dst_type = T_is_void ? src.type() : halide_type_of<std::remove_cv_t<not_void_T>>();
        return Buffer<>::make_with_shape_of_helper(dst_type, src.dimensions(), src.buf.dim,
                                                   allocate_fn, deallocate_fn);
    }

private:
    static Buffer<> make_with_shape_of_helper(halide_type_t dst_type,
                                              int dimensions,
                                              halide_dimension_t *shape,
                                              void *(*allocate_fn)(size_t),
                                              void (*deallocate_fn)(void *)) {
        // Reorder the dimensions of src to have strides in increasing order
        std::vector<int> swaps;
        for (int i = dimensions - 1; i > 0; i--) {
            for (int j = i; j > 0; j--) {
                if (shape[j - 1].stride > shape[j].stride) {
                    std::swap(shape[j - 1], shape[j]);
                    swaps.push_back(j);
                }
            }
        }

        // Rewrite the strides to be dense (this messes up src, which
        // is why we took it by value).
        for (int i = 0; i < dimensions; i++) {
            if (i == 0) {
                shape[i].stride = 1;
            } else {
                shape[i].stride = shape[i - 1].extent * shape[i - 1].stride;
            }
        }

        // Undo the dimension reordering
        while (!swaps.empty()) {
            int j = swaps.back();
            std::swap(shape[j - 1], shape[j]);
            swaps.pop_back();
        }

        // Use an explicit runtime type, and make dst a Buffer<void>, to allow
        // using this method with Buffer<void> for either src or dst.
        Buffer<> dst(dst_type, nullptr, dimensions, shape);
        dst.allocate(allocate_fn, deallocate_fn);

        return dst;
    }

    template<typename... Args>
    HALIDE_ALWAYS_INLINE
        ptrdiff_t
        offset_of(int d, int first, Args... rest) const {
#if HALIDE_RUNTIME_BUFFER_CHECK_INDICES
        assert(first >= this->buf.dim[d].min);
        assert(first < this->buf.dim[d].min + this->buf.dim[d].extent);
#endif
        return offset_of(d + 1, rest...) + (ptrdiff_t)this->buf.dim[d].stride * (first - this->buf.dim[d].min);
    }

    HALIDE_ALWAYS_INLINE
    ptrdiff_t offset_of(int d) const {
        return 0;
    }

    template<typename... Args>
    HALIDE_ALWAYS_INLINE storage_T *address_of(Args... args) const {
        if (T_is_void) {
            return (storage_T *)(this->buf.host) + offset_of(0, args...) * type().bytes();
        } else {
            return (storage_T *)(this->buf.host) + offset_of(0, args...);
        }
    }

    HALIDE_ALWAYS_INLINE
    ptrdiff_t offset_of(const int *pos) const {
        ptrdiff_t offset = 0;
        for (int i = this->dimensions() - 1; i >= 0; i--) {
#if HALIDE_RUNTIME_BUFFER_CHECK_INDICES
            assert(pos[i] >= this->buf.dim[i].min);
            assert(pos[i] < this->buf.dim[i].min + this->buf.dim[i].extent);
#endif
            offset += (ptrdiff_t)this->buf.dim[i].stride * (pos[i] - this->buf.dim[i].min);
        }
        return offset;
    }

    HALIDE_ALWAYS_INLINE
    storage_T *address_of(const int *pos) const {
        if (T_is_void) {
            return (storage_T *)this->buf.host + offset_of(pos) * type().bytes();
        } else {
            return (storage_T *)this->buf.host + offset_of(pos);
        }
    }

public:
    /** Get a pointer to the address of the min coordinate. */
    T *data() const {
        return (T *)(this->buf.host);
    }

    /** Access elements. Use im(...) to get a reference to an element,
     * and use &im(...) to get the address of an element. If you pass
     * fewer arguments than the buffer has dimensions, the rest are
     * treated as their min coordinate. The non-const versions set the
     * host_dirty flag to true.
     */
    //@{
    template<typename... Args,
             typename = std::enable_if_t<AllInts<Args...>::value>>
    HALIDE_ALWAYS_INLINE const not_void_T &operator()(int first, Args... rest) const {
        static_assert(!T_is_void,
                      "Cannot use operator() on Buffer<void> types");
        constexpr int expected_dims = 1 + (int)(sizeof...(rest));
        static_assert(Dims == AnyDims || Dims == expected_dims, "Buffer with static dimensions was accessed with the wrong number of coordinates in operator()");
        assert(!device_dirty());
        return *((const not_void_T *)(address_of(first, rest...)));
    }

    HALIDE_ALWAYS_INLINE
    const not_void_T &operator()() const {
        static_assert(!T_is_void,
                      "Cannot use operator() on Buffer<void> types");
        constexpr int expected_dims = 0;
        static_assert(Dims == AnyDims || Dims == expected_dims, "Buffer with static dimensions was accessed with the wrong number of coordinates in operator()");
        assert(!device_dirty());
        return *((const not_void_T *)(data()));
    }

    HALIDE_ALWAYS_INLINE
    const not_void_T &
    operator()(const int *pos) const {
        static_assert(!T_is_void,
                      "Cannot use operator() on Buffer<void> types");
        assert(!device_dirty());
        return *((const not_void_T *)(address_of(pos)));
    }

    template<typename... Args,
             typename = std::enable_if_t<AllInts<Args...>::value>>
    HALIDE_ALWAYS_INLINE not_void_T &operator()(int first, Args... rest) {
        static_assert(!T_is_void,
                      "Cannot use operator() on Buffer<void> types");
        constexpr int expected_dims = 1 + (int)(sizeof...(rest));
        static_assert(Dims == AnyDims || Dims == expected_dims, "Buffer with static dimensions was accessed with the wrong number of coordinates in operator()");
        set_host_dirty();
        return *((not_void_T *)(address_of(first, rest...)));
    }

    HALIDE_ALWAYS_INLINE
    not_void_T &
    operator()() {
        static_assert(!T_is_void,
                      "Cannot use operator() on Buffer<void> types");
        constexpr int expected_dims = 0;
        static_assert(Dims == AnyDims || Dims == expected_dims, "Buffer with static dimensions was accessed with the wrong number of coordinates in operator()");
        set_host_dirty();
        return *((not_void_T *)(data()));
    }

    HALIDE_ALWAYS_INLINE
    not_void_T &
    operator()(const int *pos) {
        static_assert(!T_is_void,
                      "Cannot use operator() on Buffer<void> types");
        set_host_dirty();
        return *((not_void_T *)(address_of(pos)));
    }
    // @}

    /** Tests that all values in this buffer are equal to val. */
    bool all_equal(not_void_T val) const {
        bool all_equal = true;
        for_each_element([&](const int *pos) { all_equal &= (*this)(pos) == val; });
        return all_equal;
    }

    Buffer<T, Dims, InClassDimStorage> &fill(not_void_T val) {
        set_host_dirty();
        for_each_value([=](T &v) { v = val; });
        return *this;
    }

private:
    /** Helper functions for for_each_value. */
    // @{
    template<int N>
    struct for_each_value_task_dim {
        std::ptrdiff_t extent;
        std::ptrdiff_t stride[N];
    };

    // Given an array of strides, and a bunch of pointers to pointers
    // (all of different types), advance the pointers using the
    // strides.
    template<typename Ptr, typename... Ptrs>
    HALIDE_ALWAYS_INLINE static void advance_ptrs(const std::ptrdiff_t *stride, Ptr &ptr, Ptrs &...ptrs) {
        ptr += *stride;
        advance_ptrs(stride + 1, ptrs...);
    }

    HALIDE_ALWAYS_INLINE
    static void advance_ptrs(const std::ptrdiff_t *) {
    }

    template<typename Fn, typename Ptr, typename... Ptrs>
    HALIDE_NEVER_INLINE static void for_each_value_helper(Fn &&f, int d, bool innermost_strides_are_one,
                                                          const for_each_value_task_dim<sizeof...(Ptrs) + 1> *t, Ptr ptr, Ptrs... ptrs) {
        if (d == 0) {
            if (innermost_strides_are_one) {
                Ptr end = ptr + t[0].extent;
                while (ptr != end) {
                    f(*ptr++, (*ptrs++)...);
                }
            } else {
                for (std::ptrdiff_t i = t[0].extent; i != 0; i--) {
                    f(*ptr, (*ptrs)...);
                    advance_ptrs(t[0].stride, ptr, ptrs...);
                }
            }
        } else {
            for (std::ptrdiff_t i = t[d].extent; i != 0; i--) {
                for_each_value_helper(f, d - 1, innermost_strides_are_one, t, ptr, ptrs...);
                advance_ptrs(t[d].stride, ptr, ptrs...);
            }
        }
    }

    // Return pair is <new_dimensions, innermost_strides_are_one>
    template<int N>
    HALIDE_NEVER_INLINE static std::pair<int, bool> for_each_value_prep(for_each_value_task_dim<N> *t,
                                                                        const halide_buffer_t **buffers) {
        const int dimensions = buffers[0]->dimensions;
        assert(dimensions > 0);

        // Check the buffers all have clean host allocations
        for (int i = 0; i < N; i++) {
            if (buffers[i]->device) {
                assert(buffers[i]->host &&
                       "Buffer passed to for_each_value has device allocation but no host allocation. Call allocate() and copy_to_host() first");
                assert(!buffers[i]->device_dirty() &&
                       "Buffer passed to for_each_value is dirty on device. Call copy_to_host() first");
            } else {
                assert(buffers[i]->host &&
                       "Buffer passed to for_each_value has no host or device allocation");
            }
        }

        // Extract the strides in all the dimensions
        for (int i = 0; i < dimensions; i++) {
            for (int j = 0; j < N; j++) {
                assert(buffers[j]->dimensions == dimensions);
                assert(buffers[j]->dim[i].extent == buffers[0]->dim[i].extent &&
                       buffers[j]->dim[i].min == buffers[0]->dim[i].min);
                const int s = buffers[j]->dim[i].stride;
                t[i].stride[j] = s;
            }
            t[i].extent = buffers[0]->dim[i].extent;

            // Order the dimensions by stride, so that the traversal is cache-coherent.
            // Use the last dimension for this, because this is the source in copies.
            // It appears to be better to optimize read order than write order.
            for (int j = i; j > 0 && t[j].stride[N - 1] < t[j - 1].stride[N - 1]; j--) {
                std::swap(t[j], t[j - 1]);
            }
        }

        // flatten dimensions where possible to make a larger inner
        // loop for autovectorization.
        int d = dimensions;
        for (int i = 1; i < d; i++) {
            bool flat = true;
            for (int j = 0; j < N; j++) {
                flat = flat && t[i - 1].stride[j] * t[i - 1].extent == t[i].stride[j];
            }
            if (flat) {
                t[i - 1].extent *= t[i].extent;
                for (int j = i; j < d - 1; j++) {
                    t[j] = t[j + 1];
                }
                i--;
                d--;
            }
        }

        // Note that we assert() that dimensions > 0 above
        // (our one-and-only caller will only call us that way)
        // so the unchecked access to t[0] should be safe.
        bool innermost_strides_are_one = true;
        for (int i = 0; i < N; i++) {
            innermost_strides_are_one &= (t[0].stride[i] == 1);
        }

        return {d, innermost_strides_are_one};
    }

    template<typename Fn, typename... Args, int N = sizeof...(Args) + 1>
    void for_each_value_impl(Fn &&f, Args &&...other_buffers) const {
        if (dimensions() > 0) {
            const size_t alloc_size = dimensions() * sizeof(for_each_value_task_dim<N>);
            Buffer<>::for_each_value_task_dim<N> *t =
                (Buffer<>::for_each_value_task_dim<N> *)HALIDE_ALLOCA(alloc_size);
            // Move the preparatory code into a non-templated helper to
            // save code size.
            const halide_buffer_t *buffers[] = {&buf, (&other_buffers.buf)...};
            auto [new_dims, innermost_strides_are_one] = Buffer<>::for_each_value_prep(t, buffers);
            if (new_dims > 0) {
                Buffer<>::for_each_value_helper(f, new_dims - 1,
                                                innermost_strides_are_one,
                                                t,
                                                data(), (other_buffers.data())...);
                return;
            }
            // else fall thru
        }

        // zero-dimensional case
        f(*data(), (*other_buffers.data())...);
    }
    // @}

public:
    /** Call a function on every value in the buffer, and the
     * corresponding values in some number of other buffers of the
     * same size. The function should take a reference, const
     * reference, or value of the correct type for each buffer. This
     * effectively lifts a function of scalars to an element-wise
     * function of buffers. This produces code that the compiler can
     * autovectorize. This is slightly cheaper than for_each_element,
     * because it does not need to track the coordinates.
     *
     * Note that constness of Buffers is preserved: a const Buffer<T> (for either
     * 'this' or the other-buffers arguments) will allow mutation of the
     * buffer contents, while a Buffer<const T> will not. Attempting to specify
     * a mutable reference for the lambda argument of a Buffer<const T>
     * will result in a compilation error. */
    // @{
    template<typename Fn, typename... Args, int N = sizeof...(Args) + 1>
    HALIDE_ALWAYS_INLINE const Buffer<T, Dims, InClassDimStorage> &for_each_value(Fn &&f, Args &&...other_buffers) const {
        for_each_value_impl(f, std::forward<Args>(other_buffers)...);
        return *this;
    }

    template<typename Fn, typename... Args, int N = sizeof...(Args) + 1>
    HALIDE_ALWAYS_INLINE
        Buffer<T, Dims, InClassDimStorage> &
        for_each_value(Fn &&f, Args &&...other_buffers) {
        for_each_value_impl(f, std::forward<Args>(other_buffers)...);
        return *this;
    }
    // @}

private:
    // Helper functions for for_each_element
    struct for_each_element_task_dim {
        int min, max;
    };

    /** If f is callable with this many args, call it. The first
     * argument is just to make the overloads distinct. Actual
     * overload selection is done using the enable_if. */
    template<typename Fn,
             typename... Args,
             typename = decltype(std::declval<Fn>()(std::declval<Args>()...))>
    HALIDE_ALWAYS_INLINE static void for_each_element_variadic(int, int, const for_each_element_task_dim *, Fn &&f, Args... args) {
        f(args...);
    }

    /** If the above overload is impossible, we add an outer loop over
     * an additional argument and try again. */
    template<typename Fn,
             typename... Args>
    HALIDE_ALWAYS_INLINE static void for_each_element_variadic(double, int d, const for_each_element_task_dim *t, Fn &&f, Args... args) {
        for (int i = t[d].min; i <= t[d].max; i++) {
            for_each_element_variadic(0, d - 1, t, std::forward<Fn>(f), i, args...);
        }
    }

    /** Determine the minimum number of arguments a callable can take
     * using the same trick. */
    template<typename Fn,
             typename... Args,
             typename = decltype(std::declval<Fn>()(std::declval<Args>()...))>
    HALIDE_ALWAYS_INLINE static int num_args(int, Fn &&, Args...) {
        return (int)(sizeof...(Args));
    }

    /** The recursive version is only enabled up to a recursion limit
     * of 256. This catches callables that aren't callable with any
     * number of ints. */
    template<typename Fn,
             typename... Args>
    HALIDE_ALWAYS_INLINE static int num_args(double, Fn &&f, Args... args) {
        static_assert(sizeof...(args) <= 256,
                      "Callable passed to for_each_element must accept either a const int *,"
                      " or up to 256 ints. No such operator found. Expect infinite template recursion.");
        return num_args(0, std::forward<Fn>(f), 0, args...);
    }

    /** A version where the callable takes a position array instead,
     * with compile-time recursion on the dimensionality.  This
     * overload is preferred to the one below using the same int vs
     * double trick as above, but is impossible once d hits -1 using
     * std::enable_if. */
    template<int d,
             typename Fn,
             typename = std::enable_if_t<(d >= 0)>>
    HALIDE_ALWAYS_INLINE static void for_each_element_array_helper(int, const for_each_element_task_dim *t, Fn &&f, int *pos) {
        for (pos[d] = t[d].min; pos[d] <= t[d].max; pos[d]++) {
            for_each_element_array_helper<d - 1>(0, t, f, pos);
        }
    }

    /** Base case for recursion above. */
    template<int d,
             typename Fn,
             typename = std::enable_if_t<(d < 0)>>
    HALIDE_ALWAYS_INLINE static void for_each_element_array_helper(double, const for_each_element_task_dim *t, Fn &&f, int *pos) {
        f(pos);
    }

    /** A run-time-recursive version (instead of
     * compile-time-recursive) that requires the callable to take a
     * pointer to a position array instead. Dispatches to the
     * compile-time-recursive version once the dimensionality gets
     * small. */
    template<typename Fn>
    static void for_each_element_array(int d, const for_each_element_task_dim *t, Fn &&f, int *pos) {
        if (d == -1) {
            f(pos);
        } else if (d == 0) {
            // Once the dimensionality gets small enough, dispatch to
            // a compile-time-recursive version for better codegen of
            // the inner loops.
            for_each_element_array_helper<0, Fn>(0, t, std::forward<Fn>(f), pos);
        } else if (d == 1) {
            for_each_element_array_helper<1, Fn>(0, t, std::forward<Fn>(f), pos);
        } else if (d == 2) {
            for_each_element_array_helper<2, Fn>(0, t, std::forward<Fn>(f), pos);
        } else if (d == 3) {
            for_each_element_array_helper<3, Fn>(0, t, std::forward<Fn>(f), pos);
        } else {
            for (pos[d] = t[d].min; pos[d] <= t[d].max; pos[d]++) {
                for_each_element_array(d - 1, t, f, pos);
            }
        }
    }

    /** We now have two overloads for for_each_element. This one
     * triggers if the callable takes a const int *.
     */
    template<typename Fn,
             typename = decltype(std::declval<Fn>()((const int *)nullptr))>
    static void for_each_element(int, int dims, const for_each_element_task_dim *t, Fn &&f, int check = 0) {
        const int size = dims * sizeof(int);
        int *pos = (int *)HALIDE_ALLOCA(size);
        // At least one version of GCC will (incorrectly) report that pos "may be used uninitialized".
        // Add this memset to silence it.
        memset(pos, 0, size);
        for_each_element_array(dims - 1, t, std::forward<Fn>(f), pos);
    }

    /** This one triggers otherwise. It treats the callable as
     * something that takes some number of ints. */
    template<typename Fn>
    HALIDE_ALWAYS_INLINE static void for_each_element(double, int dims, const for_each_element_task_dim *t, Fn &&f) {
        int args = num_args(0, std::forward<Fn>(f));
        assert(dims >= args);
        for_each_element_variadic(0, args - 1, t, std::forward<Fn>(f));
    }

    template<typename Fn>
    void for_each_element_impl(Fn &&f) const {
        for_each_element_task_dim *t =
            (for_each_element_task_dim *)HALIDE_ALLOCA(dimensions() * sizeof(for_each_element_task_dim));
        for (int i = 0; i < dimensions(); i++) {
            t[i].min = dim(i).min();
            t[i].max = dim(i).max();
        }
        for_each_element(0, dimensions(), t, std::forward<Fn>(f));
    }

public:
    /** Call a function at each site in a buffer. This is likely to be
     * much slower than using Halide code to populate a buffer, but is
     * convenient for tests. If the function has more arguments than the
     * buffer has dimensions, the remaining arguments will be zero. If it
     * has fewer arguments than the buffer has dimensions then the last
     * few dimensions of the buffer are not iterated over. For example,
     * the following code exploits this to set a floating point RGB image
     * to red:

     \code
     Buffer<float, 3> im(100, 100, 3);
     im.for_each_element([&](int x, int y) {
         im(x, y, 0) = 1.0f;
         im(x, y, 1) = 0.0f;
         im(x, y, 2) = 0.0f:
     });
     \endcode

     * The compiled code is equivalent to writing the a nested for loop,
     * and compilers are capable of optimizing it in the same way.
     *
     * If the callable can be called with an int * as the sole argument,
     * that version is called instead. Each location in the buffer is
     * passed to it in a coordinate array. This version is higher-overhead
     * than the variadic version, but is useful for writing generic code
     * that accepts buffers of arbitrary dimensionality. For example, the
     * following sets the value at all sites in an arbitrary-dimensional
     * buffer to their first coordinate:

     \code
     im.for_each_element([&](const int *pos) {im(pos) = pos[0];});
     \endcode

     * It is also possible to use for_each_element to iterate over entire
     * rows or columns by cropping the buffer to a single column or row
     * respectively and iterating over elements of the result. For example,
     * to set the diagonal of the image to 1 by iterating over the columns:

     \code
     Buffer<float, 3> im(100, 100, 3);
         im.sliced(1, 0).for_each_element([&](int x, int c) {
         im(x, x, c) = 1.0f;
     });
     \endcode

     * Or, assuming the memory layout is known to be dense per row, one can
     * memset each row of an image like so:

     \code
     Buffer<float, 3> im(100, 100, 3);
     im.sliced(0, 0).for_each_element([&](int y, int c) {
         memset(&im(0, y, c), 0, sizeof(float) * im.width());
     });
     \endcode

    */
    // @{
    template<typename Fn>
    HALIDE_ALWAYS_INLINE const Buffer<T, Dims, InClassDimStorage> &for_each_element(Fn &&f) const {
        for_each_element_impl(f);
        return *this;
    }

    template<typename Fn>
    HALIDE_ALWAYS_INLINE
        Buffer<T, Dims, InClassDimStorage> &
        for_each_element(Fn &&f) {
        for_each_element_impl(f);
        return *this;
    }
    // @}

private:
    template<typename Fn>
    struct FillHelper {
        Fn f;
        Buffer<T, Dims, InClassDimStorage> *buf;

        template<typename... Args,
                 typename = decltype(std::declval<Fn>()(std::declval<Args>()...))>
        void operator()(Args... args) {
            (*buf)(args...) = f(args...);
        }

        FillHelper(Fn &&f, Buffer<T, Dims, InClassDimStorage> *buf)
            : f(std::forward<Fn>(f)), buf(buf) {
        }
    };

public:
    /** Fill a buffer by evaluating a callable at every site. The
     * callable should look much like a callable passed to
     * for_each_element, but it should return the value that should be
     * stored to the coordinate corresponding to the arguments. */
    template<typename Fn,
             typename = std::enable_if_t<!std::is_arithmetic_v<std::decay_t<Fn>>>>
    Buffer<T, Dims, InClassDimStorage> &fill(Fn &&f) {
        // We'll go via for_each_element. We need a variadic wrapper lambda.
        FillHelper<Fn> wrapper(std::forward<Fn>(f), this);
        return for_each_element(wrapper);
    }

    /** Check if an input buffer passed extern stage is a querying
     * bounds. Compared to doing the host pointer check directly,
     * this both adds clarity to code and will facilitate moving to
     * another representation for bounds query arguments. */
    bool is_bounds_query() const {
        return buf.is_bounds_query();
    }

    /** Convenient check to verify that all of the interesting bytes in the Buffer
     * are initialized under MSAN. Note that by default, we use for_each_value() here so that
     * we skip any unused padding that isn't part of the Buffer; this isn't efficient,
     * but in MSAN mode, it doesn't matter. (Pass true for the flag to force check
     * the entire Buffer storage.) */
    void msan_check_mem_is_initialized(bool entire = false) const {
#if defined(__has_feature)
#if __has_feature(memory_sanitizer)
        if (entire) {
            __msan_check_mem_is_initialized(data(), size_in_bytes());
        } else {
            for_each_value([](T &v) { __msan_check_mem_is_initialized(&v, sizeof(T)); ; });
        }
#endif
#endif
    }
};

}  // namespace Runtime
}  // namespace Halide

#undef HALIDE_ALLOCA

#endif  // HALIDE_RUNTIME_IMAGE_H

namespace Halide {

constexpr int AnyDims = Halide::Runtime::AnyDims;  // -1

template<typename T = void, int Dims = AnyDims>
class Buffer;

struct JITUserContext;

namespace Internal {

struct BufferContents {
    mutable RefCount ref_count;
    std::string name;
    Runtime::Buffer<> buf;
};

Expr buffer_accessor(const Buffer<> &buf, const std::vector<Expr> &args);

template<typename... Args>
struct all_ints_and_optional_name : std::false_type {};

template<typename First, typename... Rest>
struct all_ints_and_optional_name<First, Rest...> : meta_and<std::is_convertible<First, int>,
                                                             all_ints_and_optional_name<Rest...>> {};

template<typename T>
struct all_ints_and_optional_name<T> : meta_or<std::is_convertible<T, std::string>,
                                               std::is_convertible<T, int>> {};

template<>
struct all_ints_and_optional_name<> : std::true_type {};

template<typename T,
         typename = std::enable_if_t<!std::is_convertible_v<T, std::string>>>
std::string get_name_from_end_of_parameter_pack(T &&) {
    return "";
}

inline std::string get_name_from_end_of_parameter_pack(const std::string &n) {
    return n;
}

inline std::string get_name_from_end_of_parameter_pack() {
    return "";
}

template<typename First,
         typename Second,
         typename... Args>
std::string get_name_from_end_of_parameter_pack(First first, Second second, Args &&...rest) {
    return get_name_from_end_of_parameter_pack(second, std::forward<Args>(rest)...);
}

inline void get_shape_from_start_of_parameter_pack_helper(std::vector<int> &, const std::string &) {
}

inline void get_shape_from_start_of_parameter_pack_helper(std::vector<int> &) {
}

template<typename... Args>
void get_shape_from_start_of_parameter_pack_helper(std::vector<int> &result, int x, Args &&...rest) {
    result.push_back(x);
    get_shape_from_start_of_parameter_pack_helper(result, std::forward<Args>(rest)...);
}

template<typename... Args>
std::vector<int> get_shape_from_start_of_parameter_pack(Args &&...args) {
    std::vector<int> result;
    get_shape_from_start_of_parameter_pack_helper(result, std::forward<Args>(args)...);
    return result;
}

template<typename T, typename T2>
using add_const_if_T_is_const = std::conditional_t<std::is_const_v<T>, const T2, T2>;

// Helpers to produce the name of a Buffer element type (a Halide
// scalar type, or void, possibly with const). Useful for an error
// messages.
template<typename T>
void buffer_type_name_non_const(std::ostream &s) {
    s << type_to_c_type(type_of<T>(), false);
}

template<>
inline void buffer_type_name_non_const<void>(std::ostream &s) {
    s << "void";
}

template<typename T>
std::string buffer_type_name() {
    std::ostringstream oss;
    if (std::is_const_v<T>) {
        oss << "const ";
    }
    buffer_type_name_non_const<std::remove_const_t<T>>(oss);
    return oss.str();
}

}  // namespace Internal

/** A Halide::Buffer is a named shared reference to a
 * Halide::Runtime::Buffer.
 *
 * A Buffer<T1, D> can refer to a Buffer<T2, D> if T1 is const whenever T2
 * is const, and either T1 = T2 or T1 is void. A Buffer<void, D> can
 * refer to any Buffer of any non-const type, and the default
 * template parameter is T = void.
 *
 * A Buffer<T, D1> can refer to a Buffer<T, D2> if D1 == D2,
 * or if D1 is AnyDims (meaning "dimensionality is checked at runtime, not compiletime").
 */
template<typename T, int Dims>
class Buffer {
    Internal::IntrusivePtr<Internal::BufferContents> contents;

    template<typename T2, int D2>
    friend class Buffer;

    template<typename T2, int D2>
    static void assert_can_convert_from(const Buffer<T2, D2> &other) {
        if (!other.defined()) {
            // Avoid UB of deferencing offset of a null contents ptr
            static_assert(!std::is_const_v<T2> || std::is_const_v<T>,
                          "Can't convert from a Buffer<const T> to a Buffer<T>");
            static_assert(std::is_same_v<std::remove_const_t<T>, std::remove_const_t<T2>> ||
                              std::is_void_v<T> ||
                              std::is_void_v<T2>,
                          "type mismatch constructing Buffer");
            static_assert(Dims == AnyDims || D2 == AnyDims || Dims == D2,
                          "Can't convert from a Buffer with static dimensionality to a Buffer with different static dimensionality");
        } else {
            // Don't delegate to
            // Runtime::Buffer<T>::assert_can_convert_from. It might
            // not assert if NDEBUG is defined. user_assert is
            // friendlier anyway because it reports line numbers when
            // debugging symbols are found, it throws an exception
            // when exceptions are enabled, and we can print the
            // actual types in question.
            using BufType = Runtime::Buffer<T, Dims>;  // alias because commas in user_assert() macro confuses compiler
            user_assert(BufType::can_convert_from(*(other.get())))
                << "Type mismatch constructing Buffer. Can't construct Buffer<"
                << Internal::buffer_type_name<T>() << ", " << Dims << "> from Buffer<"
                << type_to_c_type(other.type(), false) << ", " << D2 << ">, dimensions() = " << other.dimensions() << "\n";
        }
    }

public:
    static constexpr int AnyDims = Halide::AnyDims;
    static_assert(Dims == AnyDims || Dims >= 0);

    typedef T ElemType;

    // This class isn't final (and is subclassed from the Python binding
    // code, at least) so it needs a virtual dtor.
    virtual ~Buffer() = default;

    /** Make a null Buffer, which points to no Runtime::Buffer */
    Buffer() = default;

    /** Trivial copy constructor. */
    Buffer(const Buffer &that) = default;

    /** Trivial copy assignment operator. */
    Buffer &operator=(const Buffer &that) = default;

    /** Trivial move assignment operator. */
    Buffer &operator=(Buffer &&) noexcept = default;

    /** Make a Buffer from a Buffer of a different type */
    template<typename T2, int D2>
    Buffer(const Buffer<T2, D2> &other)
        : contents(other.contents) {
        assert_can_convert_from(other);
    }

    /** Move construct from a Buffer of a different type */
    template<typename T2, int D2>
    Buffer(Buffer<T2, D2> &&other) noexcept {
        assert_can_convert_from(other);
        contents = std::move(other.contents);
    }

    /** Construct a Buffer that captures and owns an rvalue Runtime::Buffer */
    template<int D2>
    Buffer(Runtime::Buffer<T, D2> &&buf, const std::string &name = "")
        : contents(new Internal::BufferContents) {
        contents->buf = std::move(buf);
        if (name.empty()) {
            contents->name = Internal::unique_name('b');
        } else {
            contents->name = name;
        }
    }

    /** Constructors that match Runtime::Buffer with two differences:
     * 1) They take a Type instead of a halide_type_t
     * 2) There is an optional last string argument that gives the buffer a specific name
     */
    // @{
    template<typename... Args,
             typename = std::enable_if_t<Internal::all_ints_and_optional_name<Args...>::value>>
    explicit Buffer(Type t,
                    int first, Args... rest)
        : Buffer(Runtime::Buffer<T, Dims>(t, Internal::get_shape_from_start_of_parameter_pack(first, rest...)),
                 Internal::get_name_from_end_of_parameter_pack(rest...)) {
    }

    explicit Buffer(const halide_buffer_t &buf,
                    const std::string &name = "")
        : Buffer(Runtime::Buffer<T, Dims>(buf), name) {
    }

    template<typename... Args,
             typename = std::enable_if_t<Internal::all_ints_and_optional_name<Args...>::value>>
    explicit Buffer(int first, const Args &...rest)
        : Buffer(Runtime::Buffer<T, Dims>(Internal::get_shape_from_start_of_parameter_pack(first, rest...)),
                 Internal::get_name_from_end_of_parameter_pack(rest...)) {
    }

    explicit Buffer(Type t,
                    const std::vector<int> &sizes,
                    const std::string &name = "")
        : Buffer(Runtime::Buffer<T, Dims>(t, sizes), name) {
    }

    explicit Buffer(Type t,
                    const std::vector<int> &sizes,
                    const std::vector<int> &storage_order,
                    const std::string &name = "")
        : Buffer(Runtime::Buffer<T, Dims>(t, sizes, storage_order), name) {
    }

    explicit Buffer(const std::vector<int> &sizes,
                    const std::string &name = "")
        : Buffer(Runtime::Buffer<T, Dims>(sizes), name) {
    }

    explicit Buffer(const std::vector<int> &sizes,
                    const std::vector<int> &storage_order,
                    const std::string &name = "")
        : Buffer(Runtime::Buffer<T, Dims>(sizes, storage_order), name) {
    }

    template<typename Array, size_t N>
    explicit Buffer(Array (&vals)[N],
                    const std::string &name = "")
        : Buffer(Runtime::Buffer<T, Dims>(vals), name) {
    }

    template<typename... Args,
             typename = std::enable_if_t<Internal::all_ints_and_optional_name<Args...>::value>>
    explicit Buffer(Type t,
                    Internal::add_const_if_T_is_const<T, void> *data,
                    int first, Args &&...rest)
        : Buffer(Runtime::Buffer<T, Dims>(t, data, Internal::get_shape_from_start_of_parameter_pack(first, rest...)),
                 Internal::get_name_from_end_of_parameter_pack(rest...)) {
    }

    template<typename... Args,
             typename = std::enable_if_t<Internal::all_ints_and_optional_name<Args...>::value>>
    explicit Buffer(Type t,
                    Internal::add_const_if_T_is_const<T, void> *data,
                    const std::vector<int> &sizes,
                    const std::string &name = "")
        : Buffer(Runtime::Buffer<T, Dims>(t, data, sizes, name)) {
    }

    template<typename... Args,
             typename = std::enable_if_t<Internal::all_ints_and_optional_name<Args...>::value>>
    explicit Buffer(T *data,
                    int first, Args &&...rest)
        : Buffer(Runtime::Buffer<T, Dims>(data, Internal::get_shape_from_start_of_parameter_pack(first, rest...)),
                 Internal::get_name_from_end_of_parameter_pack(rest...)) {
    }

    explicit Buffer(T *data,
                    const std::vector<int> &sizes,
                    const std::string &name = "")
        : Buffer(Runtime::Buffer<T, Dims>(data, sizes), name) {
    }

    explicit Buffer(Type t,
                    Internal::add_const_if_T_is_const<T, void> *data,
                    const std::vector<int> &sizes,
                    const std::string &name = "")
        : Buffer(Runtime::Buffer<T, Dims>(t, data, sizes), name) {
    }

    explicit Buffer(Type t,
                    Internal::add_const_if_T_is_const<T, void> *data,
                    int d,
                    const halide_dimension_t *shape,
                    const std::string &name = "")
        : Buffer(Runtime::Buffer<T, Dims>(t, data, d, shape), name) {
    }

    explicit Buffer(T *data,
                    int d,
                    const halide_dimension_t *shape,
                    const std::string &name = "")
        : Buffer(Runtime::Buffer<T, Dims>(data, d, shape), name) {
    }

    static Buffer<T, Dims> make_scalar(const std::string &name = "") {
        return Buffer<T, Dims>(Runtime::Buffer<T, Dims>::make_scalar(), name);
    }

    static Buffer<> make_scalar(Type t, const std::string &name = "") {
        return Buffer<>(Runtime::Buffer<>::make_scalar(t), name);
    }

    static Buffer<T, Dims> make_scalar(T *data, const std::string &name = "") {
        return Buffer<T, Dims>(Runtime::Buffer<T, Dims>::make_scalar(data), name);
    }

    static Buffer<T, Dims> make_interleaved(int width, int height, int channels, const std::string &name = "") {
        return Buffer<T, Dims>(Runtime::Buffer<T, Dims>::make_interleaved(width, height, channels), name);
    }

    static Buffer<> make_interleaved(Type t, int width, int height, int channels, const std::string &name = "") {
        return Buffer<>(Runtime::Buffer<>::make_interleaved(t, width, height, channels), name);
    }

    static Buffer<T, Dims> make_interleaved(T *data, int width, int height, int channels, const std::string &name = "") {
        return Buffer<T, Dims>(Runtime::Buffer<T, Dims>::make_interleaved(data, width, height, channels), name);
    }

    static Buffer<Internal::add_const_if_T_is_const<T, void>>
    make_interleaved(Type t, T *data, int width, int height, int channels, const std::string &name = "") {
        using T2 = Internal::add_const_if_T_is_const<T, void>;
        return Buffer<T2, Dims>(Runtime::Buffer<T2, Dims>::make_interleaved(t, data, width, height, channels), name);
    }

    template<typename T2, int D2>
    static Buffer<T, Dims> make_with_shape_of(Buffer<T2, D2> src,
                                              void *(*allocate_fn)(size_t) = nullptr,
                                              void (*deallocate_fn)(void *) = nullptr,
                                              const std::string &name = "") {
        return Buffer<T, Dims>(Runtime::Buffer<T, Dims>::make_with_shape_of(*src.get(), allocate_fn, deallocate_fn), name);
    }

    template<typename T2, int D2>
    static Buffer<T, Dims> make_with_shape_of(const Runtime::Buffer<T2, D2> &src,
                                              void *(*allocate_fn)(size_t) = nullptr,
                                              void (*deallocate_fn)(void *) = nullptr,
                                              const std::string &name = "") {
        return Buffer<T, Dims>(Runtime::Buffer<T, Dims>::make_with_shape_of(src, allocate_fn, deallocate_fn), name);
    }
    // @}

    /** Buffers are optionally named. */
    // @{
    void set_name(const std::string &n) {
        contents->name = n;
    }

    const std::string &name() const {
        return contents->name;
    }
    // @}

    /** Check if two Buffer objects point to the same underlying Buffer */
    template<typename T2, int D2>
    bool same_as(const Buffer<T2, D2> &other) const {
        return (const void *)(contents.get()) == (const void *)(other.contents.get());
    }

    /** Check if this Buffer refers to an existing
     * Buffer. Default-constructed Buffer objects do not refer to any
     * existing Buffer. */
    bool defined() const {
        return contents.defined();
    }

    /** Get a pointer to the underlying Runtime::Buffer */
    // @{
    Runtime::Buffer<T, Dims> *get() {
        // It's already type-checked, so no need to use as<T>.
        return (Runtime::Buffer<T, Dims> *)(&contents->buf);
    }
    const Runtime::Buffer<T, Dims> *get() const {
        return (const Runtime::Buffer<T, Dims> *)(&contents->buf);
    }
    // @}

    // We forward numerous methods from the underlying Buffer
#define HALIDE_BUFFER_FORWARD_CONST(method)                                                                                             \
    template<typename... Args>                                                                                                          \
    auto method(Args &&...args) const -> decltype(std::declval<const Runtime::Buffer<T, Dims>>().method(std::forward<Args>(args)...)) { \
        user_assert(defined()) << "Undefined buffer calling const method " #method "\n";                                                \
        return get()->method(std::forward<Args>(args)...);                                                                              \
    }

#define HALIDE_BUFFER_FORWARD(method)                                                                                       \
    template<typename... Args>                                                                                              \
    auto method(Args &&...args) -> decltype(std::declval<Runtime::Buffer<T, Dims>>().method(std::forward<Args>(args)...)) { \
        user_assert(defined()) << "Undefined buffer calling method " #method "\n";                                          \
        return get()->method(std::forward<Args>(args)...);                                                                  \
    }

// This is a weird-looking but effective workaround for a deficiency in "perfect forwarding":
// namely, it can't really handle initializer-lists. The idea here is that we declare
// the expected type to be passed on, and that allows the compiler to handle it.
// The weirdness comes in with the variadic macro: the problem is that the type
// we want to forward might be something like `std::vector<std::pair<int, int>>`,
// which contains a comma, which throws a big wrench in C++ macro system.
// However... since all we really need to do is capture the remainder of the macro,
// and forward it as is, we can just use ... to allow an arbitrary number of commas,
// then use __VA_ARGS__ to forward the mess as-is, and while it looks horrible, it
// works.
#define HALIDE_BUFFER_FORWARD_INITIALIZER_LIST(method, ...)                                                    \
    inline auto method(const __VA_ARGS__ &a) -> decltype(std::declval<Runtime::Buffer<T, Dims>>().method(a)) { \
        user_assert(defined()) << "Undefined buffer calling method " #method "\n";                             \
        return get()->method(a);                                                                               \
    }

    /** Does the same thing as the equivalent Halide::Runtime::Buffer method */
    // @{
    HALIDE_BUFFER_FORWARD(raw_buffer)
    HALIDE_BUFFER_FORWARD_CONST(raw_buffer)
    HALIDE_BUFFER_FORWARD_CONST(dimensions)
    HALIDE_BUFFER_FORWARD_CONST(dim)
    HALIDE_BUFFER_FORWARD_CONST(width)
    HALIDE_BUFFER_FORWARD_CONST(height)
    HALIDE_BUFFER_FORWARD_CONST(channels)
    HALIDE_BUFFER_FORWARD_CONST(min)
    HALIDE_BUFFER_FORWARD_CONST(extent)
    HALIDE_BUFFER_FORWARD_CONST(stride)
    HALIDE_BUFFER_FORWARD_CONST(left)
    HALIDE_BUFFER_FORWARD_CONST(right)
    HALIDE_BUFFER_FORWARD_CONST(top)
    HALIDE_BUFFER_FORWARD_CONST(bottom)
    HALIDE_BUFFER_FORWARD_CONST(number_of_elements)
    HALIDE_BUFFER_FORWARD_CONST(size_in_bytes)
    HALIDE_BUFFER_FORWARD_CONST(begin)
    HALIDE_BUFFER_FORWARD_CONST(end)
    HALIDE_BUFFER_FORWARD(data)
    HALIDE_BUFFER_FORWARD_CONST(data)
    HALIDE_BUFFER_FORWARD_CONST(contains)
    HALIDE_BUFFER_FORWARD(crop)
    HALIDE_BUFFER_FORWARD_INITIALIZER_LIST(crop, std::vector<std::pair<int, int>>)
    HALIDE_BUFFER_FORWARD_CONST(cropped)
    HALIDE_BUFFER_FORWARD(slice)
    HALIDE_BUFFER_FORWARD_CONST(sliced)
    HALIDE_BUFFER_FORWARD(embed)
    HALIDE_BUFFER_FORWARD_CONST(embedded)
    HALIDE_BUFFER_FORWARD(set_min)
    HALIDE_BUFFER_FORWARD(translate)
    HALIDE_BUFFER_FORWARD_INITIALIZER_LIST(translate, std::vector<int>)
    HALIDE_BUFFER_FORWARD_CONST(translated)
    HALIDE_BUFFER_FORWARD(transpose)
    HALIDE_BUFFER_FORWARD_CONST(transposed)
    HALIDE_BUFFER_FORWARD(add_dimension)
    HALIDE_BUFFER_FORWARD(copy_to_host)
    HALIDE_BUFFER_FORWARD(copy_to_device)
    HALIDE_BUFFER_FORWARD_CONST(has_device_allocation)
    HALIDE_BUFFER_FORWARD_CONST(host_dirty)
    HALIDE_BUFFER_FORWARD_CONST(device_dirty)
    HALIDE_BUFFER_FORWARD(set_host_dirty)
    HALIDE_BUFFER_FORWARD(set_device_dirty)
    HALIDE_BUFFER_FORWARD(device_sync)
    HALIDE_BUFFER_FORWARD(device_malloc)
    HALIDE_BUFFER_FORWARD(device_wrap_native)
    HALIDE_BUFFER_FORWARD(device_detach_native)
    HALIDE_BUFFER_FORWARD(allocate)
    HALIDE_BUFFER_FORWARD(deallocate)
    HALIDE_BUFFER_FORWARD(device_deallocate)
    HALIDE_BUFFER_FORWARD(device_free)
    HALIDE_BUFFER_FORWARD_CONST(all_equal)

#undef HALIDE_BUFFER_FORWARD
#undef HALIDE_BUFFER_FORWARD_CONST

    template<typename Fn, typename... Args>
    Buffer<T, Dims> &for_each_value(Fn &&f, Args... other_buffers) {
        get()->for_each_value(std::forward<Fn>(f), (*std::forward<Args>(other_buffers).get())...);
        return *this;
    }

    template<typename Fn, typename... Args>
    const Buffer<T, Dims> &for_each_value(Fn &&f, Args... other_buffers) const {
        get()->for_each_value(std::forward<Fn>(f), (*std::forward<Args>(other_buffers).get())...);
        return *this;
    }

    template<typename Fn>
    Buffer<T, Dims> &for_each_element(Fn &&f) {
        get()->for_each_element(std::forward<Fn>(f));
        return *this;
    }

    template<typename Fn>
    const Buffer<T, Dims> &for_each_element(Fn &&f) const {
        get()->for_each_element(std::forward<Fn>(f));
        return *this;
    }

    template<typename FnOrValue>
    Buffer<T, Dims> &fill(FnOrValue &&f) {
        get()->fill(std::forward<FnOrValue>(f));
        return *this;
    }

    static constexpr bool has_static_halide_type = Runtime::Buffer<T, Dims>::has_static_halide_type;

    static constexpr halide_type_t static_halide_type() {
        return Runtime::Buffer<T, Dims>::static_halide_type();
    }

    static constexpr bool has_static_dimensions = Runtime::Buffer<T, Dims>::has_static_dimensions;

    static constexpr int static_dimensions() {
        return Runtime::Buffer<T, Dims>::static_dimensions();
    }

    template<typename T2, int D2>
    static bool can_convert_from(const Buffer<T2, D2> &other) {
        return Halide::Runtime::Buffer<T, Dims>::can_convert_from(*other.get());
    }

    // Note that since Runtime::Buffer stores halide_type_t rather than Halide::Type,
    // there is no handle-specific type information, so all handle types are
    // considered equivalent to void* here. (This only matters if you are making
    // a Buffer-of-handles, which is not really a real use case...)
    Type type() const {
        return contents->buf.type();
    }

    template<typename T2, int D2 = Dims>
    Buffer<T2, D2> as() const {
        return Buffer<T2, D2>(*this);
    }

    Buffer<T, Dims> copy() const {
        return Buffer<T, Dims>(std::move(contents->buf.as<T, Dims>().copy()));
    }

    template<typename T2, int D2>
    void copy_from(const Buffer<T2, D2> &other) {
        contents->buf.copy_from(*other.get());
    }

    template<typename... Args>
    auto operator()(int first, Args &&...args) -> decltype(std::declval<Runtime::Buffer<T, Dims>>()(first, std::forward<Args>(args)...)) {
        return (*get())(first, std::forward<Args>(args)...);
    }

    template<typename... Args>
    auto operator()(int first, Args &&...args) const -> decltype(std::declval<const Runtime::Buffer<T, Dims>>()(first, std::forward<Args>(args)...)) {
        return (*get())(first, std::forward<Args>(args)...);
    }

    auto operator()(const int *pos) -> decltype(std::declval<Runtime::Buffer<T, Dims>>()(pos)) {
        return (*get())(pos);
    }

    auto operator()(const int *pos) const -> decltype(std::declval<const Runtime::Buffer<T, Dims>>()(pos)) {
        return (*get())(pos);
    }

    auto operator()() -> decltype(std::declval<Runtime::Buffer<T, Dims>>()()) {
        return (*get())();
    }

    auto operator()() const -> decltype(std::declval<const Runtime::Buffer<T, Dims>>()()) {
        return (*get())();
    }
    // @}

    /** Make an Expr that loads from this concrete buffer at a computed
     * coordinate. Returned Expr is const so that it's not possible to
     * accidentally treat a buffer like a Func and try to assign an Expr to a
     * given symbolic coordinate. */
    // @{
    template<typename... Args>
    const Expr operator()(const Expr &first, const Args &...rest) const {  // NOLINT(readability-const-return-type)
        std::vector<Expr> args = {first, rest...};
        return (*this)(args);
    }

    template<typename... Args>
    const Expr operator()(const std::vector<Expr> &args) const {  // NOLINT(readability-const-return-type)
        return buffer_accessor(Buffer<>(*this), args);
    }
    // @}

    /** Copy to the GPU, using the device API that is the default for the given Target. */
    int copy_to_device(const Target &t = get_jit_target_from_environment(), JITUserContext *context = nullptr) {
        return copy_to_device(DeviceAPI::Default_GPU, t, context);
    }

    /** Copy to the GPU, using the given device API */
    int copy_to_device(const DeviceAPI &d, const Target &t = get_jit_target_from_environment(), JITUserContext *context = nullptr) {
        return contents->buf.copy_to_device(get_device_interface_for_device_api(d, t, "Buffer::copy_to_device"), context);
    }

    /** Allocate on the GPU, using the device API that is the default for the given Target. */
    int device_malloc(const Target &t = get_jit_target_from_environment(), JITUserContext *context = nullptr) {
        return device_malloc(DeviceAPI::Default_GPU, t, context);
    }

    /** Allocate storage on the GPU, using the given device API */
    int device_malloc(const DeviceAPI &d, const Target &t = get_jit_target_from_environment(), JITUserContext *context = nullptr) {
        return contents->buf.device_malloc(get_device_interface_for_device_api(d, t, "Buffer::device_malloc"), context);
    }

    /** Wrap a native handle, using the given device API.
     * It is a bad idea to pass DeviceAPI::Default_GPU to this routine
     * as the handle argument must match the API that the default
     * resolves to and it is clearer and more reliable to pass the
     * resolved DeviceAPI explicitly. */
    int device_wrap_native(const DeviceAPI &d, uint64_t handle, const Target &t = get_jit_target_from_environment(), JITUserContext *context = nullptr) {
        return contents->buf.device_wrap_native(get_device_interface_for_device_api(d, t, "Buffer::device_wrap_native"), handle, context);
    }
};

}  // namespace Halide

#endif
#ifndef HALIDE_JIT_MODULE_H
#define HALIDE_JIT_MODULE_H

/** \file
 * Defines the struct representing lifetime and dependencies of
 * a JIT compiled halide pipeline
 */

#include <map>
#include <memory>
#include <vector>

#ifndef HALIDE_WASM_EXECUTOR_H
#define HALIDE_WASM_EXECUTOR_H

/** \file
 *
 * Support for running Halide-compiled Wasm code in-process.
 * Bindings for parameters, extern calls, etc. are established and the
 * Wasm code is executed. Allows calls to realize to work
 * exactly as if native code had been run, but via a JavaScript/Wasm VM.
 * Currently, only the WABT interpreter is supported.
 */

#ifndef HALIDE_ARGUMENT_H
#define HALIDE_ARGUMENT_H

/** \file
 * Defines a type used for expressing the type signature of a
 * generated halide pipeline
 */


namespace Halide {

template<typename T, int Dims>
class Buffer;

struct ArgumentEstimates {
    /** If this is a scalar argument, then these are its default, min, max, and estimated values.
     * For buffer arguments, all should be undefined. */
    Expr scalar_def, scalar_min, scalar_max, scalar_estimate;

    /** If this is a buffer argument, these are the estimated min and
     * extent for each dimension.  If there are no estimates,
     * buffer_estimates.size() can be zero; otherwise, it must always
     * equal the dimensions */
    Region buffer_estimates;

    bool operator==(const ArgumentEstimates &rhs) const;
};

/**
 * A struct representing an argument to a halide-generated
 * function. Used for specifying the function signature of
 * generated code.
 */
struct Argument {
    /** The name of the argument */
    std::string name;

    /** An argument is either a primitive type (for parameters), or a
     * buffer pointer.
     *
     * If kind == InputScalar, then type fully encodes the expected type
     * of the scalar argument.
     *
     * If kind == InputBuffer|OutputBuffer, then type.bytes() should be used
     * to determine* elem_size of the buffer; additionally, type.code *should*
     * reflect the expected interpretation of the buffer data (e.g. float vs int),
     * but there is no runtime enforcement of this at present.
     */
    enum Kind {
        InputScalar = halide_argument_kind_input_scalar,
        InputBuffer = halide_argument_kind_input_buffer,
        OutputBuffer = halide_argument_kind_output_buffer
    };
    Kind kind = InputScalar;

    /** If kind == InputBuffer|OutputBuffer, this is the dimensionality of the buffer.
     * If kind == InputScalar, this value is ignored (and should always be set to zero) */
    uint8_t dimensions = 0;

    /** If this is a scalar parameter, then this is its type.
     *
     * If this is a buffer parameter, this this is the type of its
     * elements.
     *
     * Note that type.lanes should always be 1 here. */
    Type type;

    /* The estimates (if any) and default/min/max values (if any) for this Argument. */
    ArgumentEstimates argument_estimates;

    Argument() = default;
    Argument(const std::string &_name, Kind _kind, const Type &_type, int _dimensions,
             const ArgumentEstimates &argument_estimates);

    // Not explicit, so that you can put Buffer in an argument list,
    // to indicate that it shouldn't be baked into the object file,
    // but instead received as an argument at runtime
    template<typename T, int Dims>
    Argument(const Buffer<T, Dims> &im)
        : name(im.name()),
          kind(InputBuffer),
          dimensions(im.dimensions()),
          type(im.type()) {
    }

    bool is_buffer() const {
        return kind == InputBuffer || kind == OutputBuffer;
    }
    bool is_scalar() const {
        return kind == InputScalar;
    }

    bool is_input() const {
        return kind == InputScalar || kind == InputBuffer;
    }
    bool is_output() const {
        return kind == OutputBuffer;
    }

    bool operator==(const Argument &rhs) const {
        return name == rhs.name &&
               kind == rhs.kind &&
               dimensions == rhs.dimensions &&
               type == rhs.type &&
               argument_estimates == rhs.argument_estimates;
    }
};

}  // namespace Halide

#endif
#ifndef HALIDE_PARAMETER_H
#define HALIDE_PARAMETER_H

/** \file
 * Defines the internal representation of parameters to halide piplines
 */
#include <optional>
#include <string>


namespace Halide {

struct ArgumentEstimates;
struct Expr;
struct Type;
enum class MemoryType;

struct BufferConstraint {
    Expr min, extent, stride;
    Expr min_estimate, extent_estimate;
};

namespace Internal {

#ifdef WITH_SERIALIZATION
class Deserializer;
class Serializer;
#endif
struct ParameterContents;

}  // namespace Internal

/** A reference-counted handle to a parameter to a halide
 * pipeline. May be a scalar parameter or a buffer */
class Parameter {
    void check_defined() const;
    void check_is_buffer() const;
    void check_is_scalar() const;
    void check_dim_ok(int dim) const;
    void check_type(const Type &t) const;

protected:
    Internal::IntrusivePtr<Internal::ParameterContents> contents;

#ifdef WITH_SERIALIZATION
    friend class Internal::Deserializer;  //< for scalar_data()
    friend class Internal::Serializer;    //< for scalar_data()
#endif
    friend class Pipeline;  //< for read_only_scalar_address()

    /** Get the raw currently-bound buffer. null if unbound */
    const halide_buffer_t *raw_buffer() const;

    /** Get the pointer to the current value of the scalar
     * parameter. For a given parameter, this address will never
     * change. Note that this can only be used to *read* from -- it must
     * not be written to, so don't cast away the constness. Only relevant when jitting. */
    const void *read_only_scalar_address() const;

    /** If the Parameter is a scalar, and the scalar data is valid, return
     * the scalar data. Otherwise, return nullopt. */
    std::optional<halide_scalar_value_t> scalar_data() const;

    /** If the Parameter is a scalar and has a valid scalar value, return it.
     * Otherwise, assert-fail. */
    halide_scalar_value_t scalar_data_checked() const;

    /** If the Parameter is a scalar *of the given type* and has a valid scalar value, return it.
     * Otherwise, assert-fail. */
    halide_scalar_value_t scalar_data_checked(const Type &val_type) const;

    /** Construct a new buffer parameter via deserialization. */
    Parameter(const Type &t, int dimensions, const std::string &name,
              const Buffer<void> &buffer, int host_alignment, const std::vector<BufferConstraint> &buffer_constraints,
              MemoryType memory_type);

    /** Construct a new scalar parameter via deserialization. */
    Parameter(const Type &t, int dimensions, const std::string &name,
              const std::optional<halide_scalar_value_t> &scalar_data, const Expr &scalar_default, const Expr &scalar_min,
              const Expr &scalar_max, const Expr &scalar_estimate);

public:
    /** Construct a new undefined handle */
    Parameter() = default;

    /** Construct a new parameter of the given type. If the second
     * argument is true, this is a buffer parameter of the given
     * dimensionality, otherwise, it is a scalar parameter (and the
     * dimensionality should be zero). The parameter will be given a
     * unique auto-generated name. */
    Parameter(const Type &t, bool is_buffer, int dimensions);

    /** Construct a new parameter of the given type with name given by
     * the third argument. If the second argument is true, this is a
     * buffer parameter, otherwise, it is a scalar parameter. The
     * third argument gives the dimensionality of the buffer
     * parameter. It should be zero for scalar parameters. If the
     * fifth argument is true, the the name being passed in was
     * explicitly specified (as opposed to autogenerated). */
    Parameter(const Type &t, bool is_buffer, int dimensions, const std::string &name);

    Parameter(const Parameter &) = default;
    Parameter &operator=(const Parameter &) = default;
    Parameter(Parameter &&) = default;
    Parameter &operator=(Parameter &&) = default;

    /** Get the type of this parameter */
    Type type() const;

    /** Get the dimensionality of this parameter. Zero for scalars. */
    int dimensions() const;

    /** Get the name of this parameter */
    const std::string &name() const;

    /** Does this parameter refer to a buffer/image? */
    bool is_buffer() const;

    /** If the parameter is a scalar parameter, get its currently
     * bound value. Only relevant when jitting */
    template<typename T>
    HALIDE_NO_USER_CODE_INLINE T scalar() const {
        static_assert(sizeof(T) <= sizeof(halide_scalar_value_t));
        const auto sv = scalar_data_checked(type_of<T>());
        T t;
        memcpy(&t, &sv.u.u64, sizeof(t));
        return t;
    }

    /** This returns the current value of scalar<type()>() as an Expr.
     * If the Parameter is not scalar, or its scalar data is not valid, this will assert-fail. */
    Expr scalar_expr() const;

    /** This returns true if scalar_expr() would return a valid Expr,
     * false if not. */
    bool has_scalar_value() const;

    /** If the parameter is a scalar parameter, set its current
     * value. Only relevant when jitting */
    template<typename T>
    HALIDE_NO_USER_CODE_INLINE void set_scalar(T val) {
        halide_scalar_value_t sv;
        memcpy(&sv.u.u64, &val, sizeof(val));
        set_scalar(type_of<T>(), sv);
    }

    /** If the parameter is a scalar parameter, set its current
     * value. Only relevant when jitting */
    void set_scalar(const Type &val_type, halide_scalar_value_t val);

    /** If the parameter is a buffer parameter, get its currently
     * bound buffer. Only relevant when jitting */
    Buffer<void> buffer() const;

    /** If the parameter is a buffer parameter, set its current
     * value. Only relevant when jitting */
    void set_buffer(const Buffer<void> &b);

    /** Tests if this handle is the same as another handle */
    bool same_as(const Parameter &other) const;

    /** Tests if this handle is non-nullptr */
    bool defined() const;

    /** Get and set constraints for the min, extent, stride, and estimates on
     * the min/extent. */
    //@{
    void set_min_constraint(int dim, const Expr &e);
    void set_extent_constraint(int dim, const Expr &e);
    void set_stride_constraint(int dim, const Expr &e);
    void set_min_constraint_estimate(int dim, const Expr &min);
    void set_extent_constraint_estimate(int dim, const Expr &extent);
    void set_host_alignment(int bytes);
    Expr min_constraint(int dim) const;
    Expr extent_constraint(int dim) const;
    Expr stride_constraint(int dim) const;
    Expr min_constraint_estimate(int dim) const;
    Expr extent_constraint_estimate(int dim) const;
    int host_alignment() const;
    //@}

    /** Get buffer constraints for all dimensions,
     *  only relevant when serializing. */
    const std::vector<BufferConstraint> &buffer_constraints() const;

    /** Get and set constraints for scalar parameters. These are used
     * directly by Param, so they must be exported. */
    // @{
    void set_min_value(const Expr &e);
    Expr min_value() const;
    void set_max_value(const Expr &e);
    Expr max_value() const;
    void set_estimate(Expr e);
    Expr estimate() const;
    // @}

    /** Get and set the default values for scalar parameters. At present, these
     * are used only to emit the default values in the metadata. There isn't
     * yet a public API in Param<> for them (this is used internally by the
     * Generator code). */
    // @{
    void set_default_value(const Expr &e);
    Expr default_value() const;
    // @}

    /** Order Parameters by their IntrusivePtr so they can be used
     * to index maps. */
    bool operator<(const Parameter &other) const {
        return contents < other.contents;
    }

    /** Get the ArgumentEstimates appropriate for this Parameter. */
    ArgumentEstimates get_argument_estimates() const;

    void store_in(MemoryType memory_type);
    MemoryType memory_type() const;

    void trace_loads();
    bool is_tracing_loads() const;

    void add_trace_tag(const std::string &trace_tag);
    std::vector<std::string> get_trace_tags() const;
};

namespace Internal {

/** Validate arguments to a call to a func, image or imageparam. */
void check_call_arg_types(const std::string &name, std::vector<Expr> *args, int dims);

}  // namespace Internal
}  // namespace Halide

#endif

#include <map>
#include <string>
#include <vector>

namespace Halide {

struct JITExtern;
struct Target;

namespace Internal {

struct JITModule;
struct WasmModuleContents;

/** Handle to compiled wasm code which can be called later. */
struct WasmModule {
    Internal::IntrusivePtr<WasmModuleContents> contents;

    /** If the given target can be executed via the wasm executor, return true. */
    static bool can_jit_target(const Target &target);

    /** Compile generated wasm code with a set of externs. */
    static WasmModule compile(
        const Module &module,
        const std::vector<Argument> &arguments,
        const std::string &fn_name,
        const std::map<std::string, JITExtern> &externs,
        const std::vector<JITModule> &extern_deps);

    /** Run generated previously compiled wasm code with a set of arguments. */
    int run(const void *const *args);
};

}  // namespace Internal
}  // namespace Halide

#endif  // HALIDE_WASM_EXECUTOR_H

namespace llvm {
class Module;
}

namespace Halide {

struct ExternCFunction;
struct JITExtern;
class Module;

struct JITUserContext;

/** A set of custom overrides of runtime functions. These only apply
 * when JIT-compiling code. If you are doing AOT compilation, see
 * HalideRuntime.h for instructions on how to replace runtime
 * functions. */
struct JITHandlers {
    /** Set the function called to print messages from the runtime. */
    void (*custom_print)(JITUserContext *, const char *){nullptr};

    /** A custom malloc and free for halide to use. Malloc should
     * return 32-byte aligned chunks of memory, and it should be safe
     * for Halide to read slightly out of bounds (up to 8 bytes before
     * the start or beyond the end). */
    // @{
    void *(*custom_malloc)(JITUserContext *, size_t){nullptr};
    void (*custom_free)(JITUserContext *, void *){nullptr};
    // @}

    /** A custom task handler to be called by the parallel for
     * loop. It is useful to set this if you want to do some
     * additional bookkeeping at the granularity of parallel
     * tasks. The default implementation does this:
     \code
     extern "C" int halide_do_task(JITUserContext *user_context,
                                   int (*f)(void *, int, uint8_t *),
                                   int idx, uint8_t *state) {
         return f(user_context, idx, state);
     }
     \endcode
     *
     * If you're trying to use a custom parallel runtime, you probably
     * don't want to call this. See instead custom_do_par_for.
    */
    int (*custom_do_task)(JITUserContext *, int (*)(JITUserContext *, int, uint8_t *), int, uint8_t *){nullptr};

    /** A custom parallel for loop launcher. Useful if your app
     * already manages a thread pool. The default implementation is
     * equivalent to this:
     \code
     extern "C" int halide_do_par_for(JITUserContext *user_context,
                                      int (*f)(void *, int, uint8_t *),
                                      int min, int extent, uint8_t *state) {
         int exit_status = 0;
         parallel for (int idx = min; idx < min+extent; idx++) {
             int job_status = halide_do_task(user_context, f, idx, state);
             if (job_status) exit_status = job_status;
         }
         return exit_status;
     }
     \endcode
     *
     * However, notwithstanding the above example code, if one task
     * fails, we may skip over other tasks, and if two tasks return
     * different error codes, we may select one arbitrarily to return.
     */
    int (*custom_do_par_for)(JITUserContext *, int (*)(JITUserContext *, int, uint8_t *), int, int, uint8_t *){nullptr};

    /** The error handler function that be called in the case of
     * runtime errors during halide pipelines. */
    void (*custom_error)(JITUserContext *, const char *){nullptr};

    /** A custom routine to call when tracing is enabled. Call this
     * on the output Func of your pipeline. This then sets custom
     * routines for the entire pipeline, not just calls to this
     * Func. */
    int32_t (*custom_trace)(JITUserContext *, const halide_trace_event_t *){nullptr};

    /** A method to use for Halide to resolve symbol names dynamically
     * in the calling process or library from within the Halide
     * runtime. Equivalent to dlsym with a null first argument. */
    void *(*custom_get_symbol)(const char *name){nullptr};

    /** A method to use for Halide to dynamically load libraries from
     * within the runtime. Equivalent to dlopen. Returns a handle to
     * the opened library. */
    void *(*custom_load_library)(const char *name){nullptr};

    /** A method to use for Halide to dynamically find a symbol within
     * an opened library. Equivalent to dlsym. Takes a handle
     * returned by custom_load_library as the first argument. */
    void *(*custom_get_library_symbol)(void *lib, const char *name){nullptr};

    /** A custom method for the Halide runtime acquire a cuda
     * context. The cuda context is treated as a void * to avoid a
     * dependence on the cuda headers. If the create argument is set
     * to true, a context should be created if one does not already
     * exist. */
    int32_t (*custom_cuda_acquire_context)(JITUserContext *user_context, void **cuda_context_ptr, bool create){nullptr};

    /** The Halide runtime calls this when it is done with a cuda
     * context. The default implementation does nothing. */
    int32_t (*custom_cuda_release_context)(JITUserContext *user_context){nullptr};

    /** A custom method for the Halide runtime to acquire a cuda
     * stream to use. The cuda context and stream are both modelled
     * as a void *, to avoid a dependence on the cuda headers. */
    int32_t (*custom_cuda_get_stream)(JITUserContext *user_context, void *cuda_context, void **stream_ptr){nullptr};
};

namespace Internal {
struct JITErrorBuffer;
}

/** A context to be passed to Pipeline::realize. Inherit from this to
 * pass your own custom context object. Modify the handlers field to
 * override runtime functions per-call to realize. */
struct JITUserContext {
    Internal::JITErrorBuffer *error_buffer{nullptr};
    JITHandlers handlers;
};

namespace Internal {

class JITModuleContents;
struct LoweredFunc;

struct JITModule {
    IntrusivePtr<JITModuleContents> jit_module;

    struct Symbol {
        void *address = nullptr;
        Symbol() = default;
        explicit Symbol(void *address)
            : address(address) {
        }
    };

    JITModule();
    JITModule(const Module &m, const LoweredFunc &fn,
              const std::vector<JITModule> &dependencies = std::vector<JITModule>());

    /** Take a list of JITExterns and generate trampoline functions
     * which can be called dynamically via a function pointer that
     * takes an array of void *'s for each argument and the return
     * value.
     */
    static JITModule make_trampolines_module(const Target &target,
                                             const std::map<std::string, JITExtern> &externs,
                                             const std::string &suffix,
                                             const std::vector<JITModule> &deps);

    /** The exports map of a JITModule contains all symbols which are
     * available to other JITModules which depend on this one. For
     * runtime modules, this is all of the symbols exported from the
     * runtime. For a JITted Func, it generally only contains the main
     * result Func of the compilation, which takes its name directly
     * from the Func declaration. One can also make a module which
     * contains no code itself but is just an exports maps providing
     * arbitrary pointers to functions or global variables to JITted
     * code. */
    const std::map<std::string, Symbol> &exports() const;

    /** A pointer to the raw halide function. Its true type depends
     * on the Argument vector passed to CodeGen_LLVM::compile. Image
     * parameters become (halide_buffer_t *), and scalar parameters become
     * pointers to the appropriate values. The final argument is a
     * pointer to the halide_buffer_t defining the output. This will be nullptr for
     * a JITModule which has not yet been compiled or one that is not
     * a Halide Func compilation at all. */
    void *main_function() const;

    /** Returns the Symbol structure for the routine documented in
     * main_function. Returning a Symbol allows access to the LLVM
     * type as well as the address. The address and type will be nullptr
     * if the module has not been compiled. */
    Symbol entrypoint_symbol() const;

    /** Returns the Symbol structure for the argv wrapper routine
     * corresponding to the entrypoint. The argv wrapper is callable
     * via an array of void * pointers to the arguments for the
     * call. Returning a Symbol allows access to the LLVM type as well
     * as the address. The address and type will be nullptr if the module
     * has not been compiled. */
    Symbol argv_entrypoint_symbol() const;

    /** A slightly more type-safe wrapper around the raw halide
     * module. Takes it arguments as an array of pointers that
     * correspond to the arguments to \ref main_function . This will
     * be nullptr for a JITModule which has not yet been compiled or one
     * that is not a Halide Func compilation at all. */
    // @{
    typedef int (*argv_wrapper)(const void *const *args);
    argv_wrapper argv_function() const;
    // @}

    /** Add another JITModule to the dependency chain. Dependencies
     * are searched to resolve symbols not found in the current
     * compilation unit while JITting. */
    void add_dependency(JITModule &dep);
    /** Registers a single Symbol as available to modules which depend
     * on this one. The Symbol structure provides both the address and
     * the LLVM type for the function, which allows type safe linkage of
     * extenal routines. */
    void add_symbol_for_export(const std::string &name, const Symbol &extern_symbol);
    /** Registers a single function as available to modules which
     * depend on this one. This routine converts the ExternSignature
     * info into an LLVM type, which allows type safe linkage of
     * external routines. */
    void add_extern_for_export(const std::string &name,
                               const ExternCFunction &extern_c_function);

    /** Look up a symbol by name in this module or its dependencies. */
    Symbol find_symbol_by_name(const std::string &) const;

    /** Take an llvm module and compile it. The requested exports will
        be available via the exports method. */
    void compile_module(std::unique_ptr<llvm::Module> mod,
                        const std::string &function_name, const Target &target,
                        const std::vector<JITModule> &dependencies = std::vector<JITModule>(),
                        const std::vector<std::string> &requested_exports = std::vector<std::string>());

    /** See JITSharedRuntime::memoization_cache_set_size */
    void memoization_cache_set_size(int64_t size) const;

    /** See JITSharedRuntime::memoization_cache_evict */
    void memoization_cache_evict(uint64_t eviction_key) const;

    /** See JITSharedRuntime::reuse_device_allocations */
    void reuse_device_allocations(bool) const;

    /** See JITSharedRuntime::get_num_threads */
    int get_num_threads() const;

    /** See JITSharedRuntime::set_num_threads */
    int set_num_threads(int) const;

    /** Return true if compile_module has been called on this module. */
    bool compiled() const;
};

class JITSharedRuntime {
public:
    // Note only the first llvm::Module passed in here is used. The same shared runtime is used for all JIT.
    static std::vector<JITModule> get(llvm::Module *m, const Target &target, bool create = true);
    static void populate_jit_handlers(JITUserContext *jit_user_context, const JITHandlers &handlers);
    static JITHandlers set_default_handlers(const JITHandlers &handlers);

    /** Set the maximum number of bytes used by memoization caching.
     * If you are compiling statically, you should include HalideRuntime.h
     * and call halide_memoization_cache_set_size() instead.
     */
    static void memoization_cache_set_size(int64_t size);

    /** Evict all cache entries that were tagged with the given
     * eviction_key in the memoize scheduling directive. If you are
     * compiling statically, you should include HalideRuntime.h and
     * call halide_memoization_cache_evict() instead.
     */
    static void memoization_cache_evict(uint64_t eviction_key);

    /** Set whether or not Halide may hold onto and reuse device
     * allocations to avoid calling expensive device API allocation
     * functions. If you are compiling statically, you should include
     * HalideRuntime.h and call halide_reuse_device_allocations
     * instead. */
    static void reuse_device_allocations(bool);

    static void release_all();

    /** Get the number of threads in the Halide thread pool. Includes the
     * calling thread. Meaningless if a custom_do_par_for has been set. */
    static int get_num_threads();

    /** Set the number of threads to use in the Halide thread pool, inclusive of
     * the calling thread. Pass zero to use a reasonable default (typically the
     * number of CPUs online). Calling this is meaningless if custom_do_par_for
     * has been set. Halide may launch more threads than this if necessary to
     * avoid deadlock when using the async scheduling directive. Returns the old
     * number. */
    static int set_num_threads(int);
};

void *get_symbol_address(const char *s);

struct JITCache {
    Target jit_target;
    // Arguments for all inputs and outputs
    std::vector<Argument> arguments;
    std::map<std::string, JITExtern> jit_externs;
    JITModule jit_module;
    WasmModule wasm_module;

    JITCache() = default;
    JITCache(Target jit_target,
             std::vector<Argument> arguments,
             std::map<std::string, JITExtern> jit_externs,
             JITModule jit_module,
             WasmModule wasm_module);

    Target get_compiled_jit_target() const;

    int call_jit_code(const void *const *args);

    void finish_profiling(JITUserContext *context);
};

struct JITErrorBuffer {
    enum { MaxBufSize = 4096 };
    char buf[MaxBufSize];
    std::atomic<size_t> end{0};

    void concat(const char *message);

    std::string str() const;

    static void handler(JITUserContext *ctx, const char *message);
};

struct JITFuncCallContext {
    JITErrorBuffer error_buffer;
    JITUserContext *context;
    bool custom_error_handler;

    JITFuncCallContext(JITUserContext *context, const JITHandlers &pipeline_handlers);

    void finalize(int exit_status);
};

}  // namespace Internal
}  // namespace Halide

#endif

namespace Halide {

struct Argument;
struct CallableContents;

namespace PythonBindings {
class PyCallable;
}

namespace Internal {

template<typename>
struct IsHalideBuffer : std::false_type {};

template<typename T, int Dims>
struct IsHalideBuffer<::Halide::Buffer<T, Dims>> : std::true_type {};

template<typename T, int Dims>
struct IsHalideBuffer<::Halide::Runtime::Buffer<T, Dims>> : std::true_type {};

template<>
struct IsHalideBuffer<halide_buffer_t *> : std::true_type {};

template<>
struct IsHalideBuffer<const halide_buffer_t *> : std::true_type {};

template<typename>
struct HalideBufferStaticTypeAndDims {
    static constexpr halide_type_t type() {
        return halide_type_t();
    }
    static constexpr int dims() {
        return -1;
    }
};

template<typename T, int Dims>
struct HalideBufferStaticTypeAndDims<::Halide::Buffer<T, Dims>> {
    static constexpr halide_type_t type() {
        if constexpr (std::is_void_v<T>) {
            return halide_type_t();
        } else {
            return halide_type_of<T>();
        }
    }
    static constexpr int dims() {
        return Dims;
    }
};

template<typename T, int Dims>
struct HalideBufferStaticTypeAndDims<::Halide::Runtime::Buffer<T, Dims>> {
    static constexpr halide_type_t type() {
        if constexpr (std::is_void_v<T>) {
            return halide_type_t();
        } else {
            return halide_type_of<T>();
        }
    }
    static constexpr int dims() {
        return Dims;
    }
};

}  // namespace Internal

class Callable {
private:
    friend class Pipeline;
    friend struct CallableContents;
    friend class PythonBindings::PyCallable;

    Internal::IntrusivePtr<CallableContents> contents;

    // ---------------------------------

    // This value is constructed so we can do the necessary runtime check
    // with a single 16-bit compare. It's designed to to the minimal checking
    // necessary to ensure that the arguments are well-formed, but not necessarily
    // "correct"; in particular, it deliberately skips checking type-and-dim
    // of Buffer arguments, since the generated code has assertions to check
    // for that anyway.
    using QuickCallCheckInfo = uint16_t;

    static constexpr QuickCallCheckInfo _make_qcci(uint8_t code, uint8_t bits) {
        return (((uint16_t)code) << 8) | (uint16_t)bits;
    }

    static constexpr QuickCallCheckInfo make_scalar_qcci(halide_type_t t) {
        return _make_qcci(t.code, t.bits);
    }

    static constexpr QuickCallCheckInfo make_buffer_qcci() {
        constexpr uint8_t fake_bits_buffer_cci = 3;
        return _make_qcci(halide_type_handle, fake_bits_buffer_cci);
    }

    static constexpr QuickCallCheckInfo make_ucon_qcci() {
        constexpr uint8_t fake_bits_ucon_cci = 5;
        return _make_qcci(halide_type_handle, fake_bits_ucon_cci);
    }

    template<typename T>
    static constexpr QuickCallCheckInfo make_qcci() {
        using T0 = std::remove_const_t<std::remove_reference_t<T>>;
        if constexpr (std::is_same_v<T0, JITUserContext *>) {
            return make_ucon_qcci();
        } else if constexpr (Internal::IsHalideBuffer<T0>::value) {
            // Don't bother checking type-and-dimensions here (the callee will do that)
            return make_buffer_qcci();
        } else if constexpr (std::is_arithmetic_v<T0> || std::is_pointer_v<T0>) {
            return make_scalar_qcci(halide_type_of<T0>());
        } else {
            // static_assert(false) will fail all the time, even inside constexpr,
            // but gating on sizeof(T) is a nice trick that ensures we will always
            // fail here (since no T is ever size 0).
            static_assert(!sizeof(T), "Illegal type passed to Callable.");
        }
    }

    template<typename... Args>
    static constexpr std::array<QuickCallCheckInfo, sizeof...(Args)> make_qcci_array() {
        return std::array<QuickCallCheckInfo, sizeof...(Args)>{make_qcci<Args>()...};
    }

    // ---------------------------------

    // This value is constructed so we can do a complete type-and-dim check
    // of Buffers, and is used for the make_std_function() method, to ensure
    // that if we specify static type-and-dims for Buffers, the ones we specify
    // actually match the underlying code. We take horrible liberties with halide_type_t
    // to make this happen -- specifically, encoding dimensionality and buffer-vs-scalar
    // into the 'lanes' field -- but that's ok since this never escapes into other usage.
    using FullCallCheckInfo = halide_type_t;

    static constexpr FullCallCheckInfo _make_fcci(halide_type_t type, int dims, bool is_buffer) {
        return type.with_lanes(((uint16_t)dims << 1) | (uint16_t)(is_buffer ? 1 : 0));
    }

    static constexpr FullCallCheckInfo make_scalar_fcci(halide_type_t t) {
        return _make_fcci(t, 0, false);
    }

    static constexpr FullCallCheckInfo make_buffer_fcci(halide_type_t t, int dims) {
        return _make_fcci(t, dims, true);
    }

    static bool is_compatible_fcci(FullCallCheckInfo actual, FullCallCheckInfo expected) {
        if (actual == expected) {
            return true;  // my, that was easy
        }

        // Might still be compatible
        const bool a_is_buffer = (actual.lanes & 1) != 0;
        const int a_dims = (((int16_t)actual.lanes) >> 1);
        const halide_type_t a_type = actual.with_lanes(0);

        const bool e_is_buffer = (expected.lanes & 1) != 0;
        const int e_dims = (((int16_t)expected.lanes) >> 1);
        const halide_type_t e_type = expected.with_lanes(0);

        const bool types_match = (a_type == halide_type_t()) ||
                                 (e_type == halide_type_t()) ||
                                 (a_type == e_type);

        const bool dims_match = a_dims < 0 ||
                                e_dims < 0 ||
                                a_dims == e_dims;

        return a_is_buffer == e_is_buffer && types_match && dims_match;
    }

    template<typename T>
    static constexpr FullCallCheckInfo make_fcci() {
        using T0 = std::remove_const_t<std::remove_reference_t<T>>;
        if constexpr (Internal::IsHalideBuffer<T0>::value) {
            using TypeAndDims = Internal::HalideBufferStaticTypeAndDims<T0>;
            return make_buffer_fcci(TypeAndDims::type(), TypeAndDims::dims());
        } else if constexpr (std::is_arithmetic_v<T0> || std::is_pointer_v<T0>) {
            return make_scalar_fcci(halide_type_of<T0>());
        } else {
            // static_assert(false) will fail all the time, even inside constexpr,
            // but gating on sizeof(T) is a nice trick that ensures we will always
            // fail here (since no T is ever size 0).
            static_assert(!sizeof(T), "Illegal type passed to Callable.");
        }
    }

    template<typename... Args>
    static constexpr std::array<FullCallCheckInfo, sizeof...(Args)> make_fcci_array() {
        return std::array<FullCallCheckInfo, sizeof...(Args)>{make_fcci<Args>()...};
    }

    // ---------------------------------

    template<int Size>
    struct ArgvStorage {
        const void *argv[Size];
        // We need a place to store the scalar inputs, since we need a pointer
        // to them and it's better to avoid relying on stack spill of arguments.
        // Note that this will usually have unused slots, but it's cheap and easy
        // compile-time allocation on the stack.
        uintptr_t argv_scalar_store[Size];

        template<typename... Args>
        explicit ArgvStorage(Args &&...args) {
            fill_slots(0, std::forward<Args>(args)...);
        }

    private:
        template<typename T, int Dims>
        HALIDE_ALWAYS_INLINE void fill_slot(size_t idx, const ::Halide::Buffer<T, Dims> &value) {
            // Don't call ::Halide::Buffer::raw_buffer(): it includes "user_assert(defined())"
            // as part of the wrapper code, and we want this lean-and-mean. Instead, stick in a null
            // value for undefined buffers, and let the Halide pipeline fail with the usual null-ptr
            // check. (Note that H::R::B::get() *never* returns null; you must check defined() first.)
            argv[idx] = value.defined() ? value.get()->raw_buffer() : nullptr;
        }

        template<typename T, int Dims>
        HALIDE_ALWAYS_INLINE void fill_slot(size_t idx, const ::Halide::Runtime::Buffer<T, Dims> &value) {
            argv[idx] = value.raw_buffer();
        }

        HALIDE_ALWAYS_INLINE
        void fill_slot(size_t idx, halide_buffer_t *value) {
            argv[idx] = value;
        }

        HALIDE_ALWAYS_INLINE
        void fill_slot(size_t idx, const halide_buffer_t *value) {
            argv[idx] = value;
        }

        HALIDE_ALWAYS_INLINE
        void fill_slot(size_t idx, JITUserContext *value) {
            auto *dest = &argv_scalar_store[idx];
            *dest = (uintptr_t)value;
            argv[idx] = dest;
        }

        template<typename T>
        HALIDE_ALWAYS_INLINE void fill_slot(size_t idx, const T &value) {
            auto *dest = &argv_scalar_store[idx];
            *(T *)dest = value;
            argv[idx] = dest;
        }

        template<typename T>
        HALIDE_ALWAYS_INLINE void fill_slots(size_t idx, const T &value) {
            fill_slot(idx, value);
        }

        template<typename First, typename Second, typename... Rest>
        HALIDE_ALWAYS_INLINE void fill_slots(int idx, First &&first, Second &&second, Rest &&...rest) {
            fill_slots<First>(idx, std::forward<First>(first));
            fill_slots<Second, Rest...>(idx + 1, std::forward<Second>(second), std::forward<Rest>(rest)...);
        }
    };

    Callable(const std::string &name,
             const JITHandlers &jit_handlers,
             const std::map<std::string, JITExtern> &jit_externs,
             Internal::JITCache &&jit_cache);

    // Note that the first entry in argv must always be a JITUserContext*.
    int call_argv_checked(size_t argc, const void *const *argv, const QuickCallCheckInfo *actual_cci) const;

    using FailureFn = std::function<int(JITUserContext *)>;

    FailureFn do_check_fail(int bad_idx, size_t argc, const char *verb) const;
    FailureFn check_qcci(size_t argc, const QuickCallCheckInfo *actual_cci) const;
    FailureFn check_fcci(size_t argc, const FullCallCheckInfo *actual_cci) const;

    template<typename... Args>
    int call(JITUserContext *context, Args &&...args) const {
        // This is built at compile time!
        static constexpr auto actual_arg_types = make_qcci_array<JITUserContext *, Args...>();

        constexpr size_t count = sizeof...(args) + 1;
        ArgvStorage<count> argv(context, std::forward<Args>(args)...);
        return call_argv_checked(count, &argv.argv[0], actual_arg_types.data());
    }

    /** Return the expected Arguments for this Callable, in the order they must be specified, including all outputs.
     * Note that the first entry will *always* specify a JITUserContext. */
    const std::vector<Argument> &arguments() const;

public:
    /** Construct a default Callable. This is not usable (trying to call it will fail).
     * The defined() method will return false. */
    Callable();

    /** Return true if the Callable is well-defined and usable, false if it is a default-constructed empty Callable. */
    bool defined() const;

    template<typename... Args>
    HALIDE_FUNCTION_ATTRS int
    operator()(JITUserContext *context, Args &&...args) const {
        return call(context, std::forward<Args>(args)...);
    }

    template<typename... Args>
    HALIDE_FUNCTION_ATTRS int
    operator()(Args &&...args) const {
        JITUserContext empty;
        return call(&empty, std::forward<Args>(args)...);
    }

    /** This allows us to construct a std::function<> that wraps the Callable.
     * This is nice in that it is, well, just a std::function, but also in that
     * since the argument-count-and-type checking are baked into the language,
     * we can do the relevant checking only once -- when we first create the std::function --
     * and skip it on all actual *calls* to the function, making it slightly more efficient.
     * It's also more type-forgiving, in that the usual C++ numeric coercion rules apply here.
     *
     * The downside is that there isn't (currently) any way to automatically infer
     * the static types reliably, since we may be using (e.g.) a Param<void>, where the
     * type in question isn't available to the C++ compiler. This means that the coder
     * must supply the correct type signature when calling this function -- but the good news
     * is that if you get it wrong, this function will fail when you call it. (In other words:
     * it can't choose the right thing for you, but it can tell you when you do the wrong thing.)
     *
     * TODO: it's possible that we could infer the correct signatures in some cases,
     * and only fail for the ambiguous cases, but that would require a lot more template-fu
     * here and elsewhere. I think this is good enough for now.
     *
     * TODO: is it possible to annotate the result of a std::function<> with HALIDE_FUNCTION_ATTRS?
     */
    template<typename First, typename... Rest>
    std::function<int(First, Rest...)>
    make_std_function() const {
        if constexpr (std::is_same_v<First, JITUserContext *>) {
            constexpr auto actual_arg_types = make_fcci_array<First, Rest...>();
            const auto failure_fn = check_fcci(actual_arg_types.size(), actual_arg_types.data());
            if (failure_fn) {
                // Return a wrapper for the failure_fn in case the error handler is a no-op,
                // so that subsequent calls won't attempt to use possibly-wrong argv packing.
                return [*this, failure_fn](auto &&first, auto &&...rest) -> int {
                    return failure_fn(std::forward<First>(first));
                };
            }

            // Capture *this to ensure that the CallableContents stay valid as long as the std::function does
            return [*this](auto &&first, auto &&...rest) -> int {
                constexpr size_t count = 1 + sizeof...(rest);
                ArgvStorage<count> argv(std::forward<First>(first), std::forward<Rest>(rest)...);
                return call_argv_fast(count, &argv.argv[0]);
            };
        } else {
            // Explicitly prepend JITUserContext* as first actual-arg-type.
            constexpr auto actual_arg_types = make_fcci_array<JITUserContext *, First, Rest...>();
            const auto failure_fn = check_fcci(actual_arg_types.size(), actual_arg_types.data());
            if (failure_fn) {
                // Return a wrapper for the failure_fn in case the error handler is a no-op,
                // so that subsequent calls won't attempt to use possibly-wrong argv packing.
                return [*this, failure_fn](auto &&first, auto &&...rest) -> int {
                    JITUserContext empty;
                    return failure_fn(&empty);
                };
            }

            // Capture *this to ensure that the CallableContents stay valid as long as the std::function does
            return [*this](auto &&first, auto &&...rest) -> int {
                // Explicitly prepend an (empty) JITUserContext to the args.
                JITUserContext empty;
                constexpr size_t count = 1 + 1 + sizeof...(rest);
                ArgvStorage<count> argv(&empty, std::forward<First>(first), std::forward<Rest>(rest)...);
                return call_argv_fast(count, &argv.argv[0]);
            };
        }
    }

    /** Unsafe low-overhead way of invoking the Callable.
     *
     * This function relies on the same calling convention as the argv-based
     * functions generated for ahead-of-time compiled Halide pilelines.
     *
     * Very rough specifications of the calling convention (but check the source
     * code to be sure):
     *
     *   * Arguments are passed in the same order as they appear in the C
     *     function argument list.
     *   * The first entry in argv must always be a JITUserContext*. Please,
     *     note that this means that argv[0] actually contains JITUserContext**.
     *   * All scalar arguments are passed by pointer, not by value, regardless of size.
     *   * All buffer arguments (input or output) are passed as halide_buffer_t*.
     *
     */
    int call_argv_fast(size_t argc, const void *const *argv) const;
};

}  // namespace Halide

#endif
#ifndef HALIDE_FUNC_H
#define HALIDE_FUNC_H

/** \file
 *
 * Defines Func - the front-end handle on a halide function, and related classes.
 */

#ifndef HALIDE_MODULE_H
#define HALIDE_MODULE_H

/** \file
 *
 * Defines Module, an IR container that fully describes a Halide program.
 */

#include <functional>
#include <map>
#include <memory>
#include <string>

#ifndef HALIDE_FUNCTION_H
#define HALIDE_FUNCTION_H

/** \file
 * Defines the internal representation of a halide function and related classes
 */
#include <map>
#include <string>
#include <utility>
#include <vector>

#ifndef HALIDE_DEFINITION_H
#define HALIDE_DEFINITION_H

/** \file
 * Defines the internal representation of a halide function's definition and related classes
 */

#ifndef HALIDE_SCHEDULE_H
#define HALIDE_SCHEDULE_H

/** \file
 * Defines the internal representation of the schedule for a function
 */

#include <map>
#include <string>
#include <utility>
#include <vector>

#ifndef HALIDE_FUNCTION_PTR_H
#define HALIDE_FUNCTION_PTR_H


namespace Halide {
namespace Internal {

/** Functions are allocated in groups for memory management. Each
 * group has a ref count associated with it. All within-group
 * references must be weak. If there are any references from outside
 * the group, at least one must be strong.  Within-group references
 * may form cycles, but there may not be reference cycles that span
 * multiple groups. These rules are not enforced automatically. */
struct FunctionGroup;

/** The opaque struct describing a Halide function. Wrap it in a
 * Function object to access it. */
struct FunctionContents;

/** A possibly-weak pointer to a Halide function. Take care to follow
 * the rules mentioned above. Preserves weakness/strength on copy.
 *
 * Note that Function objects are always strong pointers to Halide
 * functions.
 */
struct FunctionPtr {
    /** A strong and weak pointer to the group. Only one of these
     * should be non-zero. */
    // @{
    IntrusivePtr<FunctionGroup> strong;
    FunctionGroup *weak = nullptr;
    // @}

    /** The index of the function within the group. */
    int idx = 0;

    /** Get a pointer to the group this Function belongs to. */
    FunctionGroup *group() const {
        return weak ? weak : strong.get();
    }

    /** Get the opaque FunctionContents object this pointer refers
     * to. Wrap it in a Function to do anything interesting with it. */
    // @{
    FunctionContents *get() const;

    FunctionContents &operator*() const {
        return *get();
    }

    FunctionContents *operator->() const {
        return get();
    }
    // @}

    /** Convert from a strong reference to a weak reference. Does
     * nothing if the pointer is undefined, or if the reference is
     * already weak. */
    void weaken() {
        weak = group();
        strong = nullptr;
    }

    /** Convert from a weak reference to a strong reference. Does
     * nothing if the pointer is undefined, or if the reference is
     * already strong. */
    void strengthen() {
        strong = group();
        weak = nullptr;
    }

    /** Check if the reference is defined. */
    bool defined() const {
        return weak || strong.defined();
    }

    /** Check if two FunctionPtrs refer to the same Function. */
    bool same_as(const FunctionPtr &other) const {
        return idx == other.idx && group() == other.group();
    }

    /** Pointer comparison, for using FunctionPtrs as keys in maps and
     * sets. */
    bool operator<(const FunctionPtr &other) const {
        return get() < other.get();
    }
};

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_LOOP_PARTITIONING_DIRECTIVE_H
#define HALIDE_LOOP_PARTITIONING_DIRECTIVE_H

/** \file
 * Defines the Partition enum.
 */

#include <string>


namespace Halide {

/** Different ways to handle loops with a potentially optimizable boundary conditions. */
enum class Partition {
    /** Automatically let Halide decide on Loop Parititioning. */
    Auto,

    /** Disallow loop partitioning. */
    Never,

    /** Force partitioning of the loop, even in the tail cases of outer
     * partitioned loops. If Halide can't find a way to partition this loop, it
     * will raise an error. */
    Always
};

}  // namespace Halide

#endif  // HALIDE_LOOP_PARTITIONING_DIRECTIVE_H
#ifndef HALIDE_PREFETCH_DIRECTIVE_H
#define HALIDE_PREFETCH_DIRECTIVE_H

/** \file
 * Defines the PrefetchDirective struct
 */

#include <string>


namespace Halide {

/** Different ways to handle accesses outside the original extents in a prefetch. */
enum class PrefetchBoundStrategy {
    /** Clamp the prefetched exprs by intersecting the prefetched region with
     * the original extents. This may make the exprs of the prefetched region
     * more complicated. */
    Clamp,

    /** Guard the prefetch with if-guards that ignores the prefetch if
     * any of the prefetched region ever goes beyond the original extents
     * (i.e. all or nothing). */
    GuardWithIf,

    /** Leave the prefetched exprs as-is (no if-guards around the prefetch
     * and no intersecting with the original extents). This makes the prefetch
     * exprs simpler but this may cause prefetching of the region outside the original
     * extents. This is good if prefetch won't fault when accessing region
     * outside the original extents. */
    NonFaulting
};

namespace Internal {

struct PrefetchDirective {
    std::string name;
    std::string at;    // the loop in which to do the prefetch
    std::string from;  // the loop-var to use as the base for prefetching. It must be nested outside loop_var (or be equal to loop_var).
    Expr offset;       // 'from + offset' will determine the bounds being prefetched.
    PrefetchBoundStrategy strategy;
    // If it's a prefetch load from an image parameter, this points to that.
    Parameter param;
};

}  // namespace Internal

}  // namespace Halide

#endif  // HALIDE_PREFETCH_DIRECTIVE_H

namespace Halide {

class Func;
struct VarOrRVar;

namespace Internal {
class Function;
struct FunctionContents;
struct LoopLevelContents;
}  // namespace Internal

/** Different ways to handle a tail case in a split when the
 * factor does not provably divide the extent. */
enum class TailStrategy {
    /** Round up the extent to be a multiple of the split
     * factor. Not legal for RVars, as it would change the meaning
     * of the algorithm. Pros: generates the simplest, fastest
     * code. Cons: if used on a stage that reads from the input or
     * writes to the output, constrains the input or output size
     * to be a multiple of the split factor. */
    RoundUp,

    /** Guard the inner loop with an if statement that prevents
     * evaluation beyond the original extent. Always legal. The if
     * statement is treated like a boundary condition, and
     * factored out into a loop epilogue if possible. Pros: no
     * redundant re-evaluation; does not constrain input our
     * output sizes. Cons: increases code size due to separate
     * tail-case handling; vectorization will scalarize in the tail
     * case to handle the if statement. */
    GuardWithIf,

    /** Guard the loads and stores in the loop with an if statement
     * that prevents evaluation beyond the original extent. Always
     * legal. The if statement is treated like a boundary condition,
     * and factored out into a loop epilogue if possible.
     * Pros: no redundant re-evaluation; does not constrain input or
     * output sizes. Cons: increases code size due to separate
     * tail-case handling. */
    Predicate,

    /** Guard the loads in the loop with an if statement that
     * prevents evaluation beyond the original extent. Only legal
     * for innermost splits. Not legal for RVars, as it would change
     * the meaning of the algorithm. The if statement is treated like
     * a boundary condition, and factored out into a loop epilogue if
     * possible.
     * Pros: does not constrain input sizes, output size constraints
     * are simpler than full predication. Cons: increases code size
     * due to separate tail-case handling, constrains the output size
     * to be a multiple of the split factor. */
    PredicateLoads,

    /** Guard the stores in the loop with an if statement that
     * prevents evaluation beyond the original extent. Only legal
     * for innermost splits. Not legal for RVars, as it would change
     * the meaning of the algorithm. The if statement is treated like
     * a boundary condition, and factored out into a loop epilogue if
     * possible.
     * Pros: does not constrain output sizes, input size constraints
     * are simpler than full predication. Cons: increases code size
     * due to separate tail-case handling, constraints the input size
     * to be a multiple of the split factor.. */
    PredicateStores,

    /** Prevent evaluation beyond the original extent by shifting
     * the tail case inwards, re-evaluating some points near the
     * end. Only legal for pure variables in pure definitions. If
     * the inner loop is very simple, the tail case is treated
     * like a boundary condition and factored out into an
     * epilogue.
     *
     * This is a good trade-off between several factors. Like
     * RoundUp, it supports vectorization well, because the inner
     * loop is always a fixed size with no data-dependent
     * branching. It increases code size slightly for inner loops
     * due to the epilogue handling, but not for outer loops
     * (e.g. loops over tiles). If used on a stage that reads from
     * an input or writes to an output, this stategy only requires
     * that the input/output extent be at least the split factor,
     * instead of a multiple of the split factor as with RoundUp. */
    ShiftInwards,

    /** Equivalent to ShiftInwards, but protects values that would be
     * re-evaluated by loading the memory location that would be stored to,
     * modifying only the elements not contained within the overlap, and then
     * storing the blended result.
     *
     * This tail strategy is useful when you want to use ShiftInwards to
     * vectorize without a scalar tail, but are scheduling a stage where that
     * isn't legal (e.g. an update definition).
     *
     * Because this is a read - modify - write, this tail strategy cannot be
     * used on any dimension the stage is parallelized over as it would cause a
     * race condition.
     */
    ShiftInwardsAndBlend,

    /** Equivalent to RoundUp, but protected values that would be written beyond
     * the end by loading the memory location that would be stored to,
     * modifying only the elements within the region being computed, and then
     * storing the blended result.
     *
     * This tail strategy is useful when vectorizing an update to some sub-region
     * of a larger Func. As with ShiftInwardsAndBlend, it can't be combined with
     * parallelism.
     */
    RoundUpAndBlend,

    /** For pure definitions use ShiftInwards. For pure vars in
     * update definitions use RoundUp. For RVars in update
     * definitions use GuardWithIf. */
    Auto
};

/** Different ways to handle the case when the start/end of the loops of stages
 * computed with (fused) are not aligned. */
enum class LoopAlignStrategy {
    /** Shift the start of the fused loops to align. */
    AlignStart,

    /** Shift the end of the fused loops to align. */
    AlignEnd,

    /** compute_with will make no attempt to align the start/end of the
     * fused loops. */
    NoAlign,

    /** By default, LoopAlignStrategy is set to NoAlign. */
    Auto
};

/** A reference to a site in a Halide statement at the top of the
 * body of a particular for loop. Evaluating a region of a halide
 * function is done by generating a loop nest that spans its
 * dimensions. We schedule the inputs to that function by
 * recursively injecting realizations for them at particular sites
 * in this loop nest. A LoopLevel identifies such a site. The site
 * can either be a loop nest within all stages of a function
 * or it can refer to a loop nest within a particular function's
 * stage (initial definition or updates).
 *
 * Note that a LoopLevel is essentially a pointer to an underlying value;
 * all copies of a LoopLevel refer to the same site, so mutating one copy
 * (via the set() method) will effectively mutate all copies:
 \code
 Func f;
 Var x;
 LoopLevel a(f, x);
 // Both a and b refer to LoopLevel(f, x)
 LoopLevel b = a;
 // Now both a and b refer to LoopLevel::root()
 a.set(LoopLevel::root());
 \endcode
 * This is quite useful when splitting Halide code into utility libraries, as it allows
 * a library to schedule code according to a caller's specifications, even if the caller
 * hasn't fully defined its pipeline yet:
 \code
 Func demosaic(Func input,
              LoopLevel intermed_compute_at,
              LoopLevel intermed_store_at,
              LoopLevel output_compute_at) {
    Func intermed = ...;
    Func output = ...;
    intermed.compute_at(intermed_compute_at).store_at(intermed_store_at);
    output.compute_at(output_compute_at);
    return output;
 }

 void process() {
     // Note that these LoopLevels are all undefined when we pass them to demosaic()
     LoopLevel intermed_compute_at, intermed_store_at, output_compute_at;
     Func input = ...;
     Func demosaiced = demosaic(input, intermed_compute_at, intermed_store_at, output_compute_at);
     Func output = ...;

     // We need to ensure all LoopLevels have a well-defined value prior to lowering:
     intermed_compute_at.set(LoopLevel(output, y));
     intermed_store_at.set(LoopLevel(output, y));
     output_compute_at.set(LoopLevel(output, x));
 }
 \endcode
 */
class LoopLevel {
    Internal::IntrusivePtr<Internal::LoopLevelContents> contents;

    explicit LoopLevel(Internal::IntrusivePtr<Internal::LoopLevelContents> c)
        : contents(std::move(c)) {
    }

public:
    /** Return the index of the function stage associated with this loop level.
     * Asserts if undefined */
    int stage_index() const;

    /** Identify the loop nest corresponding to some dimension of some function */
    // @{
    LoopLevel(const Internal::Function &f, const VarOrRVar &v, int stage_index = -1);
    LoopLevel(const Func &f, const VarOrRVar &v, int stage_index = -1);
    // @}

    /** Construct an undefined LoopLevel. Calling any method on an undefined
     * LoopLevel (other than set()) will assert. */
    LoopLevel();

    /** For deserialization only. */
    LoopLevel(const std::string &func_name, const std::string &var_name,
              bool is_rvar, int stage_index, bool locked = false);

    /** Construct a special LoopLevel value that implies
     * that a function should be inlined away. */
    static LoopLevel inlined();

    /** Construct a special LoopLevel value which represents the
     * location outside of all for loops. */
    static LoopLevel root();

    /** Mutate our contents to match the contents of 'other'. */
    void set(const LoopLevel &other);

    // All the public methods below this point are meant only for internal
    // use by Halide, rather than user code; hence, they are deliberately
    // documented with plain comments (rather than Doxygen) to avoid being
    // present in user documentation.

    // Lock this LoopLevel.
    LoopLevel &lock();

    // Return the Func name. Asserts if the LoopLevel is_root() or is_inlined() or !defined().
    std::string func() const;

    // Return the VarOrRVar. Asserts if the LoopLevel is_root() or is_inlined() or !defined().
    VarOrRVar var() const;

    // Return true iff the LoopLevel is defined. (Only LoopLevels created
    // with the default ctor are undefined.)
    bool defined() const;

    // Test if a loop level corresponds to inlining the function.
    bool is_inlined() const;

    // Test if a loop level is 'root', which describes the site
    // outside of all for loops.
    bool is_root() const;

    // For serialization only. Do not use in other cases.
    int get_stage_index() const;

    // For serialization only. Do not use in other cases.
    std::string func_name() const;

    // For serialization only. Do not use in other cases.
    std::string var_name() const;

    // For serialization only. Do not use in other cases.
    bool is_rvar() const;

    // For serialization only. Do not use in other cases.
    bool locked() const;

    // Return a string of the form func.var -- note that this is safe
    // to call for root or inline LoopLevels, but asserts if !defined().
    std::string to_string() const;

    // Compare this loop level against the variable name of a for
    // loop, to see if this loop level refers to the site
    // immediately inside this loop. Asserts if !defined().
    bool match(const std::string &loop) const;

    bool match(const LoopLevel &other) const;

    // Check if two loop levels are exactly the same.
    bool operator==(const LoopLevel &other) const;

    bool operator!=(const LoopLevel &other) const {
        return !(*this == other);
    }

private:
    void check_defined() const;
    void check_locked() const;
    void check_defined_and_locked() const;
};

struct FuseLoopLevel {
    LoopLevel level;
    /** Contains alignment strategies for the fused dimensions (indexed by the
     * dimension name). If not in the map, use the default alignment strategy
     * to align the fused dimension (see \ref LoopAlignStrategy::Auto).
     */
    std::map<std::string, LoopAlignStrategy> align;

    FuseLoopLevel()
        : level(LoopLevel::inlined().lock()) {
    }
    FuseLoopLevel(const LoopLevel &level, const std::map<std::string, LoopAlignStrategy> &align)
        : level(level), align(align) {
    }
};

namespace Internal {

class IRMutator;
struct ReductionVariable;

struct Split {
    std::string old_var, outer, inner;
    Expr factor;
    bool exact;  // Is it required that the factor divides the extent
                 // of the old var. True for splits of RVars. Forces
                 // tail strategy to be GuardWithIf.
    TailStrategy tail;

    enum SplitType { SplitVar = 0,
                     RenameVar,
                     FuseVars };

    // If split_type is Rename, then this is just a renaming of the
    // old_var to the outer and not a split. The inner var should
    // be ignored, and factor should be one. Renames are kept in
    // the same list as splits so that ordering between them is
    // respected.

    // If split_type is Fuse, then this does the opposite of a
    // split, it joins the outer and inner into the old_var.
    SplitType split_type;
};

/** Each Dim below has a dim_type, which tells you what
 * transformations are legal on it. When you combine two Dims of
 * distinct DimTypes (e.g. with Stage::fuse), the combined result has
 * the greater enum value of the two types. */
enum class DimType {
    /** This dim originated from a Var. You can evaluate a Func at
     * distinct values of this Var in any order over an interval
     * that's at least as large as the interval required. In pure
     * definitions you can even redundantly re-evaluate points. */
    PureVar = 0,

    /** The dim originated from an RVar. You can evaluate a Func at
     * distinct values of this RVar in any order (including in
     * parallel) over exactly the interval specified in the
     * RDom. PureRVars can also be reordered arbitrarily in the dims
     * list, as there are no data hazards between the evaluation of
     * the Func at distinct values of the RVar.
     *
     * The most common case where an RVar is considered pure is RVars
     * that are used in a way which obeys all the syntactic
     * constraints that a Var does, e.g:
     *
     \code
     RDom r(0, 100);
     f(r.x) = f(r.x) + 5;
     \endcode
     *
     * Other cases where RVars are pure are where the sites being
     * written to by the Func evaluated at one value of the RVar
     * couldn't possibly collide with the sites being written or read
     * by the Func at a distinct value of the RVar. For example, r.x
     * is pure in the following three definitions:
     *
     \code

     // This definition writes to even coordinates and reads from the
     // same site (which no other value of r.x is writing to) and odd
     // sites (which no other value of r.x is writing to):
     f(2*r.x) = max(f(2*r.x), f(2*r.x + 7));

     // This definition writes to scanline zero and reads from the the
     // same site and scanline one:
     f(r.x, 0) += f(r.x, 1);

     // This definition reads and writes over non-overlapping ranges:
     f(r.x + 100) += f(r.x);
     \endcode
     *
     * To give two counterexamples, r.x is not pure in the following
     * definitions:
     *
     \code
     // The same site is written by distinct values of the RVar
     // (write-after-write hazard):
     f(r.x / 2) += f(r.x);

     // One value of r.x reads from a site that another value of r.x
     // is writing to (read-after-write hazard):
     f(r.x) += f(r.x + 1);
     \endcode
     */
    PureRVar,

    /** The dim originated from an RVar. You must evaluate a Func at
     * distinct values of this RVar in increasing order over precisely
     * the interval specified in the RDom. ImpureRVars may not be
     * reordered with respect to other ImpureRVars.
     *
     * All RVars are impure by default. Those for which we can prove
     * no data hazards exist get promoted to PureRVar. There are two
     * instances in which ImpureRVars may be parallelized or reordered
     * even in the presence of hazards:
     *
     * 1) In the case of an update definition that has been proven to be
     * an associative and commutative reduction, reordering of
     * ImpureRVars is allowed, and parallelizing them is allowed if
     * the update has been made atomic.
     *
     * 2) ImpureRVars can also be reordered and parallelized if
     * Func::allow_race_conditions() has been set. This is the escape
     * hatch for when there are no hazards but the checks above failed
     * to prove that (RDom::where can encode arbitrary facts about
     * non-linear integer arithmetic, which is undecidable), or for
     * when you don't actually care about the non-determinism
     * introduced by data hazards (e.g. in the algorithm HOGWILD!).
     */
    ImpureRVar,
};

/** The Dim struct represents one loop in the schedule's
 * representation of a loop nest. */
struct Dim {
    /** Name of the loop variable */
    std::string var;

    /** How are the loop values traversed (e.g. unrolled, vectorized, parallel) */
    ForType for_type;

    /** On what device does the body of the loop execute (e.g. Host, GPU, Hexagon) */
    DeviceAPI device_api;

    /** The DimType tells us what transformations are legal on this
     * loop (see the DimType enum above). */
    DimType dim_type;

    /** The strategy for loop partitioning. */
    Partition partition_policy;

    /** Can this loop be evaluated in any order (including in
     * parallel)? Equivalently, are there no data hazards between
     * evaluations of the Func at distinct values of this var? */
    bool is_pure() const {
        return (dim_type == DimType::PureVar) || (dim_type == DimType::PureRVar);
    }

    /** Did this loop originate from an RVar (in which case the bounds
     * of the loops are algorithmically meaningful)? */
    bool is_rvar() const {
        return (dim_type == DimType::PureRVar) || (dim_type == DimType::ImpureRVar);
    }

    /** Could multiple iterations of this loop happen at the same
     * time, with reads and writes interleaved in arbitrary ways
     * according to the memory model of the underlying compiler and
     * machine? */
    bool is_unordered_parallel() const {
        return Halide::Internal::is_unordered_parallel(for_type);
    }

    /** Could multiple iterations of this loop happen at the same
     * time? Vectorized and GPULanes loop types are parallel but not
     * unordered, because the loop iterations proceed together in
     * lockstep with some well-defined outcome if there are hazards. */
    bool is_parallel() const {
        return Halide::Internal::is_parallel(for_type);
    }
};

/** A bound on a loop, typically from Func::bound */
struct Bound {
    /** The loop var being bounded */
    std::string var;

    /** Declared min and extent of the loop. min may be undefined if
     * Func::bound_extent was used. */
    Expr min, extent;

    /** If defined, the number of iterations will be a multiple of
     * "modulus", and the first iteration will be at a value congruent
     * to "remainder" modulo "modulus". Set by Func::align_bounds and
     * Func::align_extent. */
    Expr modulus, remainder;
};

/** Properties of one axis of the storage of a Func */
struct StorageDim {
    /** The var in the pure definition corresponding to this axis */
    std::string var;

    /** The bounds allocated (not computed) must be a multiple of
     * "alignment". Set by Func::align_storage. */
    Expr alignment;

    /** The bounds allocated (not computed). Set by Func::bound_storage. */
    Expr bound;

    /** If the Func is explicitly folded along this axis (with
     * Func::fold_storage) this gives the extent of the circular
     * buffer used, and whether it is used in increasing order
     * (fold_forward = true) or decreasing order (fold_forward =
     * false). */
    Expr fold_factor;
    bool fold_forward;
};

/** This represents two stages with fused loop nests from outermost to
 * a specific loop level. The loops to compute func_1(stage_1) are
 * fused with the loops to compute func_2(stage_2) from outermost to
 * loop level var_name and the computation from stage_1 of func_1
 * occurs first.
 */
struct FusedPair {
    std::string func_1;
    std::string func_2;
    size_t stage_1;
    size_t stage_2;
    std::string var_name;

    FusedPair() = default;
    FusedPair(const std::string &f1, size_t s1, const std::string &f2,
              size_t s2, const std::string &var)
        : func_1(f1), func_2(f2), stage_1(s1), stage_2(s2), var_name(var) {
    }

    bool operator==(const FusedPair &other) const {
        return (func_1 == other.func_1) && (func_2 == other.func_2) &&
               (stage_1 == other.stage_1) && (stage_2 == other.stage_2) &&
               (var_name == other.var_name);
    }
    bool operator<(const FusedPair &other) const {
        if (func_1 != other.func_1) {
            return func_1 < other.func_1;
        }
        if (func_2 != other.func_2) {
            return func_2 < other.func_2;
        }
        if (var_name != other.var_name) {
            return var_name < other.var_name;
        }
        if (stage_1 != other.stage_1) {
            return stage_1 < other.stage_1;
        }
        return stage_2 < other.stage_2;
    }
};

struct FuncScheduleContents;
struct StageScheduleContents;
struct FunctionContents;

/** A schedule for a Function of a Halide pipeline. This schedule is
 * applied to all stages of the Function. Right now this interface is
 * basically a struct, offering mutable access to its innards.
 * In the future it may become more encapsulated. */
class FuncSchedule {
    IntrusivePtr<FuncScheduleContents> contents;

public:
    FuncSchedule(IntrusivePtr<FuncScheduleContents> c)
        : contents(std::move(c)) {
    }
    FuncSchedule(const FuncSchedule &other) = default;
    FuncSchedule();

    /** Return a deep copy of this FuncSchedule. It recursively deep copies all
     * called functions, schedules, specializations, and reduction domains. This
     * method takes a map of <old FunctionContents, deep-copied version> as input
     * and would use the deep-copied FunctionContents from the map if exists
     * instead of creating a new deep-copy to avoid creating deep-copies of the
     * same FunctionContents multiple times.
     */
    FuncSchedule deep_copy(
        std::map<FunctionPtr, FunctionPtr> &copied_map) const;

    /** This flag is set to true if the schedule is memoized. */
    // @{
    bool &memoized();
    bool memoized() const;
    // @}

    /** This flag is set to true if the schedule is memoized and has an attached
     *  eviction key. */
    // @{
    Expr &memoize_eviction_key();
    Expr memoize_eviction_key() const;
    // @}

    /** Is the production of this Function done asynchronously */
    bool &async();
    bool async() const;

    Expr &ring_buffer();
    Expr &ring_buffer() const;

    /** The list and order of dimensions used to store this
     * function. The first dimension in the vector corresponds to the
     * innermost dimension for storage (i.e. which dimension is
     * tightly packed in memory) */
    // @{
    const std::vector<StorageDim> &storage_dims() const;
    std::vector<StorageDim> &storage_dims();
    // @}

    /** The memory type (heap/stack/shared/etc) used to back this Func. */
    // @{
    MemoryType memory_type() const;
    MemoryType &memory_type();
    // @}

    /** You may explicitly bound some of the dimensions of a function,
     * or constrain them to lie on multiples of a given factor. See
     * \ref Func::bound and \ref Func::align_bounds and \ref Func::align_extent. */
    // @{
    const std::vector<Bound> &bounds() const;
    std::vector<Bound> &bounds();
    // @}

    /** You may explicitly specify an estimate of some of the function
     * dimensions. See \ref Func::set_estimate */
    // @{
    const std::vector<Bound> &estimates() const;
    std::vector<Bound> &estimates();
    // @}

    /** Mark calls of a function by 'f' to be replaced with its identity
     * wrapper or clone during the lowering stage. If the string 'f' is empty,
     * it means replace all calls to the function by all other functions
     * (excluding itself) in the pipeline with the global identity wrapper.
     * See \ref Func::in and \ref Func::clone_in for more details. */
    // @{
    const std::map<std::string, Internal::FunctionPtr> &wrappers() const;
    std::map<std::string, Internal::FunctionPtr> &wrappers();
    void add_wrapper(const std::string &f,
                     const Internal::FunctionPtr &wrapper);
    // @}

    /** At what sites should we inject the allocation and the
     * computation of this function? The store_level must be outside
     * of or equal to the compute_level. If the compute_level is
     * inline, the store_level is meaningless. See \ref Func::store_at
     * and \ref Func::compute_at */
    // @{
    const LoopLevel &store_level() const;
    const LoopLevel &compute_level() const;
    const LoopLevel &hoist_storage_level() const;
    LoopLevel &store_level();
    LoopLevel &compute_level();
    LoopLevel &hoist_storage_level();
    // @}

    /** Pass an IRVisitor through to all Exprs referenced in the
     * Schedule. */
    void accept(IRVisitor *) const;

    /** Pass an IRMutator through to all Exprs referenced in the
     * Schedule. */
    void mutate(IRMutator *);
};

/** A schedule for a single stage of a Halide pipeline. Right now this
 * interface is basically a struct, offering mutable access to its
 * innards. In the future it may become more encapsulated. */
class StageSchedule {
    IntrusivePtr<StageScheduleContents> contents;

public:
    StageSchedule(IntrusivePtr<StageScheduleContents> c)
        : contents(std::move(c)) {
    }
    StageSchedule(const StageSchedule &other) = default;
    StageSchedule();
    StageSchedule(const std::vector<ReductionVariable> &rvars, const std::vector<Split> &splits,
                  const std::vector<Dim> &dims, const std::vector<PrefetchDirective> &prefetches,
                  const FuseLoopLevel &fuse_level, const std::vector<FusedPair> &fused_pairs,
                  bool touched, bool allow_race_conditions, bool atomic, bool override_atomic_associativity_test);

    /** Return a copy of this StageSchedule. */
    StageSchedule get_copy() const;

    /** This flag is set to true if the dims list has been manipulated
     * by the user (or if a ScheduleHandle was created that could have
     * been used to manipulate it). It controls the warning that
     * occurs if you schedule the vars of the pure step but not the
     * update steps. */
    // @{
    bool &touched();
    bool touched() const;
    // @}

    /** RVars of reduction domain associated with this schedule if there is any. */
    // @{
    const std::vector<ReductionVariable> &rvars() const;
    std::vector<ReductionVariable> &rvars();
    // @}

    /** The traversal of the domain of a function can have some of its
     * dimensions split into sub-dimensions. See \ref Func::split */
    // @{
    const std::vector<Split> &splits() const;
    std::vector<Split> &splits();
    // @}

    /** The list and ordering of dimensions used to evaluate this
     * function, after all splits have taken place. The first
     * dimension in the vector corresponds to the innermost for loop,
     * and the last is the outermost. Also specifies what type of for
     * loop to use for each dimension. Does not specify the bounds on
     * each dimension. These get inferred from how the function is
     * used, what the splits are, and any optional bounds in the list below. */
    // @{
    const std::vector<Dim> &dims() const;
    std::vector<Dim> &dims();
    // @}

    /** You may perform prefetching in some of the dimensions of a
     * function. See \ref Func::prefetch */
    // @{
    const std::vector<PrefetchDirective> &prefetches() const;
    std::vector<PrefetchDirective> &prefetches();
    // @}

    /** Innermost loop level of fused loop nest for this function stage.
     * Fusion runs from outermost to this loop level. The stages being fused
     * should not have producer/consumer relationship. See \ref Func::compute_with
     * and \ref Func::compute_with */
    // @{
    const FuseLoopLevel &fuse_level() const;
    FuseLoopLevel &fuse_level();
    // @}

    /** List of function stages that are to be fused with this function stage
     * from the outermost loop to a certain loop level. Those function stages
     * are to be computed AFTER this function stage at the last fused loop level.
     * This list is populated when realization_order() is called. See
     * \ref Func::compute_with */
    // @{
    const std::vector<FusedPair> &fused_pairs() const;
    std::vector<FusedPair> &fused_pairs();

    /** Are race conditions permitted? */
    // @{
    bool allow_race_conditions() const;
    bool &allow_race_conditions();
    // @}

    /** Use atomic update? */
    // @{
    bool atomic() const;
    bool &atomic();
    // @}

    /** Atomic updates are only allowed on associative reductions.
     *  We try to prove the associativity, but the user can override
     *  the associativity test and suppress compiler error if the prover
     *  fails to recognize the associativity or the user does not care. */
    // @{
    bool override_atomic_associativity_test() const;
    bool &override_atomic_associativity_test();
    // @}

    /** Pass an IRVisitor through to all Exprs referenced in the
     * Schedule. */
    void accept(IRVisitor *) const;

    /** Pass an IRMutator through to all Exprs referenced in the
     * Schedule. */
    void mutate(IRMutator *);
};

}  // namespace Internal
}  // namespace Halide

#endif

#include <map>

namespace Halide {

namespace Internal {
struct DefinitionContents;
struct FunctionContents;
class ReductionDomain;
}  // namespace Internal

namespace Internal {

class IRVisitor;
class IRMutator;
struct Specialization;

/** A Function definition which can either represent a init or an update
 * definition. A function may have different definitions due to specialization,
 * which are stored in 'specializations' (Not possible from the front-end, but
 * some scheduling directives may potentially cause this divergence to occur).
 * Although init definition may have multiple values (RHS) per specialization, it
 * must have the same LHS (i.e. same pure dimension variables). The update
 * definition, on the other hand, may have different LHS/RHS per specialization.
 * Note that, while the Expr in LHS/RHS may be different across specializations,
 * they must have the same number of dimensions and the same pure dimensions.
 */
class Definition {

    IntrusivePtr<DefinitionContents> contents;

public:
    /** Construct a Definition from an existing DefinitionContents pointer. Must be non-null */
    explicit Definition(const IntrusivePtr<DefinitionContents> &);

    /** Construct a Definition with the supplied args, values, and reduction domain. */
    Definition(const std::vector<Expr> &args, const std::vector<Expr> &values,
               const ReductionDomain &rdom, bool is_init);

    /** Construct a Definition with deserialized data. */
    Definition(bool is_init, const Expr &predicate, const std::vector<Expr> &args, const std::vector<Expr> &values,
               const StageSchedule &schedule, const std::vector<Specialization> &specializations);

    /** Construct an undefined Definition object. */
    Definition();

    /** Return a copy of this Definition. */
    Definition get_copy() const;

    /** Equality of identity */
    bool same_as(const Definition &other) const {
        return contents.same_as(other.contents);
    }

    /** Definition objects are nullable. Does this definition exist? */
    bool defined() const;

    /** Is this an init definition; otherwise it's an update definition */
    bool is_init() const;

    /** Pass an IRVisitor through to all Exprs referenced in the
     * definition. */
    void accept(IRVisitor *) const;

    /** Pass an IRMutator through to all Exprs referenced in the
     * definition. */
    void mutate(IRMutator *);

    /** Get the default (no-specialization) arguments (left-hand-side) of the definition.
     *
     * Warning: Any Vars in the Exprs are not qualified with the Func name, so
     * the Exprs may contain names which collide with names provided by
     * unique_name.
     */
    // @{
    const std::vector<Expr> &args() const;
    std::vector<Expr> &args();
    // @}

    /** Get the default (no-specialization) right-hand-side of the definition.
     *
     * Warning: Any Vars in the Exprs are not qualified with the Func name, so
     * the Exprs may contain names which collide with names provided by
     * unique_name.
     */
    // @{
    const std::vector<Expr> &values() const;
    std::vector<Expr> &values();
    // @}

    /** Get the predicate on the definition */
    // @{
    const Expr &predicate() const;
    Expr &predicate();
    // @}

    /** Split predicate into vector of ANDs. If there is no predicate (i.e. this
     * definition is always valid), this returns an empty vector. */
    std::vector<Expr> split_predicate() const;

    /** Get the default (no-specialization) stage-specific schedule associated
     * with this definition. */
    // @{
    const StageSchedule &schedule() const;
    StageSchedule &schedule();
    // @}

    /** You may create several specialized versions of a func with
     * different stage-specific schedules. They trigger when the condition is
     * true. See \ref Func::specialize */
    // @{
    const std::vector<Specialization> &specializations() const;
    std::vector<Specialization> &specializations();
    const Specialization &add_specialization(Expr condition);
    // @}
};

struct Specialization {
    Expr condition;
    Definition definition;
    std::string failure_message;  // If non-empty, this specialization always assert-fails with this message.
};

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_REDUCTION_H
#define HALIDE_REDUCTION_H

/** \file
 * Defines internal classes related to Reduction Domains
 */


namespace Halide {
namespace Internal {

class IRMutator;

/** A single named dimension of a reduction domain */
struct ReductionVariable {
    /**
     * A variable name for the reduction variable. This name must be a
     * valid Var name, i.e. it must not contain a <tt>.</tt> character.
     */
    std::string var;

    Expr min, extent;

    /** This lets you use a ReductionVariable as a key in a map of the form
     * map<ReductionVariable, Foo, ReductionVariable::Compare> */
    struct Compare {
        bool operator()(const ReductionVariable &a, const ReductionVariable &b) const {
            return a.var < b.var;
        }
    };
};

struct ReductionDomainContents;

/** A reference-counted handle on a reduction domain, which is just a
 * vector of ReductionVariable. */
class ReductionDomain {
    IntrusivePtr<ReductionDomainContents> contents;

public:
    /** This lets you use a ReductionDomain as a key in a map of the form
     * map<ReductionDomain, Foo, ReductionDomain::Compare> */
    struct Compare {
        bool operator()(const ReductionDomain &a, const ReductionDomain &b) const {
            internal_assert(a.contents.defined() && b.contents.defined());
            return a.contents < b.contents;
        }
    };

    /** Construct a new nullptr reduction domain */
    ReductionDomain()
        : contents(nullptr) {
    }

    /** Construct a reduction domain that spans the outer product of
     * all values of the given ReductionVariable in scanline order,
     * with the start of the vector being innermost, and the end of
     * the vector being outermost. */
    ReductionDomain(const std::vector<ReductionVariable> &domain);

    /** Construct a reduction domain from deserialization */
    ReductionDomain(const std::vector<ReductionVariable> &domain, const Expr &predicate, bool frozen);

    /** Return a deep copy of this ReductionDomain. */
    ReductionDomain deep_copy() const;

    /** Is this handle non-nullptr */
    bool defined() const {
        return contents.defined();
    }

    /** Tests for equality of reference. Only one reduction domain is
     * allowed per reduction function, and this is used to verify
     * that */
    bool same_as(const ReductionDomain &other) const {
        return contents.same_as(other.contents);
    }

    /** Immutable access to the reduction variables. */
    const std::vector<ReductionVariable> &domain() const;

    /** Add predicate to the reduction domain. See \ref RDom::where
     * for more details. */
    void where(Expr predicate);

    /** Return the predicate defined on this reduction domain. */
    Expr predicate() const;

    /** Set the predicate, replacing any previously set predicate. */
    void set_predicate(const Expr &);

    /** Split predicate into vector of ANDs. If there is no predicate (i.e. all
     * iteration domain in this reduction domain is valid), this returns an
     * empty vector. */
    std::vector<Expr> split_predicate() const;

    /** Mark RDom as frozen, which means it cannot accept new predicates. An
     * RDom is frozen once it is used in a Func's update definition. */
    void freeze();

    /** Check if a RDom has been frozen. If so, it is an error to add new
     * predicates. */
    bool frozen() const;

    /** Pass an IRVisitor through to all Exprs referenced in the
     * ReductionDomain. */
    void accept(IRVisitor *) const;

    /** Pass an IRMutator through to all Exprs referenced in the
     * ReductionDomain. */
    void mutate(IRMutator *);
};

void split_predicate_test();

}  // namespace Internal
}  // namespace Halide

#endif

namespace Halide {

struct ExternFuncArgument;
class Parameter;
class Tuple;
class Var;

/** An enum to specify calling convention for extern stages. */
enum class NameMangling {
    Default,    ///< Match whatever is specified in the Target
    C,          ///< No name mangling
    CPlusPlus,  ///< C++ name mangling
};

namespace Internal {

struct Call;

/** A reference-counted handle to Halide's internal representation of
 * a function. Similar to a front-end Func object, but with no
 * syntactic sugar to help with definitions. */
class Function {
    FunctionPtr contents;

public:
    /** This lets you use a Function as a key in a map of the form
     * map<Function, Foo, Function::Compare> */
    struct Compare {
        bool operator()(const Function &a, const Function &b) const {
            internal_assert(a.contents.defined() && b.contents.defined());
            return a.contents < b.contents;
        }
    };

    /** Construct a new function with no definitions and no name. This
     * constructor only exists so that you can make vectors of
     * functions, etc.
     */
    Function() = default;

    /** Construct a new function with the given name */
    explicit Function(const std::string &n);

    /** Construct a new function with the given name,
     * with a requirement that it can only represent Expr(s) of the given type(s),
     * and must have exactly the give nnumber of dimensions.
     * required_types.empty() means there are no constraints on the type(s).
     * required_dims == AnyDims means there are no constraints on the dimensions. */
    explicit Function(const std::vector<Type> &required_types, int required_dims, const std::string &n);

    /** Construct a Function from an existing FunctionContents pointer. Must be non-null */
    explicit Function(const FunctionPtr &);

    /** Update a function with deserialized data */
    void update_with_deserialization(const std::string &name,
                                     const std::string &origin_name,
                                     const std::vector<Halide::Type> &output_types,
                                     const std::vector<Halide::Type> &required_types,
                                     int required_dims,
                                     const std::vector<std::string> &args,
                                     const FuncSchedule &func_schedule,
                                     const Definition &init_def,
                                     const std::vector<Definition> &updates,
                                     const std::string &debug_file,
                                     const std::vector<Parameter> &output_buffers,
                                     const std::vector<ExternFuncArgument> &extern_arguments,
                                     const std::string &extern_function_name,
                                     NameMangling name_mangling,
                                     DeviceAPI device_api,
                                     const Expr &extern_proxy_expr,
                                     bool trace_loads,
                                     bool trace_stores,
                                     bool trace_realizations,
                                     const std::vector<std::string> &trace_tags,
                                     bool no_profiling,
                                     bool frozen);

    /** Get a handle on the halide function contents that this Function
     * represents. */
    FunctionPtr get_contents() const {
        return contents;
    }

    /** Deep copy this Function into 'copy'. It recursively deep copies all called
     * functions, schedules, update definitions, extern func arguments, specializations,
     * and reduction domains. This method does not deep-copy the Parameter objects.
     * This method also takes a map of <old Function, deep-copied version> as input
     * and would use the deep-copied Function from the map if exists instead of
     * creating a new deep-copy to avoid creating deep-copies of the same Function
     * multiple times. If 'name' is specified, copy's name will be set to that.
     */
    // @{
    void deep_copy(const FunctionPtr &copy, std::map<FunctionPtr, FunctionPtr> &copied_map) const;
    void deep_copy(std::string name, const FunctionPtr &copy,
                   std::map<FunctionPtr, FunctionPtr> &copied_map) const;
    // @}

    /** Add a pure definition to this function. It may not already
     * have a definition. All the free variables in 'value' must
     * appear in the args list. 'value' must not depend on any
     * reduction domain */
    void define(const std::vector<std::string> &args, std::vector<Expr> values);

    /** Add an update definition to this function. It must already have a pure
     * definition but not an update definition, and the length of args must
     * match the length of args used in the pure definition. 'value' may depend
     * on some reduction domain may contain variables from that domain as well
     * as pure variables. A reduction domain may also be introduced by passing
     * it as the last argument. Any pure variables must also appear as Variables
     * in the args array, and they must have the same name as the pure
     * definition's argument in the same index. */
    void define_update(const std::vector<Expr> &args, std::vector<Expr> values, const ReductionDomain &rdom = ReductionDomain{});

    /** Accept a visitor to visit all of the definitions and arguments
     * of this function. */
    void accept(IRVisitor *visitor) const;

    /** Accept a mutator to mutator all of the definitions and
     * arguments of this function. */
    void mutate(IRMutator *mutator);

    /** Get the name of the function. */
    const std::string &name() const;

    /** If this is a wrapper of another func, created by a chain of in
     * or clone_in calls, returns the name of the original
     * Func. Otherwise returns the name. */
    const std::string &origin_name() const;

    /** Get a mutable handle to the init definition. */
    Definition &definition();

    /** Get the init definition. */
    const Definition &definition() const;

    /** Get the pure arguments. */
    const std::vector<std::string> &args() const;

    /** Get the dimensionality. */
    int dimensions() const;

    /** Get the number of outputs. */
    int outputs() const;

    /** Get the types of the outputs. */
    const std::vector<Type> &output_types() const;

    /** Get the type constaints on the outputs (if any). */
    const std::vector<Type> &required_types() const;

    /** Get the dimensionality constaints on the outputs (if any). */
    int required_dimensions() const;

    /** Get the right-hand-side of the pure definition. Returns an
     * empty vector if there is no pure definition.
     *
     * Warning: Any Vars in the Exprs are not qualified with the Func name, so
     * the Exprs may contain names which collide with names provided by
     * unique_name.
     */
    const std::vector<Expr> &values() const;

    /** Does this function have a pure definition? */
    bool has_pure_definition() const;

    /** Does this function *only* have a pure definition? */
    bool is_pure() const {
        return (has_pure_definition() &&
                !has_update_definition() &&
                !has_extern_definition());
    }

    /** Is it legal to inline this function? */
    bool can_be_inlined() const;

    /** Get a handle to the function-specific schedule for the purpose
     * of modifying it. */
    FuncSchedule &schedule();

    /** Get a const handle to the function-specific schedule for inspecting it. */
    const FuncSchedule &schedule() const;

    /** Get a handle on the output buffer used for setting constraints
     * on it. */
    const std::vector<Parameter> &output_buffers() const;

    /** Get a mutable handle to the stage-specfic schedule for the update
     * stage. */
    StageSchedule &update_schedule(int idx = 0);

    /** Get a mutable handle to this function's update definition at
     * index 'idx'. */
    Definition &update(int idx = 0);

    /** Get a const reference to this function's update definition at
     * index 'idx'. */
    const Definition &update(int idx = 0) const;

    /** Get a const reference to this function's update definitions. */
    const std::vector<Definition> &updates() const;

    /** Does this function have an update definition? */
    bool has_update_definition() const;

    /** Check if the function has an extern definition. */
    bool has_extern_definition() const;

    /** Get the name mangling specified for the extern definition. */
    NameMangling extern_definition_name_mangling() const;

    /** Make a call node to the extern definition. An error if the
     * function has no extern definition. */
    Expr make_call_to_extern_definition(const std::vector<Expr> &args,
                                        const Target &t) const;

    /** Get the proxy Expr for the extern stage. This is an expression
     * known to have the same data access pattern as the extern
     * stage. It must touch at least all of the memory that the extern
     * stage does, though it is permissible for it to be conservative
     * and touch a superset. For most Functions, including those with
     * extern definitions, this will be an undefined Expr. */
    // @{
    Expr extern_definition_proxy_expr() const;
    Expr &extern_definition_proxy_expr();
    // @}

    /** Add an external definition of this Func. */
    void define_extern(const std::string &function_name,
                       const std::vector<ExternFuncArgument> &args,
                       const std::vector<Type> &types,
                       const std::vector<Var> &dims,
                       NameMangling mangling, DeviceAPI device_api);

    /** Retrive the arguments of the extern definition. */
    // @{
    const std::vector<ExternFuncArgument> &extern_arguments() const;
    std::vector<ExternFuncArgument> &extern_arguments();
    // @}

    /** Get the name of the extern function called for an extern
     * definition. */
    const std::string &extern_function_name() const;

    /** Get the DeviceAPI declared for an extern function. */
    DeviceAPI extern_function_device_api() const;

    /** Test for equality of identity. */
    bool same_as(const Function &other) const {
        return contents.same_as(other.contents);
    }

    /** Get a const handle to the debug filename. */
    const std::string &debug_file() const;

    /** Get a handle to the debug filename. */
    std::string &debug_file();

    /** Use an an extern argument to another function. */
    operator ExternFuncArgument() const;

    /** Tracing calls and accessors, passed down from the Func
     * equivalents. */
    // @{
    void trace_loads();
    void trace_stores();
    void trace_realizations();
    void add_trace_tag(const std::string &trace_tag);
    bool is_tracing_loads() const;
    bool is_tracing_stores() const;
    bool is_tracing_realizations() const;
    const std::vector<std::string> &get_trace_tags() const;
    // @}

    /** Replace this Function's LoopLevels with locked copies that
     * cannot be mutated further. */
    void lock_loop_levels();

    /** Mark the function as too small for meaningful profiling. */
    void do_not_profile();

    /** Check if the function is marked as one that should not be profiled. */
    bool should_not_profile() const;

    /** Mark function as frozen, which means it cannot accept new
     * definitions. */
    void freeze();

    /** Check if a function has been frozen. If so, it is an error to
     * add new definitions. */
    bool frozen() const;

    /** Make a new Function with the same lifetime as this one, and
     * return a strong reference to it. Useful to create Functions which
     * have circular references to this one - e.g. the wrappers
     * produced by Func::in. */
    Function new_function_in_same_group(const std::string &);

    /** Mark calls of this function by 'f' to be replaced with its wrapper
     * during the lowering stage. If the string 'f' is empty, it means replace
     * all calls to this function by all other functions (excluding itself) in
     * the pipeline with the wrapper. This will also freeze 'wrapper' to prevent
     * user from updating the values of the Function it wraps via the wrapper.
     * See \ref Func::in for more details. */
    // @{
    void add_wrapper(const std::string &f, Function &wrapper);
    const std::map<std::string, FunctionPtr> &wrappers() const;
    // @}

    /** Check if a Function is a trivial wrapper around another
     * Function, Buffer, or Parameter. Returns the Call node if it
     * is. Otherwise returns null.
     */
    const Call *is_wrapper() const;

    /** Replace every call to Functions in 'substitutions' keys by all Exprs
     * referenced in this Function to call to their substitute Functions (i.e.
     * the corresponding values in 'substitutions' map). */
    // @{
    Function &substitute_calls(const std::map<FunctionPtr, FunctionPtr> &substitutions);
    Function &substitute_calls(const Function &orig, const Function &substitute);
    // @}

    /** Return true iff the name matches one of the Function's pure args. */
    bool is_pure_arg(const std::string &name) const;

    /** If the Function has type requirements, check that the given argument
     * is compatible with them. If not, assert-fail. (If there are no type requirements, do nothing.) */
    void check_types(const Expr &e) const;
    void check_types(const Tuple &t) const;
    void check_types(const Type &t) const;
    void check_types(const std::vector<Expr> &exprs) const;
    void check_types(const std::vector<Type> &types) const;

    /** If the Function has dimension requirements, check that the given argument
     * is compatible with them. If not, assert-fail. (If there are no dimension requirements, do nothing.) */
    void check_dims(int dims) const;

    /** Define the output buffers. If the Function has types specified, this can be called at
     * any time. If not, it can only be called for a Function with a pure definition. */
    void create_output_buffers(const std::vector<Type> &types, int dims) const;
};

/** Deep copy an entire Function DAG. */
std::pair<std::vector<Function>, std::map<std::string, Function>> deep_copy(
    const std::vector<Function> &outputs,
    const std::map<std::string, Function> &env);

extern std::atomic<int> random_variable_counter;

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_MODULUS_REMAINDER_H
#define HALIDE_MODULUS_REMAINDER_H

/** \file
 * Routines for statically determining what expressions are divisible by.
 */

#include <cstdint>


namespace Halide {

struct Expr;

namespace Internal {

template<typename T>
class Scope;

/** The result of modulus_remainder analysis. These represent strided
 * subsets of the integers. A ModulusRemainder object m represents all
 * integers x such that there exists y such that x == m.modulus * y +
 * m.remainder. Note that under this definition a set containing a
 * single integer (a constant) is represented using a modulus of
 * zero. These sets can be combined with several mathematical
 * operators in the obvious way. E.g. m1 + m2 contains (at least) all
 * integers x1 + x2 such that x1 belongs to m1 and x2 belongs to
 * m2. These combinations are conservative. If some internal math
 * would overflow, it defaults to all of the integers (modulus == 1,
 * remainder == 0). */

struct ModulusRemainder {
    ModulusRemainder() = default;
    ModulusRemainder(int64_t m, int64_t r)
        : modulus(m), remainder(r) {
    }

    int64_t modulus = 1, remainder = 0;

    // Take a conservatively-large union of two sets. Contains all
    // elements from both sets, and maybe some more stuff.
    static ModulusRemainder unify(const ModulusRemainder &a, const ModulusRemainder &b);

    // Take a conservatively-large intersection. Everything in the
    // result is in at least one of the two sets, but not always both.
    static ModulusRemainder intersect(const ModulusRemainder &a, const ModulusRemainder &b);

    bool operator==(const ModulusRemainder &other) const {
        return (modulus == other.modulus) && (remainder == other.remainder);
    }
};

ModulusRemainder operator+(const ModulusRemainder &a, const ModulusRemainder &b);
ModulusRemainder operator-(const ModulusRemainder &a, const ModulusRemainder &b);
ModulusRemainder operator*(const ModulusRemainder &a, const ModulusRemainder &b);
ModulusRemainder operator/(const ModulusRemainder &a, const ModulusRemainder &b);
ModulusRemainder operator%(const ModulusRemainder &a, const ModulusRemainder &b);

ModulusRemainder operator+(const ModulusRemainder &a, int64_t b);
ModulusRemainder operator-(const ModulusRemainder &a, int64_t b);
ModulusRemainder operator*(const ModulusRemainder &a, int64_t b);
ModulusRemainder operator/(const ModulusRemainder &a, int64_t b);
ModulusRemainder operator%(const ModulusRemainder &a, int64_t b);

/** For things like alignment analysis, often it's helpful to know
 * if an integer expression is some multiple of a constant plus
 * some other constant. For example, it is straight-forward to
 * deduce that ((10*x + 2)*(6*y - 3) - 1) is congruent to five
 * modulo six.
 *
 * We get the most information when the modulus is large. E.g. if
 * something is congruent to 208 modulo 384, then we also know it's
 * congruent to 0 mod 8, and we can possibly use it as an index for an
 * aligned load. If all else fails, we can just say that an integer is
 * congruent to zero modulo one.
 */
ModulusRemainder modulus_remainder(const Expr &e);

/** If we have alignment information about external variables, we can
 * let the analysis know about that using this version of
 * modulus_remainder: */
ModulusRemainder modulus_remainder(const Expr &e, const Scope<ModulusRemainder> &scope);

/** Reduce an expression modulo some integer. Returns true and assigns
 * to remainder if an answer could be found. */
///@{
HALIDE_MUST_USE_RESULT bool reduce_expr_modulo(const Expr &e, int64_t modulus, int64_t *remainder);
HALIDE_MUST_USE_RESULT bool reduce_expr_modulo(const Expr &e, int64_t modulus, int64_t *remainder, const Scope<ModulusRemainder> &scope);
///@}

void modulus_remainder_test();

/** The greatest common divisor of two integers. Returns a positive result,
 * unless both args are INT64_MIN. */
int64_t gcd(int64_t, int64_t);

/** The least common multiple of two integers */
int64_t lcm(int64_t, int64_t);

}  // namespace Internal
}  // namespace Halide

#endif

namespace Halide {

template<typename T, int Dims>
class Buffer;
struct Target;

/** Enums specifying various kinds of outputs that can be produced from a Halide Pipeline. */
enum class OutputFileType {
    assembly,
    bitcode,
    c_header,
    c_source,
    compiler_log,
    cpp_stub,
    featurization,
    function_info_header,
    hlpipe,
    llvm_assembly,
    object,
    python_extension,
    pytorch_wrapper,
    registration,
    schedule,
    static_library,
    stmt,
    conceptual_stmt,
    stmt_html,
    conceptual_stmt_html,
    device_code,
};

/** Type of linkage a function in a lowered Halide module can have.
    Also controls whether auxiliary functions and metadata are generated. */
enum class LinkageType {
    External,              ///< Visible externally.
    ExternalPlusMetadata,  ///< Visible externally. Argument metadata and an argv wrapper are also generated.
    ExternalPlusArgv,      ///< Visible externally. Argv wrapper is generated but *not* argument metadata.
    Internal,              ///< Not visible externally, similar to 'static' linkage in C.
};

namespace Internal {

struct OutputInfo {
    std::string name, extension;

    // `is_multi` indicates how these outputs are generated
    // when using the compile_to_multitarget_xxx() APIs (or via the
    // Generator command-line mode):
    //
    // - If `is_multi` is true, then a separate file of this Output type is
    //   generated for each target in the multitarget (e.g. object files,
    //   assembly files, etc). Each of the files will have a suffix appended
    //   that is based on the specific subtarget.
    //
    // - If `is_multi` is false, then only one file of this Output type
    //   regardless of how many targets are in the multitarget. No additional
    //   suffix will be appended to the filename.
    //
    bool is_multi{false};
};
std::map<OutputFileType, const OutputInfo> get_output_info(const Target &target);

/** Definition of an argument to a LoweredFunc. This is similar to
 * Argument, except it enables passing extra information useful to
 * some targets to LoweredFunc. */
struct LoweredArgument : public Argument {
    /** For scalar arguments, the modulus and remainder of this
     * argument. */
    ModulusRemainder alignment;

    LoweredArgument() = default;
    explicit LoweredArgument(const Argument &arg)
        : Argument(arg) {
    }
    LoweredArgument(const std::string &_name, Kind _kind, const Type &_type, uint8_t _dimensions, const ArgumentEstimates &argument_estimates)
        : Argument(_name, _kind, _type, _dimensions, argument_estimates) {
    }
};

/** Definition of a lowered function. This object provides a concrete
 * mapping between parameters used in the function body and their
 * declarations in the argument list. */
struct LoweredFunc {
    std::string name;

    /** Arguments referred to in the body of this function. */
    std::vector<LoweredArgument> args;

    /** Body of this function. */
    Stmt body;

    /** The linkage of this function. */
    LinkageType linkage;

    /** The name-mangling choice for the function. Defaults to using
     * the Target. */
    NameMangling name_mangling;

    LoweredFunc(const std::string &name,
                const std::vector<LoweredArgument> &args,
                Stmt body,
                LinkageType linkage,
                NameMangling mangling = NameMangling::Default);
    LoweredFunc(const std::string &name,
                const std::vector<Argument> &args,
                Stmt body,
                LinkageType linkage,
                NameMangling mangling = NameMangling::Default);
};

}  // namespace Internal

namespace Internal {
struct ModuleContents;
class CompilerLogger;
}  // namespace Internal

struct AutoSchedulerResults;

using MetadataNameMap = std::map<std::string, std::string>;

/** A halide module. This represents IR containing lowered function
 * definitions and buffers. */
class Module {
    Internal::IntrusivePtr<Internal::ModuleContents> contents;

public:
    Module(const std::string &name, const Target &target, const MetadataNameMap &metadata_name_map = {});

    /** Get the target this module has been lowered for. */
    const Target &target() const;

    /** The name of this module. This is used as the default filename
     * for output operations. */
    const std::string &name() const;

    /** If this Module had an auto-generated schedule, return a read-only pointer
     * to the AutoSchedulerResults. If not, return nullptr. */
    const AutoSchedulerResults *get_auto_scheduler_results() const;

    /** Return whether this module uses strict floating-point anywhere. */
    bool any_strict_float() const;

    /** The declarations contained in this module. */
    // @{
    const std::vector<Buffer<void>> &buffers() const;
    const std::vector<Internal::LoweredFunc> &functions() const;
    std::vector<Internal::LoweredFunc> &functions();
    const std::vector<Module> &submodules() const;
    // @}

    /** Tries to locate the offloaded CUDA PTX assembly contained in this Module.
     * Might return a nullptr in case such buffer is not present in this Module.
     */
    Buffer<> get_cuda_ptx_assembly_buffer() const;

    /**
     * Tries to locate the offloaded (GPU) Device assembly contained in this Module.
     * This can be any of the GPU kernel sources, etc...
     */
    Buffer<> get_device_code_buffer() const;

    /** Return the function with the given name. If no such function
     * exists in this module, assert. */
    Internal::LoweredFunc get_function_by_name(const std::string &name) const;

    /** Add a declaration to this module. */
    // @{
    void append(const Buffer<void> &buffer);
    void append(const Internal::LoweredFunc &function);
    void append(const Module &module);
    // @}

    /** Compile a halide Module to variety of outputs, depending on
     * the fields set in output_files. */
    void compile(const std::map<OutputFileType, std::string> &output_files) const;

    /** Compile a halide Module to in-memory object code. Currently
     * only supports LLVM based compilation, but should be extended to
     * handle source code backends. */
    Buffer<uint8_t> compile_to_buffer() const;

    /** Return a new module with all submodules compiled to buffers on
     * on the result Module. */
    Module resolve_submodules() const;

    /** When generating metadata from this module, remap any occurrences
     * of 'from' into 'to'. */
    void remap_metadata_name(const std::string &from, const std::string &to) const;

    /** Retrieve the metadata name map. */
    MetadataNameMap get_metadata_name_map() const;

    /** Set the AutoSchedulerResults for the Module. It is an error to call this
     * multiple times for a given Module. */
    void set_auto_scheduler_results(const AutoSchedulerResults &results);

    /** Set whether this module uses strict floating-point directives anywhere. */
    void set_any_strict_float(bool any_strict_float);

    /** Remember the Stmt during lowing before device-specific offloading. */
    void set_conceptual_code_stmt(const Internal::Stmt &stmt);

    /** Get the remembered conceptual Stmt, remembered before device-specific offloading. */
    const Internal::Stmt &get_conceptual_stmt() const;
};

/** Link a set of modules together into one module. */
Module link_modules(const std::string &name, const std::vector<Module> &modules);

/** Create an object file containing the Halide runtime for a given target. For
 * use with Target::NoRuntime. Standalone runtimes are only compatible with
 * pipelines compiled by the same build of Halide used to call this function. */
void compile_standalone_runtime(const std::string &object_filename, const Target &t);

/** Create an object and/or static library file containing the Halide runtime
 * for a given target. For use with Target::NoRuntime. Standalone runtimes are
 * only compatible with pipelines compiled by the same build of Halide used to
 * call this function. Return a map with just the actual outputs filled in
 * (typically, OutputFileType::object and/or OutputFileType::static_library).
 */
std::map<OutputFileType, std::string> compile_standalone_runtime(const std::map<OutputFileType, std::string> &output_files, const Target &t);

using ModuleFactory = std::function<Module(const std::string &fn_name, const Target &target)>;
using CompilerLoggerFactory = std::function<std::unique_ptr<Internal::CompilerLogger>(const std::string &fn_name, const Target &target)>;

void compile_multitarget(const std::string &fn_name,
                         const std::map<OutputFileType, std::string> &output_files,
                         const std::vector<Target> &targets,
                         const std::vector<std::string> &suffixes,
                         const ModuleFactory &module_factory,
                         const CompilerLoggerFactory &compiler_logger_factory = nullptr);

}  // namespace Halide

#endif
#ifndef HALIDE_PARAM_H
#define HALIDE_PARAM_H

#include <type_traits>

#ifndef HALIDE_EXTERNFUNCARGUMENT_H
#define HALIDE_EXTERNFUNCARGUMENT_H

/** \file
 * Defines the internal representation of a halide ExternFuncArgument
 */


namespace Halide {

/** An argument to an extern-defined Func. May be a Function, Buffer,
 * ImageParam or Expr. */
struct ExternFuncArgument {
    enum ArgType { UndefinedArg = 0,
                   FuncArg,
                   BufferArg,
                   ExprArg,
                   ImageParamArg };
    ArgType arg_type = UndefinedArg;
    Internal::FunctionPtr func;
    Buffer<> buffer;
    Expr expr;
    Parameter image_param;

    ExternFuncArgument(Internal::FunctionPtr f)
        : arg_type(FuncArg), func(std::move(f)) {
    }

    template<typename T, int Dims>
    ExternFuncArgument(const Buffer<T, Dims> &b)
        : arg_type(BufferArg), buffer(b) {
    }
    ExternFuncArgument(Expr e)
        : arg_type(ExprArg), expr(std::move(e)) {
    }
    ExternFuncArgument(int e)
        : arg_type(ExprArg), expr(e) {
    }
    ExternFuncArgument(float e)
        : arg_type(ExprArg), expr(e) {
    }

    ExternFuncArgument(const Parameter &p)
        : arg_type(ImageParamArg), image_param(p) {
        // Scalar params come in via the Expr constructor.
        internal_assert(p.is_buffer());
    }
    ExternFuncArgument() = default;

    bool is_func() const {
        return arg_type == FuncArg;
    }
    bool is_expr() const {
        return arg_type == ExprArg;
    }
    bool is_buffer() const {
        return arg_type == BufferArg;
    }
    bool is_image_param() const {
        return arg_type == ImageParamArg;
    }
    bool defined() const {
        return arg_type != UndefinedArg;
    }
};

}  // namespace Halide

#endif  // HALIDE_EXTERNFUNCARGUMENT_H
#ifndef HALIDE_IR_H
#define HALIDE_IR_H

/** \file
 * Subtypes for Halide expressions (\ref Halide::Expr) and statements (\ref Halide::Internal::Stmt)
 */

#include <string>
#include <vector>


namespace Halide {
namespace Internal {

class Function;

// The lambda-based IRVisitors and IRMutators use this helper to dispatch
// to multiple lambdas via function overloading (of `operator()`).
template<typename... Ts>
struct LambdaOverloads : Ts... {
    using Ts::operator()...;
    explicit LambdaOverloads(Ts... ts)
        : Ts(std::move(ts))... {
    }
};

/** The actual IR nodes begin here. Remember that all the Expr
 * nodes also have a public "type" property */

/** Cast a node from one type to another. Can't change vector widths. */
struct Cast : public ExprNode<Cast> {
    Expr value;

    static Expr make(Type t, Expr v);

    static const IRNodeType _node_type = IRNodeType::Cast;

    /** Check if the cast is equivalent to a reinterpret. */
    bool is_reinterpret() const {
        return (type.is_int_or_uint() &&
                value.type().is_int_or_uint() &&
                type.bits() == value.type().bits());
    }
};

/** Reinterpret value as another type, without affecting any of the bits
 * (on little-endian systems). */
struct Reinterpret : public ExprNode<Reinterpret> {
    Expr value;

    static Expr make(Type t, Expr v);

    static const IRNodeType _node_type = IRNodeType::Reinterpret;
};

/** The sum of two expressions */
struct Add : public ExprNode<Add> {
    Expr a, b;

    static Expr make(Expr a, Expr b);

    static const IRNodeType _node_type = IRNodeType::Add;
};

/** The difference of two expressions */
struct Sub : public ExprNode<Sub> {
    Expr a, b;

    static Expr make(Expr a, Expr b);

    static const IRNodeType _node_type = IRNodeType::Sub;
};

/** The product of two expressions */
struct Mul : public ExprNode<Mul> {
    Expr a, b;

    static Expr make(Expr a, Expr b);

    static const IRNodeType _node_type = IRNodeType::Mul;
};

/** The ratio of two expressions */
struct Div : public ExprNode<Div> {
    Expr a, b;

    static Expr make(Expr a, Expr b);

    static const IRNodeType _node_type = IRNodeType::Div;
};

/** The remainder of a / b. Mostly equivalent to '%' in C, except that
 * the result here is always positive. For floats, this is equivalent
 * to calling fmod. */
struct Mod : public ExprNode<Mod> {
    Expr a, b;

    static Expr make(Expr a, Expr b);

    static const IRNodeType _node_type = IRNodeType::Mod;
};

/** The lesser of two values. */
struct Min : public ExprNode<Min> {
    Expr a, b;

    static Expr make(Expr a, Expr b);

    static const IRNodeType _node_type = IRNodeType::Min;
};

/** The greater of two values */
struct Max : public ExprNode<Max> {
    Expr a, b;

    static Expr make(Expr a, Expr b);

    static const IRNodeType _node_type = IRNodeType::Max;
};

/** Is the first expression equal to the second */
struct EQ : public ExprNode<EQ> {
    Expr a, b;

    static Expr make(Expr a, Expr b);

    static const IRNodeType _node_type = IRNodeType::EQ;
};

/** Is the first expression not equal to the second */
struct NE : public ExprNode<NE> {
    Expr a, b;

    static Expr make(Expr a, Expr b);

    static const IRNodeType _node_type = IRNodeType::NE;
};

/** Is the first expression less than the second. */
struct LT : public ExprNode<LT> {
    Expr a, b;

    static Expr make(Expr a, Expr b);

    static const IRNodeType _node_type = IRNodeType::LT;
};

/** Is the first expression less than or equal to the second. */
struct LE : public ExprNode<LE> {
    Expr a, b;

    static Expr make(Expr a, Expr b);

    static const IRNodeType _node_type = IRNodeType::LE;
};

/** Is the first expression greater than the second. */
struct GT : public ExprNode<GT> {
    Expr a, b;

    static Expr make(Expr a, Expr b);

    static const IRNodeType _node_type = IRNodeType::GT;
};

/** Is the first expression greater than or equal to the second. */
struct GE : public ExprNode<GE> {
    Expr a, b;

    static Expr make(Expr a, Expr b);

    static const IRNodeType _node_type = IRNodeType::GE;
};

/** Logical and - are both expressions true */
struct And : public ExprNode<And> {
    Expr a, b;

    static Expr make(Expr a, Expr b);

    static const IRNodeType _node_type = IRNodeType::And;
};

/** Logical or - is at least one of the expression true */
struct Or : public ExprNode<Or> {
    Expr a, b;

    static Expr make(Expr a, Expr b);

    static const IRNodeType _node_type = IRNodeType::Or;
};

/** Logical not - true if the expression false */
struct Not : public ExprNode<Not> {
    Expr a;

    static Expr make(Expr a);

    static const IRNodeType _node_type = IRNodeType::Not;
};

/** A ternary operator. Evalutes 'true_value' and 'false_value',
 * then selects between them based on 'condition'. Equivalent to
 * the ternary operator in C. */
struct Select : public ExprNode<Select> {
    Expr condition, true_value, false_value;

    static Expr make(Expr condition, Expr true_value, Expr false_value);

    static const IRNodeType _node_type = IRNodeType::Select;
};

/** Load a value from a named symbol if predicate is true. The buffer
 * is treated as an array of the 'type' of this Load node. That is,
 * the buffer has no inherent type. The name may be the name of an
 * enclosing allocation, an input or output buffer, or any other
 * symbol of type Handle(). */
struct Load : public ExprNode<Load> {
    std::string name;

    Expr predicate, index;

    // If it's a load from an image argument or compiled-in constant
    // image, this will point to that
    Buffer<> image;

    // If it's a load from an image parameter, this points to that
    Parameter param;

    // The alignment of the index. If the index is a vector, this is
    // the alignment of the first lane.
    ModulusRemainder alignment;

    static Expr make(Type type, const std::string &name,
                     Expr index, Buffer<> image,
                     Parameter param,
                     Expr predicate,
                     ModulusRemainder alignment);

    static const IRNodeType _node_type = IRNodeType::Load;
};

/** A linear ramp vector node. This is vector with 'lanes' elements,
 * where element i is 'base' + i*'stride'. This is a convenient way to
 * pass around vectors without busting them up into individual
 * elements. E.g. a dense vector load from a buffer can use a ramp
 * node with stride 1 as the index. */
struct Ramp : public ExprNode<Ramp> {
    Expr base, stride;
    int lanes;

    static Expr make(Expr base, Expr stride, int lanes);

    static const IRNodeType _node_type = IRNodeType::Ramp;
};

/** A vector with 'lanes' elements, in which every element is
 * 'value'. This is a special case of the ramp node above, in which
 * the stride is zero. */
struct Broadcast : public ExprNode<Broadcast> {
    Expr value;
    int lanes;

    static Expr make(Expr value, int lanes);

    static const IRNodeType _node_type = IRNodeType::Broadcast;
};

/** A let expression, like you might find in a functional
 * language. Within the expression \ref Let::body, instances of the Var
 * node \ref Let::name refer to \ref Let::value. */
struct Let : public ExprNode<Let> {
    std::string name;
    Expr value, body;

    static Expr make(const std::string &name, Expr value, Expr body);

    static const IRNodeType _node_type = IRNodeType::Let;
};

/** The statement form of a let node. Within the statement 'body',
 * instances of the Var named 'name' refer to 'value' */
struct LetStmt : public StmtNode<LetStmt> {
    std::string name;
    Expr value;
    Stmt body;

    static Stmt make(const std::string &name, Expr value, Stmt body);

    static const IRNodeType _node_type = IRNodeType::LetStmt;
};

/** If the 'condition' is false, then evaluate and return the message,
 * which should be a call to an error function. */
struct AssertStmt : public StmtNode<AssertStmt> {
    // if condition then val else error out with message
    Expr condition;
    Expr message;

    static Stmt make(Expr condition, Expr message);

    static const IRNodeType _node_type = IRNodeType::AssertStmt;
};

/** This node is a helpful annotation to do with permissions. If 'is_produce' is
 * set to true, this represents a producer node which may also contain updates;
 * otherwise, this represents a consumer node. If the producer node contains
 * updates, the body of the node will be a block of 'produce' and 'update'
 * in that order. In a producer node, the access is read-write only (or write
 * only if it doesn't have updates). In a consumer node, the access is read-only.
 * None of this is actually enforced, the node is purely for informative purposes
 * to help out our analysis during lowering. For every unique ProducerConsumer,
 * there is an associated Realize node with the same name that creates the buffer
 * being read from or written to in the body of the ProducerConsumer.
 */
struct ProducerConsumer : public StmtNode<ProducerConsumer> {
    std::string name;
    bool is_producer;
    Stmt body;

    static Stmt make(const std::string &name, bool is_producer, Stmt body);

    static Stmt make_produce(const std::string &name, Stmt body);
    static Stmt make_consume(const std::string &name, Stmt body);

    static const IRNodeType _node_type = IRNodeType::ProducerConsumer;
};

/** Store a 'value' to the buffer called 'name' at a given 'index' if
 * 'predicate' is true. The buffer is interpreted as an array of the
 * same type as 'value'. The name may be the name of an enclosing
 * Allocate node, an output buffer, or any other symbol of type
 * Handle(). */
struct Store : public StmtNode<Store> {
    std::string name;
    Expr predicate, value, index;
    // If it's a store to an output buffer, then this parameter points to it.
    Parameter param;

    // The alignment of the index. If the index is a vector, this is
    // the alignment of the first lane.
    ModulusRemainder alignment;

    static Stmt make(const std::string &name, Expr value, Expr index,
                     Parameter param, Expr predicate, ModulusRemainder alignment);

    static const IRNodeType _node_type = IRNodeType::Store;
};

/** This defines the value of a function at a multi-dimensional
 * location. You should think of it as a store to a multi-dimensional
 * array. It gets lowered to a conventional Store node. The name must
 * correspond to an output buffer or the name of an enclosing Realize
 * node. */
struct Provide : public StmtNode<Provide> {
    std::string name;
    std::vector<Expr> values;
    std::vector<Expr> args;
    Expr predicate;

    static Stmt make(const std::string &name, const std::vector<Expr> &values, const std::vector<Expr> &args, const Expr &predicate);

    static const IRNodeType _node_type = IRNodeType::Provide;
};

/** Allocate a scratch area called with the given name, type, and
 * size. The buffer lives for at most the duration of the body
 * statement, within which it may or may not be freed explicitly with
 * a Free node with a matching name. Allocation only occurs if the
 * condition evaluates to true. Within the body of the allocation,
 * defines a symbol with the given name and the type Handle(). */
struct Allocate : public StmtNode<Allocate> {
    std::string name;
    Type type;
    MemoryType memory_type;
    std::vector<Expr> extents;

    // A boolean condition that determines if the allocation needs to be made at all.
    Expr condition;

    // These override the code generator dependent malloc and free
    // equivalents if provided. If the new_expr succeeds, that is it
    // returns non-nullptr, the function named be free_function is
    // guaranteed to be called. The free function signature must match
    // that of the code generator dependent free (typically
    // halide_free). If free_function is left empty, code generator
    // default will be called.
    Expr new_expr;
    std::string free_function;

    // Extra padding elements to allow for overreads. Elements in the padding
    // have undetermined values, but are guaranteed safe to load.
    int padding;

    Stmt body;

    static Stmt make(const std::string &name, Type type, MemoryType memory_type,
                     const std::vector<Expr> &extents,
                     Expr condition, Stmt body,
                     Expr new_expr = Expr(), const std::string &free_function = std::string(), int padding = 0);

    /** A routine to check if the extents are all constants, and if so verify
     * the total size is less than 2^31 - 1. If the result is constant, but
     * overflows, this routine asserts. This returns 0 if the extents are
     * not all constants; otherwise, it returns the total constant allocation
     * size. Does not include any padding bytes. */
    static int32_t constant_allocation_size(const std::vector<Expr> &extents, const std::string &name);
    int32_t constant_allocation_size() const;

    static const IRNodeType _node_type = IRNodeType::Allocate;
};

/** Free the resources associated with the given buffer. */
struct Free : public StmtNode<Free> {
    std::string name;

    static Stmt make(const std::string &name);

    static const IRNodeType _node_type = IRNodeType::Free;
};

/** Allocate a multi-dimensional buffer of the given type and
 * size. Create some scratch memory that will back the function 'name'
 * over the range specified in 'bounds'. The bounds are a vector of
 * (min, extent) pairs for each dimension. Allocation only occurs if
 * the condition evaluates to true.
 */
struct Realize : public StmtNode<Realize> {
    std::string name;
    std::vector<Type> types;
    MemoryType memory_type;
    Region bounds;
    Expr condition;
    Stmt body;

    static Stmt make(const std::string &name, const std::vector<Type> &types, MemoryType memory_type, const Region &bounds, Expr condition, Stmt body);

    static const IRNodeType _node_type = IRNodeType::Realize;
};

/** A sequence of statements to be executed in-order. 'first' is never
    a Block, so this can be treated as a linked list. */
struct Block : public StmtNode<Block> {
    Stmt first, rest;

    static Stmt make(Stmt first, Stmt rest);

    /** Construct zero or more Blocks to invoke a list of statements in order.
     * This method may not return a Block statement if stmts.size() <= 1. */
    static Stmt make(const std::vector<Stmt> &stmts);

    static const IRNodeType _node_type = IRNodeType::Block;
};

/** A pair of statements executed concurrently. Both statements are
 * joined before the Stmt ends. This is the parallel equivalent to
 * Block. */
struct Fork : public StmtNode<Fork> {
    Stmt first, rest;

    static Stmt make(Stmt first, Stmt rest);

    static const IRNodeType _node_type = IRNodeType::Fork;
};

/** An if-then-else block. 'else' may be undefined. */
struct IfThenElse : public StmtNode<IfThenElse> {
    Expr condition;
    Stmt then_case, else_case;

    static Stmt make(Expr condition, Stmt then_case, Stmt else_case = Stmt());

    static const IRNodeType _node_type = IRNodeType::IfThenElse;
};

/** Evaluate and discard an expression, presumably because it has some side-effect. */
struct Evaluate : public StmtNode<Evaluate> {
    Expr value;

    static Stmt make(Expr v);

    static const IRNodeType _node_type = IRNodeType::Evaluate;
};

/** A function call. This can represent a call to some extern function
 * (like sin), but it's also our multi-dimensional version of a Load,
 * so it can be a load from an input image, or a call to another
 * halide function. These two types of call nodes don't survive all
 * the way down to code generation - the lowering process converts
 * them to Load nodes. */
struct Call : public ExprNode<Call> {
    std::string name;
    std::vector<Expr> args;
    typedef enum { Image,            ///< A load from an input image
                   Extern,           ///< A call to an external C-ABI function, possibly with side-effects
                   ExternCPlusPlus,  ///< A call to an external C-ABI function, possibly with side-effects
                   PureExtern,       ///< A call to a guaranteed-side-effect-free external function
                   Halide,           ///< A call to a Func
                   Intrinsic,        ///< A possibly-side-effecty compiler intrinsic, which has special handling during codegen
                   PureIntrinsic     ///< A side-effect-free version of the above.
    } CallType;
    CallType call_type;

    // Halide uses calls internally to represent certain operations
    // (instead of IR nodes). These are matched by name. Note that
    // these are deliberately char* (rather than std::string) so that
    // they can be referenced at static-initialization time without
    // risking ambiguous initalization order; we use a typedef to simplify
    // declaration.
    typedef const char *const ConstString;

    // enums for various well-known intrinsics. (It is not *required* that all
    // intrinsics have an enum entry here, but as a matter of style, it is recommended.)
    // Note that these are only used in the API; inside the node, they are translated
    // into a name. (To recover the name, call get_intrinsic_name().)
    //
    // Please keep this list sorted alphabetically; the specific enum values
    // are *not* guaranteed to be stable across time.
    enum IntrinsicOp {
        abs,
        absd,
        add_image_checks_marker,
        alloca,
        bitwise_and,
        bitwise_not,
        bitwise_or,
        bitwise_xor,
        bool_to_mask,

        // Bundle multiple exprs together temporarily for analysis (e.g. CSE)
        bundle,
        call_cached_indirect_function,
        cast_mask,

        // Concatenate bits of the args, with least significant bits as the
        // first arg (i.e. little-endian)
        concat_bits,
        count_leading_zeros,
        count_trailing_zeros,
        debug_to_file,
        declare_box_touched,
        div_round_to_zero,
        dynamic_shuffle,

        // Extract some contiguous slice of bits from the argument starting at
        // the nth bit, counting from the least significant bit, with the number
        // of bits determined by the return type.
        extract_bits,
        extract_mask_element,
        get_user_context,
        gpu_thread_barrier,
        halving_add,
        halving_sub,
        hvx_gather,
        hvx_scatter,
        hvx_scatter_acc,
        hvx_scatter_release,
        if_then_else,
        if_then_else_mask,
        image_load,
        image_store,
        lerp,
        likely,
        likely_if_innermost,
        load_typed_struct_member,
        make_struct,
        memoize_expr,
        mod_round_to_zero,
        mul_shift_right,
        mux,
        popcount,
        prefetch,
        profiling_enable_instance_marker,
        promise_clamped,
        random,
        register_destructor,
        require,
        require_mask,
        return_second,
        rewrite_buffer,

        // Round a floating point value to nearest integer, with ties going to even
        round,

        rounding_halving_add,
        rounding_mul_shift_right,
        rounding_shift_left,
        rounding_shift_right,
        saturating_add,
        saturating_sub,
        saturating_cast,
        scatter_gather,
        select_mask,
        shift_left,
        shift_right,
        signed_integer_overflow,
        size_of_halide_buffer_t,

        // Marks the point in lowering where the outermost skip stages checks
        // should be introduced.
        skip_stages_marker,

        // Takes a realization name and a loop variable. Declares that values of
        // the realization that were stored on earlier loop iterations of the
        // given loop are potentially loaded in this loop iteration somewhere
        // after this point. Must occur inside a Realize node and For node of
        // the given names but outside any corresponding ProducerConsumer
        // nodes. Communicates to storage folding that sliding window took
        // place.
        sliding_window_marker,

        // Compute (arg[0] + arg[1]) / 2, assuming arg[0] < arg[1].
        sorted_avg,

        // strict floating point ops. These are floating point ops that we would
        // like to optimize around (or let llvm optimize around) by treating
        // them as reals and ignoring the existence of nan and inf. Using these
        // intrinsics instead prevents any such optimizations.
        strict_add,
        strict_div,
        strict_eq,
        strict_le,
        strict_lt,
        strict_max,
        strict_min,
        strict_mul,
        strict_sub,

        // Convert a list of Exprs to a string
        stringify,

        // Query properties of the compiled-for target (resolved at compile-time)
        target_arch_is,
        target_bits,
        target_has_feature,
        target_natural_vector_size,
        target_os_is,

        undef,
        unreachable,
        unsafe_promise_clamped,

        // One-sided variants of widening_add, widening_mul, and widening_sub.
        // arg[0] + widen(arg[1])
        widen_right_add,
        // arg[0] * widen(arg[1])
        widen_right_mul,
        // arg[0] - widen(arg[1])
        widen_right_sub,

        widening_add,
        widening_mul,
        widening_shift_left,
        widening_shift_right,
        widening_sub,

        get_runtime_vscale,

        IntrinsicOpCount  // Sentinel: keep last.
    };

    static const char *get_intrinsic_name(IntrinsicOp op);

    // We also declare some symbolic names for some of the runtime
    // functions that we want to construct Call nodes to here to avoid
    // magic string constants and the potential risk of typos.
    HALIDE_EXPORT static ConstString
        buffer_get_dimensions,
        buffer_get_min,
        buffer_get_extent,
        buffer_get_stride,
        buffer_get_max,
        buffer_get_host,
        buffer_get_device,
        buffer_get_device_interface,
        buffer_get_shape,
        buffer_get_host_dirty,
        buffer_get_device_dirty,
        buffer_get_type,
        buffer_set_host_dirty,
        buffer_set_device_dirty,
        buffer_is_bounds_query,
        buffer_init,
        buffer_init_from_buffer,
        buffer_crop,
        buffer_set_bounds,
        trace;

    // If it's a call to another halide function, this call node holds
    // a possibly-weak reference to that function.
    FunctionPtr func;

    // If that function has multiple values, which value does this
    // call node refer to?
    int value_index;

    // If it's a call to an image, this call nodes hold a
    // pointer to that image's buffer
    Buffer<> image;

    // If it's a call to an image parameter, this call node holds a
    // pointer to that
    Parameter param;

    static Expr make(Type type, IntrinsicOp op, const std::vector<Expr> &args, CallType call_type,
                     FunctionPtr func = FunctionPtr(), int value_index = 0,
                     const Buffer<> &image = Buffer<>(), Parameter param = Parameter());

    static Expr make(Type type, const std::string &name, const std::vector<Expr> &args, CallType call_type,
                     FunctionPtr func = FunctionPtr(), int value_index = 0,
                     Buffer<> image = Buffer<>(), Parameter param = Parameter());

    /** Convenience constructor for calls to other halide functions */
    static Expr make(const Function &func, const std::vector<Expr> &args, int idx = 0);

    /** Convenience constructor for loads from concrete images */
    static Expr make(const Buffer<> &image, const std::vector<Expr> &args) {
        return make(image.type(), image.name(), args, Image, FunctionPtr(), 0, image, Parameter());
    }

    /** Convenience constructor for loads from images parameters */
    static Expr make(const Parameter &param, const std::vector<Expr> &args) {
        return make(param.type(), param.name(), args, Image, FunctionPtr(), 0, Buffer<>(), param);
    }

    /** Check if a call node is pure within a pipeline, meaning that
     * the same args always give the same result, and the calls can be
     * reordered, duplicated, unified, etc without changing the
     * meaning of anything. Not transitive - doesn't guarantee the
     * args themselves are pure. An example of a pure Call node is
     * sqrt. If in doubt, don't mark a Call node as pure. */
    bool is_pure() const {
        return (call_type == PureExtern ||
                call_type == Image ||
                call_type == PureIntrinsic);
    }

    bool is_intrinsic() const {
        return (call_type == Intrinsic ||
                call_type == PureIntrinsic);
    }

    bool is_intrinsic(IntrinsicOp op) const {
        return is_intrinsic() && this->name == get_intrinsic_name(op);
    }

    bool is_intrinsic(std::initializer_list<IntrinsicOp> intrinsics) const {
        for (IntrinsicOp i : intrinsics) {
            if (is_intrinsic(i)) {
                return true;
            }
        }
        return false;
    }

    bool is_tag() const {
        return is_intrinsic({Call::likely, Call::likely_if_innermost});
    }

    /** Returns a pointer to a call node if the expression is a call to
     * one of the requested intrinsics. */
    static const Call *as_intrinsic(const Expr &e, std::initializer_list<IntrinsicOp> intrinsics) {
        if (const Call *c = e.as<Call>()) {
            for (IntrinsicOp i : intrinsics) {
                if (c->is_intrinsic(i)) {
                    return c;
                }
            }
        }
        return nullptr;
    }

    static const Call *as_tag(const Expr &e) {
        return as_intrinsic(e, {Call::likely, Call::likely_if_innermost});
    }

    bool is_extern() const {
        return (call_type == Extern ||
                call_type == ExternCPlusPlus ||
                call_type == PureExtern);
    }

    bool is_strict_float_intrinsic() const {
        return is_intrinsic(
            {Call::strict_add,
             Call::strict_div,
             Call::strict_max,
             Call::strict_min,
             Call::strict_mul,
             Call::strict_sub,
             Call::strict_lt,
             Call::strict_le,
             Call::strict_eq});
    }

    static const IRNodeType _node_type = IRNodeType::Call;
};

/** A named variable. Might be a loop variable, function argument,
 * parameter, reduction variable, or something defined by a Let or
 * LetStmt node. */
struct Variable : public ExprNode<Variable> {
    std::string name;

    /** References to scalar parameters, or to the dimensions of buffer
     * parameters hang onto those expressions. */
    Parameter param;

    /** References to properties of literal image parameters. */
    Buffer<> image;

    /** Reduction variables hang onto their domains */
    ReductionDomain reduction_domain;

    static Expr make(Type type, const std::string &name) {
        return make(type, name, Buffer<>(), Parameter(), ReductionDomain());
    }

    static Expr make(Type type, const std::string &name, Parameter param) {
        return make(type, name, Buffer<>(), std::move(param), ReductionDomain());
    }

    static Expr make(Type type, const std::string &name, const Buffer<> &image) {
        return make(type, name, image, Parameter(), ReductionDomain());
    }

    static Expr make(Type type, const std::string &name, ReductionDomain reduction_domain) {
        return make(type, name, Buffer<>(), Parameter(), std::move(reduction_domain));
    }

    static Expr make(Type type, const std::string &name, Buffer<> image,
                     Parameter param, ReductionDomain reduction_domain);

    static const IRNodeType _node_type = IRNodeType::Variable;
};

/** A for loop. Execute the 'body' statement for all values of the
 * variable 'name' from 'min' to 'min + extent'. There are four
 * types of For nodes. A 'Serial' for loop is a conventional
 * one. In a 'Parallel' for loop, each iteration of the loop
 * happens in parallel or in some unspecified order. In a
 * 'Vectorized' for loop, each iteration maps to one SIMD lane,
 * and the whole loop is executed in one shot. For this case,
 * 'extent' must be some small integer constant (probably 4, 8, or
 * 16). An 'Unrolled' for loop compiles to a completely unrolled
 * version of the loop. Each iteration becomes its own
 * statement. Again in this case, 'extent' should be a small
 * integer constant. */
struct For : public StmtNode<For> {
    std::string name;
    Expr min, extent;
    ForType for_type;
    DeviceAPI device_api;
    Stmt body;
    Partition partition_policy;

    static Stmt make(const std::string &name,
                     Expr min, Expr extent,
                     ForType for_type, Partition partition_policy,
                     DeviceAPI device_api,
                     Stmt body);

    bool is_unordered_parallel() const {
        return Halide::Internal::is_unordered_parallel(for_type);
    }
    bool is_parallel() const {
        return Halide::Internal::is_parallel(for_type);
    }

    static const IRNodeType _node_type = IRNodeType::For;
};

struct Acquire : public StmtNode<Acquire> {
    Expr semaphore;
    Expr count;
    Stmt body;

    static Stmt make(Expr semaphore, Expr count, Stmt body);

    static const IRNodeType _node_type = IRNodeType::Acquire;
};

/** Construct a new vector by taking elements from another sequence of
 * vectors. */
struct Shuffle : public ExprNode<Shuffle> {
    std::vector<Expr> vectors;

    /** Indices indicating which vector element to place into the
     * result. The elements are numbered by their position in the
     * concatenation of the vector arguments. */
    std::vector<int> indices;

    static Expr make(const std::vector<Expr> &vectors,
                     const std::vector<int> &indices);

    /** Convenience constructor for making a shuffle representing an
     * interleaving of vectors of the same length. */
    static Expr make_interleave(const std::vector<Expr> &vectors);

    /** Convenience constructor for making a shuffle representing a
     * concatenation of the vectors. */
    static Expr make_concat(const std::vector<Expr> &vectors);

    /** Convenience constructor for making a shuffle representing a
     * broadcast of a vector. */
    static Expr make_broadcast(Expr vector, int factor);

    /** Convenience constructor for making a shuffle representing a
     * contiguous subset of a vector. */
    static Expr make_slice(Expr vector, int begin, int stride, int size);

    /** Convenience constructor for making a shuffle representing
     * extracting a single element. */
    static Expr make_extract_element(Expr vector, int i);

    /** Check if this shuffle is an interleaving of the vector
     * arguments. */
    bool is_interleave() const;

    /** Check if this shuffle can be represented as a repeating pattern that
     * repeats the same shuffle of the single input vector some number of times.
     * For example: 0, 3, 1, 1,  0, 3, 1, 1, .....,  0, 3, 1, 1
     */
    bool is_broadcast() const;
    int broadcast_factor() const;

    /** Check if this shuffle is a concatenation of the vector
     * arguments. */
    bool is_concat() const;

    /** Check if this shuffle is a contiguous strict subset of the
     * vector arguments, and if so, the offset and stride of the
     * slice. */
    ///@{
    bool is_slice() const;
    int slice_begin() const {
        return indices[0];
    }
    int slice_stride() const {
        return indices.size() >= 2 ? indices[1] - indices[0] : 1;
    }
    ///@}

    /** Check if this shuffle is extracting a scalar from the vector
     * arguments. */
    bool is_extract_element() const;

    /** Returns the sequence of vector and lane indices that represent each
     * entry to be used for the shuffled vector */
    std::vector<std::pair<int, int>> vector_and_lane_indices() const;

    static const IRNodeType _node_type = IRNodeType::Shuffle;
};

/** Represent a multi-dimensional region of a Func or an ImageParam that
 * needs to be prefetched. */
struct Prefetch : public StmtNode<Prefetch> {
    std::string name;
    std::vector<Type> types;
    Region bounds;
    PrefetchDirective prefetch;
    Expr condition;

    Stmt body;

    static Stmt make(const std::string &name, const std::vector<Type> &types,
                     const Region &bounds,
                     const PrefetchDirective &prefetch,
                     Expr condition, Stmt body);

    static const IRNodeType _node_type = IRNodeType::Prefetch;
};

/**
 * Represents a location where storage will be hoisted to for a Func / Realize
 * node with a given name.
 *
 */
struct HoistedStorage : public StmtNode<HoistedStorage> {
    std::string name;
    Stmt body;

    static Stmt make(const std::string &name,
                     Stmt body);

    static const IRNodeType _node_type = IRNodeType::HoistedStorage;
};

/** Lock all the Store nodes in the body statement.
 *  Typically the lock is implemented by an atomic operation
 *  (e.g. atomic add or atomic compare-and-swap).
 *  However, if necessary, the node can access a mutex buffer through
 *  mutex_name and mutex_args, by lowering this node into
 *  calls to acquire and release the lock. */
struct Atomic : public StmtNode<Atomic> {
    std::string producer_name;
    std::string mutex_name;  // empty string if not using mutex
    Stmt body;

    static Stmt make(const std::string &producer_name,
                     const std::string &mutex_name,
                     Stmt body);

    static const IRNodeType _node_type = IRNodeType::Atomic;
};

/** Horizontally reduce a vector to a scalar or narrower vector using
 * the given commutative and associative binary operator. The reduction
 * factor is dictated by the number of lanes in the input and output
 * types. Groups of adjacent lanes are combined. The number of lanes
 * in the input type must be a divisor of the number of lanes of the
 * output type.  */
struct VectorReduce : public ExprNode<VectorReduce> {
    // 99.9% of the time people will use this for horizontal addition,
    // but these are all of our commutative and associative primitive
    // operators.
    typedef enum {
        Add,
        SaturatingAdd,
        Mul,
        Min,
        Max,
        And,
        Or,
    } Operator;

    Expr value;
    Operator op;

    static Expr make(Operator op, Expr vec, int lanes);

    static const IRNodeType _node_type = IRNodeType::VectorReduce;
};

}  // namespace Internal
}  // namespace Halide

#endif

/** \file
 *
 * Classes for declaring scalar parameters to halide pipelines
 */

namespace Halide {

/** A scalar parameter to a halide pipeline. If you're jitting, this
 * should be bound to an actual value of type T using the set method
 * before you realize the function uses this. If you're statically
 * compiling, this param should appear in the argument list. */
template<typename T = void>
class Param {
    /** A reference-counted handle on the internal parameter object */
    Parameter param;

    // This is a deliberately non-existent type that allows us to compile Param<>
    // but provide less-confusing error messages if you attempt to call get<> or set<>
    // without explicit types.
    struct DynamicParamType;

    /** T unless T is (const) void, in which case pointer-to-useless-type.` */
    using not_void_T = std::conditional_t<std::is_void_v<T>, DynamicParamType *, T>;

    void check_name() const {
        user_assert(param.name() != "__user_context")
            << "Param<void*>(\"__user_context\") "
            << "is no longer used to control whether Halide functions take explicit "
            << "user_context arguments. Use set_custom_user_context() when jitting, "
            << "or add Target::UserContext to the Target feature set when compiling ahead of time.";
    }

    // Allow all Param<> variants friend access to each other
    template<typename OTHER_TYPE>
    friend class Param;

public:
    /** True if the Halide type is not void (or const void). */
    static constexpr bool has_static_type = !std::is_void_v<T>;

    /** Get the Halide type of T. Callers should not use the result if
     * has_static_halide_type is false. */
    static Type static_type() {
        internal_assert(has_static_type);
        return type_of<T>();
    }

    /** Construct a scalar parameter of type T with a unique
     * auto-generated name */
    // @{
    Param()
        : param(type_of<T>(), false, 0, Internal::unique_name('p')) {
        static_assert(has_static_type, "Cannot use this ctor without an explicit type.");
    }
    explicit Param(Type t)
        : param(t, false, 0, Internal::unique_name('p')) {
        static_assert(!has_static_type, "Cannot use this ctor with an explicit type.");
    }
    // @}

    /** Construct a scalar parameter of type T with the given name. */
    // @{
    explicit Param(const std::string &n)
        : param(type_of<T>(), false, 0, n) {
        static_assert(has_static_type, "Cannot use this ctor without an explicit type.");
        check_name();
    }
    explicit Param(const char *n)
        : param(type_of<T>(), false, 0, n) {
        static_assert(has_static_type, "Cannot use this ctor without an explicit type.");
        check_name();
    }
    Param(Type t, const std::string &n)
        : param(t, false, 0, n) {
        static_assert(!has_static_type, "Cannot use this ctor with an explicit type.");
        check_name();
    }
    // @}

    /** Construct a scalar parameter of type T an initial value of
     * 'val'. Only triggers for non-pointer types. */
    template<typename T2 = T, std::enable_if_t<!std::is_pointer_v<T2>> * = nullptr>
    explicit Param(not_void_T val)
        : param(type_of<T>(), false, 0, Internal::unique_name('p')) {
        static_assert(has_static_type, "Cannot use this ctor without an explicit type.");
        set<not_void_T>(val);
    }

    /** Construct a scalar parameter of type T with the given name
     * and an initial value of 'val'. */
    Param(const std::string &n, not_void_T val)
        : param(type_of<T>(), false, 0, n) {
        check_name();
        static_assert(has_static_type, "Cannot use this ctor without an explicit type.");
        set<not_void_T>(val);
    }

    /** Construct a scalar parameter of type T with an initial value of 'val'
     * and a given min and max. */
    Param(not_void_T val, const Expr &min, const Expr &max)
        : param(type_of<T>(), false, 0, Internal::unique_name('p')) {
        static_assert(has_static_type, "Cannot use this ctor without an explicit type.");
        set_range(min, max);
        set<not_void_T>(val);
    }

    /** Construct a scalar parameter of type T with the given name
     * and an initial value of 'val' and a given min and max. */
    Param(const std::string &n, not_void_T val, const Expr &min, const Expr &max)
        : param(type_of<T>(), false, 0, n) {
        static_assert(has_static_type, "Cannot use this ctor without an explicit type.");
        check_name();
        set_range(min, max);
        set<not_void_T>(val);
    }

    /** Construct a Param<void> from any other Param. */
    template<typename OTHER_TYPE, typename T2 = T, std::enable_if_t<std::is_void_v<T2>> * = nullptr>
    Param(const Param<OTHER_TYPE> &other)
        : param(other.param) {
        // empty
    }

    /** Construct a Param<non-void> from a Param with matching type.
     * (Do the check at runtime so that we can assign from Param<void> if the types are compatible.) */
    template<typename OTHER_TYPE, typename T2 = T, std::enable_if_t<!std::is_void_v<T2>> * = nullptr>
    Param(const Param<OTHER_TYPE> &other)
        : param(other.param) {
        user_assert(other.type() == type_of<T>())
            << "Param<" << type_of<T>() << "> cannot be constructed from a Param with type " << other.type();
    }

    /** Copy a Param<void> from any other Param. */
    template<typename OTHER_TYPE, typename T2 = T, std::enable_if_t<std::is_void_v<T2>> * = nullptr>
    Param<T> &operator=(const Param<OTHER_TYPE> &other) {
        param = other.param;
        return *this;
    }

    /** Copy a Param<non-void> from a Param with matching type.
     * (Do the check at runtime so that we can assign from Param<void> if the types are compatible.) */
    template<typename OTHER_TYPE, typename T2 = T, std::enable_if_t<!std::is_void_v<T2>> * = nullptr>
    Param<T> &operator=(const Param<OTHER_TYPE> &other) {
        user_assert(other.type() == type_of<T>())
            << "Param<" << type_of<T>() << "> cannot be copied from a Param with type " << other.type();
        param = other.param;
        return *this;
    }

    /** Get the name of this parameter */
    const std::string &name() const {
        return param.name();
    }

    /** Get the current value of this parameter. Only meaningful when jitting.
        Asserts if type does not exactly match the Parameter's type. */
    template<typename T2 = not_void_T>
    HALIDE_NO_USER_CODE_INLINE T2 get() const {
        return param.scalar<T2>();
    }

    /** Set the current value of this parameter. Only meaningful when jitting.
        Asserts if type is not losslessly-convertible to Parameter's type. */
    template<typename SOME_TYPE>
    HALIDE_NO_USER_CODE_INLINE void set(const SOME_TYPE &val) {
        if constexpr (!std::is_void_v<T>) {
            user_assert(Internal::IsRoundtrippable<T>::value(val))
                << "The value " << val << " cannot be losslessly converted to type " << type();
            param.set_scalar<T>(val);
        } else {

            // Specialized version for when T = void (thus the type is only known at runtime,
            // not compiletime). Note that this actually works fine for all Params; we specialize
            // it just to reduce code size for the common case of T != void.

#define HALIDE_HANDLE_TYPE_DISPATCH(CODE, BITS, TYPE)                                     \
    case halide_type_t(CODE, BITS).as_u32():                                              \
        user_assert(Internal::IsRoundtrippable<TYPE>::value(val))                         \
            << "The value " << val << " cannot be losslessly converted to type " << type; \
        param.set_scalar<TYPE>(Internal::StaticCast<TYPE>::value(val));                   \
        break;

            const Type type = param.type();
            switch (((halide_type_t)type).element_of().as_u32()) {
                HALIDE_HANDLE_TYPE_DISPATCH(halide_type_float, 32, float)
                HALIDE_HANDLE_TYPE_DISPATCH(halide_type_float, 64, double)
                HALIDE_HANDLE_TYPE_DISPATCH(halide_type_int, 8, int8_t)
                HALIDE_HANDLE_TYPE_DISPATCH(halide_type_int, 16, int16_t)
                HALIDE_HANDLE_TYPE_DISPATCH(halide_type_int, 32, int32_t)
                HALIDE_HANDLE_TYPE_DISPATCH(halide_type_int, 64, int64_t)
                HALIDE_HANDLE_TYPE_DISPATCH(halide_type_uint, 1, bool)
                HALIDE_HANDLE_TYPE_DISPATCH(halide_type_uint, 8, uint8_t)
                HALIDE_HANDLE_TYPE_DISPATCH(halide_type_uint, 16, uint16_t)
                HALIDE_HANDLE_TYPE_DISPATCH(halide_type_uint, 32, uint32_t)
                HALIDE_HANDLE_TYPE_DISPATCH(halide_type_uint, 64, uint64_t)
                HALIDE_HANDLE_TYPE_DISPATCH(halide_type_handle, 64, uint64_t)  // Handle types are always set via set_scalar<uint64_t>, not set_scalar<void*>
            default:
                internal_error << "Unsupported type in Param::set<" << type << ">\n";
            }

#undef HALIDE_HANDLE_TYPE_DISPATCH
        }
    }

    /** Get the halide type of the Param */
    Type type() const {
        return param.type();
    }

    /** Get or set the possible range of this parameter. Use undefined
     * Exprs to mean unbounded. */
    // @{
    void set_range(const Expr &min, const Expr &max) {
        set_min_value(min);
        set_max_value(max);
    }

    void set_min_value(Expr min) {
        if (min.defined() && min.type() != param.type()) {
            min = Internal::Cast::make(param.type(), min);
        }
        param.set_min_value(min);
    }

    void set_max_value(Expr max) {
        if (max.defined() && max.type() != param.type()) {
            max = Internal::Cast::make(param.type(), max);
        }
        param.set_max_value(max);
    }

    Expr min_value() const {
        return param.min_value();
    }

    Expr max_value() const {
        return param.max_value();
    }
    // @}

    template<typename SOME_TYPE>
    HALIDE_NO_USER_CODE_INLINE void set_estimate(const SOME_TYPE &val) {
        if constexpr (!std::is_void_v<T>) {
            user_assert(Internal::IsRoundtrippable<T>::value(val))
                << "The value " << val << " cannot be losslessly converted to type " << type();
            param.set_estimate(Expr(val));
        } else {

            // Specialized version for when T = void (thus the type is only known at runtime,
            // not compiletime). Note that this actually works fine for all Params; we specialize
            // it just to reduce code size for the common case of T != void.

#define HALIDE_HANDLE_TYPE_DISPATCH(CODE, BITS, TYPE)                                     \
    case halide_type_t(CODE, BITS).as_u32():                                              \
        user_assert(Internal::IsRoundtrippable<TYPE>::value(val))                         \
            << "The value " << val << " cannot be losslessly converted to type " << type; \
        param.set_estimate(Expr(Internal::StaticCast<TYPE>::value(val)));                 \
        break;

            const Type type = param.type();
            switch (((halide_type_t)type).element_of().as_u32()) {
                HALIDE_HANDLE_TYPE_DISPATCH(halide_type_float, 32, float)
                HALIDE_HANDLE_TYPE_DISPATCH(halide_type_float, 64, double)
                HALIDE_HANDLE_TYPE_DISPATCH(halide_type_int, 8, int8_t)
                HALIDE_HANDLE_TYPE_DISPATCH(halide_type_int, 16, int16_t)
                HALIDE_HANDLE_TYPE_DISPATCH(halide_type_int, 32, int32_t)
                HALIDE_HANDLE_TYPE_DISPATCH(halide_type_int, 64, int64_t)
                HALIDE_HANDLE_TYPE_DISPATCH(halide_type_uint, 1, bool)
                HALIDE_HANDLE_TYPE_DISPATCH(halide_type_uint, 8, uint8_t)
                HALIDE_HANDLE_TYPE_DISPATCH(halide_type_uint, 16, uint16_t)
                HALIDE_HANDLE_TYPE_DISPATCH(halide_type_uint, 32, uint32_t)
                HALIDE_HANDLE_TYPE_DISPATCH(halide_type_uint, 64, uint64_t)
                HALIDE_HANDLE_TYPE_DISPATCH(halide_type_handle, 64, uint64_t)  // Handle types are always set via set_scalar<uint64_t>, not set_scalar<void*>
            default:
                internal_error << "Unsupported type in Param::set<" << type << ">\n";
            }

#undef HALIDE_HANDLE_TYPE_DISPATCH
        }
    }

    /** You can use this parameter as an expression in a halide
     * function definition */
    operator Expr() const {
        return Internal::Variable::make(param.type(), name(), param);
    }

    /** Using a param as the argument to an external stage treats it
     * as an Expr */
    operator ExternFuncArgument() const {
        return Expr(*this);
    }

    /** Construct the appropriate argument matching this parameter,
     * for the purpose of generating the right type signature when
     * statically compiling halide pipelines. */
    operator Argument() const {
        return Argument(name(), Argument::InputScalar, type(), 0,
                        param.get_argument_estimates());
    }

    const Parameter &parameter() const {
        return param;
    }

    Parameter &parameter() {
        return param;
    }
};

/** Returns an Expr corresponding to the user context passed to
 * the function (if any). It is rare that this function is necessary
 * (e.g. to pass the user context to an extern function written in C). */
inline Expr user_context_value() {
    return Internal::Variable::make(Handle(), "__user_context",
                                    Parameter(Handle(), false, 0, "__user_context"));
}

}  // namespace Halide

#endif
#ifndef HALIDE_PIPELINE_H
#define HALIDE_PIPELINE_H

/** \file
 *
 * Defines the front-end class representing an entire Halide imaging
 * pipeline.
 */

#include <initializer_list>
#include <map>
#include <memory>
#include <vector>

#ifndef HALIDE_IR_OPERATOR_H
#define HALIDE_IR_OPERATOR_H

/** \file
 *
 * Defines various operator overloads and utility functions that make
 * it more pleasant to work with Halide expressions.
 */

#include <cmath>
#include <map>
#include <optional>

#ifndef HALIDE_CONSTANT_INTERVAL_H
#define HALIDE_CONSTANT_INTERVAL_H

#include <stdint.h>

/** \file
 * Defines the ConstantInterval class, and operators on it.
 */

namespace Halide {

struct Type;

namespace Internal {

/** A class to represent ranges of integers. Can be unbounded above or below,
 * but they cannot be empty. */
struct ConstantInterval {
    /** The lower and upper bound of the interval. They are included
     * in the interval. */
    int64_t min = 0, max = 0;
    bool min_defined = false, max_defined = false;

    /* A default-constructed Interval is everything */
    ConstantInterval() = default;

    /** Construct an interval from a lower and upper bound. */
    ConstantInterval(int64_t min, int64_t max);

    /** The interval representing everything. */
    static ConstantInterval everything();

    /** Construct an interval representing a single point. */
    static ConstantInterval single_point(int64_t x);

    /** Construct intervals bounded above or below. */
    static ConstantInterval bounded_below(int64_t min);
    static ConstantInterval bounded_above(int64_t max);

    /** Is the interval the entire range */
    bool is_everything() const;

    /** Is the interval just a single value (min == max) */
    bool is_single_point() const;

    /** Is the interval a particular single value */
    bool is_single_point(int64_t x) const;

    /** Does the interval have a finite upper and lower bound */
    bool is_bounded() const;

    /** Expand the interval to include another Interval */
    void include(const ConstantInterval &i);

    /** Expand the interval to include a point */
    void include(int64_t x);

    /** Test if the interval contains a particular value */
    bool contains(int32_t x) const;

    /** Test if the interval contains a particular value */
    bool contains(int64_t x) const;

    /** Test if the interval contains a particular unsigned value */
    bool contains(uint64_t x) const;

    /** Construct the smallest interval containing two intervals. */
    static ConstantInterval make_union(const ConstantInterval &a, const ConstantInterval &b);

    /** Construct the largest interval contained within two intervals. Throws an
     * error if the interval is empty. */
    static ConstantInterval make_intersection(const ConstantInterval &a, const ConstantInterval &b);

    /** Equivalent to same_as. Exists so that the autoscheduler can
     * compare two map<string, Interval> for equality in order to
     * cache computations. */
    bool operator==(const ConstantInterval &other) const;

    /** In-place versions of the arithmetic operators below. */
    // @{
    void operator+=(const ConstantInterval &other);
    void operator+=(int64_t);
    void operator-=(const ConstantInterval &other);
    void operator-=(int64_t);
    void operator*=(const ConstantInterval &other);
    void operator*=(int64_t);
    void operator/=(const ConstantInterval &other);
    void operator/=(int64_t);
    void operator%=(const ConstantInterval &other);
    void operator%=(int64_t);
    // @}

    /** Negate an interval. */
    ConstantInterval operator-() const;

    /** Track what happens if a constant integer interval is forced to fit into
     * a concrete integer type. */
    void cast_to(const Type &t);

    /** Get constant integer bounds on a type. */
    static ConstantInterval bounds_of_type(Type);
};

/** Arithmetic operators on ConstantIntervals. The resulting interval contains
 * all possible values of the operator applied to any two elements of the
 * argument intervals. Note that these operator on unbounded integers. If you
 * are applying this to concrete small integer types, you will need to manually
 * cast the constant interval back to the desired type to model the effect of
 * overflow. */
// @{
ConstantInterval operator+(const ConstantInterval &a, const ConstantInterval &b);
ConstantInterval operator+(const ConstantInterval &a, int64_t b);
ConstantInterval operator-(const ConstantInterval &a, const ConstantInterval &b);
ConstantInterval operator-(const ConstantInterval &a, int64_t b);
ConstantInterval operator/(const ConstantInterval &a, const ConstantInterval &b);
ConstantInterval operator/(const ConstantInterval &a, int64_t b);
ConstantInterval operator*(const ConstantInterval &a, const ConstantInterval &b);
ConstantInterval operator*(const ConstantInterval &a, int64_t b);
ConstantInterval operator%(const ConstantInterval &a, const ConstantInterval &b);
ConstantInterval operator%(const ConstantInterval &a, int64_t b);
ConstantInterval min(const ConstantInterval &a, const ConstantInterval &b);
ConstantInterval min(const ConstantInterval &a, int64_t b);
ConstantInterval max(const ConstantInterval &a, const ConstantInterval &b);
ConstantInterval max(const ConstantInterval &a, int64_t b);
ConstantInterval abs(const ConstantInterval &a);
ConstantInterval operator<<(const ConstantInterval &a, const ConstantInterval &b);
ConstantInterval operator<<(const ConstantInterval &a, int64_t b);
ConstantInterval operator<<(int64_t a, const ConstantInterval &b);
ConstantInterval operator>>(const ConstantInterval &a, const ConstantInterval &b);
ConstantInterval operator>>(const ConstantInterval &a, int64_t b);
ConstantInterval operator>>(int64_t a, const ConstantInterval &b);
// @}

/** Comparison operators on ConstantIntervals. Returns whether the comparison is
 * true for all values of the two intervals. */
// @{
bool operator<=(const ConstantInterval &a, const ConstantInterval &b);
bool operator<=(const ConstantInterval &a, int64_t b);
bool operator<=(int64_t a, const ConstantInterval &b);
bool operator<(const ConstantInterval &a, const ConstantInterval &b);
bool operator<(const ConstantInterval &a, int64_t b);
bool operator<(int64_t a, const ConstantInterval &b);

inline bool operator>=(const ConstantInterval &a, const ConstantInterval &b) {
    return b <= a;
}
inline bool operator>(const ConstantInterval &a, const ConstantInterval &b) {
    return b < a;
}
inline bool operator>=(const ConstantInterval &a, int64_t b) {
    return b <= a;
}
inline bool operator>(const ConstantInterval &a, int64_t b) {
    return b < a;
}
inline bool operator>=(int64_t a, const ConstantInterval &b) {
    return b <= a;
}
inline bool operator>(int64_t a, const ConstantInterval &b) {
    return b < a;
}

// @}
}  // namespace Internal

/** Cast operators for ConstantIntervals. These ones have to live out in
 * Halide::, to avoid C++ name lookup confusion with the Halide::cast variants
 * that take Exprs. */
// @{
Internal::ConstantInterval cast(Type t, const Internal::ConstantInterval &a);
Internal::ConstantInterval saturating_cast(Type t, const Internal::ConstantInterval &a);
// @}

}  // namespace Halide

#endif
#ifndef HALIDE_SCOPE_H
#define HALIDE_SCOPE_H

#include <iostream>
#include <map>
#include <stack>
#include <string>
#include <utility>
#include <vector>


/** \file
 * Defines the Scope class, which is used for keeping track of names in a scope while traversing IR
 */

namespace Halide {
namespace Internal {

/** A stack which can store one item very efficiently. Using this
 * instead of std::stack speeds up Scope substantially. */
template<typename T>
class SmallStack {
private:
    T _top;
    std::vector<T> _rest;
    bool _empty = true;

public:
    SmallStack() = default;

    void pop() {
        if (_rest.empty()) {
            _empty = true;
            _top = T();
        } else {
            _top = std::move(_rest.back());
            _rest.pop_back();
        }
    }

    void push(T t) {
        if (!_empty) {
            _rest.push_back(std::move(_top));
        }
        _top = std::move(t);
        _empty = false;
    }

    T top() const {
        return _top;
    }

    T &top_ref() {
        return _top;
    }

    const T &top_ref() const {
        return _top;
    }

    bool empty() const {
        return _empty;
    }

    size_t size() const {
        return _empty ? 0 : (_rest.size() + 1);
    }
};

template<>
class SmallStack<void> {
    // A stack of voids. Voids are all the same, so just record how many voids are in the stack
    int counter = 0;

public:
    void pop() {
        counter--;
    }
    void push() {
        counter++;
    }
    bool empty() const {
        return counter == 0;
    }
};

/** A common pattern when traversing Halide IR is that you need to
 * keep track of stuff when you find a Let or a LetStmt, and that it
 * should hide previous values with the same name until you leave the
 * Let or LetStmt nodes This class helps with that. */
template<typename T = void>
class Scope {
private:
    std::map<std::string, SmallStack<T>> table;

    const Scope<T> *containing_scope = nullptr;

public:
    Scope() = default;
    Scope(Scope &&that) noexcept = default;
    Scope &operator=(Scope &&that) noexcept = default;

    // Copying a scope object copies a large table full of strings and
    // stacks. Bad idea.
    Scope(const Scope<T> &) = delete;
    Scope<T> &operator=(const Scope<T> &) = delete;

    /** Set the parent scope. If lookups fail in this scope, they
     * check the containing scope before returning an error. Caller is
     * responsible for managing the memory of the containing scope. */
    void set_containing_scope(const Scope<T> *s) {
        containing_scope = s;
    }

    /** A const ref to an empty scope. Useful for default function
     * arguments, which would otherwise require a copy constructor
     * (with llvm in c++98 mode) */
    static const Scope<T> &empty_scope() {
        static Scope<T> _empty_scope;
        return _empty_scope;
    }

    /** Retrieve the value referred to by a name */
    template<typename T2 = T,
             typename = std::enable_if_t<!std::is_void_v<T2>>>
    T2 get(const std::string &name) const {
        typename std::map<std::string, SmallStack<T>>::const_iterator iter = table.find(name);
        if (iter == table.end() || iter->second.empty()) {
            if (containing_scope) {
                return containing_scope->get(name);
            } else {
                internal_error << "Name not in Scope: " << name << "\n"
                               << *this << "\n";
            }
        }
        return iter->second.top();
    }

    /** Return a reference to an entry. Does not consider the containing scope. */
    template<typename T2 = T,
             typename = std::enable_if_t<!std::is_void_v<T2>>>
    T2 &ref(const std::string &name) {
        typename std::map<std::string, SmallStack<T>>::iterator iter = table.find(name);
        if (iter == table.end() || iter->second.empty()) {
            internal_error << "Name not in Scope: " << name << "\n"
                           << *this << "\n";
        }
        return iter->second.top_ref();
    }

    /** Returns a const pointer to an entry if it exists in this scope or any
     * containing scope, or nullptr if it does not. Use this instead of if
     * (scope.contains(foo)) { ... scope.get(foo) ... } to avoid doing two
     * lookups. */
    template<typename T2 = T,
             typename = std::enable_if_t<!std::is_void_v<T2>>>
    const T2 *find(const std::string &name) const {
        typename std::map<std::string, SmallStack<T>>::const_iterator iter = table.find(name);
        if (iter == table.end() || iter->second.empty()) {
            if (containing_scope) {
                return containing_scope->find(name);
            } else {
                return nullptr;
            }
        }
        return &(iter->second.top_ref());
    }

    /** A version of find that returns a non-const pointer, but ignores
     * containing scope. */
    template<typename T2 = T,
             typename = std::enable_if_t<!std::is_void_v<T2>>>
    T2 *shallow_find(const std::string &name) {
        typename std::map<std::string, SmallStack<T>>::iterator iter = table.find(name);
        if (iter == table.end() || iter->second.empty()) {
            return nullptr;
        } else {
            return &(iter->second.top_ref());
        }
    }

    /** Tests if a name is in scope. If you plan to use the value if it is, call
     * find instead. */
    bool contains(const std::string &name) const {
        typename std::map<std::string, SmallStack<T>>::const_iterator iter = table.find(name);
        if (iter == table.end() || iter->second.empty()) {
            if (containing_scope) {
                return containing_scope->contains(name);
            } else {
                return false;
            }
        }
        return true;
    }

    /** How many nested definitions of a single name exist? */
    size_t count(const std::string &name) const {
        auto it = table.find(name);
        if (it == table.end()) {
            return 0;
        } else {
            return it->second.size();
        }
    }

    /** How many distinct names exist (does not count nested definitions of the same name) */
    size_t size() const {
        return table.size();
    }

    struct PushToken {
        typename std::map<std::string, SmallStack<T>>::iterator iter;
    };

    /** Add a new (name, value) pair to the current scope. Hide old values that
     * have this name until we pop this name. Returns a token that can be used
     * to pop the same value without doing a fresh lookup.
     */
    template<typename T2 = T,
             typename = std::enable_if_t<!std::is_void_v<T2>>>
    PushToken push(const std::string &name, T2 &&value) {
        auto it = table.try_emplace(name).first;
        it->second.push(std::forward<T2>(value));
        return PushToken{it};
    }

    template<typename T2 = T,
             typename = std::enable_if_t<std::is_void_v<T2>>>
    PushToken push(const std::string &name) {
        auto it = table.try_emplace(name).first;
        it->second.push();
        return PushToken{it};
    }

    /** A name goes out of scope. Restore whatever its old value
     * was (or remove it entirely if there was nothing else of the
     * same name in an outer scope) */
    void pop(const std::string &name) {
        typename std::map<std::string, SmallStack<T>>::iterator iter = table.find(name);
        internal_assert(iter != table.end()) << "Name not in Scope: " << name << "\n"
                                             << *this << "\n";
        iter->second.pop();
        if (iter->second.empty()) {
            table.erase(iter);
        }
    }

    /** Pop a name using a token returned by push instead of a string. */
    void pop(PushToken p) {
        p.iter->second.pop();
        if (p.iter->second.empty()) {
            table.erase(p.iter);
        }
    }

    /** Iterate through the scope. Does not capture any containing scope. */
    class const_iterator {
        typename std::map<std::string, SmallStack<T>>::const_iterator iter;

    public:
        explicit const_iterator(const typename std::map<std::string, SmallStack<T>>::const_iterator &i)
            : iter(i) {
        }

        const_iterator() = default;

        bool operator!=(const const_iterator &other) {
            return iter != other.iter;
        }

        void operator++() {
            ++iter;
        }

        const std::string &name() {
            return iter->first;
        }

        const SmallStack<T> &stack() {
            return iter->second;
        }

        template<typename T2 = T,
                 typename = std::enable_if_t<!std::is_void_v<T2>>>
        const T2 &value() {
            return iter->second.top_ref();
        }
    };

    const_iterator cbegin() const {
        return const_iterator(table.begin());
    }

    const_iterator cend() const {
        return const_iterator(table.end());
    }

    void swap(Scope<T> &other) noexcept {
        table.swap(other.table);
        std::swap(containing_scope, other.containing_scope);
    }
};

template<typename T>
std::ostream &operator<<(std::ostream &stream, const Scope<T> &s) {
    stream << "{\n";
    typename Scope<T>::const_iterator iter;
    for (iter = s.cbegin(); iter != s.cend(); ++iter) {
        stream << "  " << iter.name() << "\n";
    }
    stream << "}";
    return stream;
}

/** Helper class for pushing/popping Scope<> values, to allow
 * for early-exit in Visitor/Mutators that preserves correctness.
 * Note that this name can be a bit confusing, since there are two "scopes"
 * involved here:
 * - the Scope object itself
 * - the lifetime of this helper object
 * The "Scoped" in this class name refers to the latter, as it temporarily binds
 * a name within the scope of this helper's lifetime. */
template<typename T = void>
struct ScopedBinding {
    Scope<T> *scope = nullptr;
    typename Scope<T>::PushToken token;

    ScopedBinding() = default;

    ScopedBinding(Scope<T> &s, const std::string &n, T value)
        : scope(&s), token(scope->push(n, std::move(value))) {
    }

    ScopedBinding(bool condition, Scope<T> &s, const std::string &n, const T &value)
        : scope(condition ? &s : nullptr),
          token(condition ? scope->push(n, value) : typename Scope<T>::PushToken{}) {
    }

    bool bound() const {
        return scope != nullptr;
    }

    ~ScopedBinding() {
        if (scope) {
            scope->pop(token);
        }
    }

    // allow move but not copy
    ScopedBinding(const ScopedBinding &that) = delete;
    ScopedBinding(ScopedBinding &&that) noexcept
        : scope(that.scope),
          token(that.token) {
        // The move constructor must null out scope, so we don't try to pop it
        that.scope = nullptr;
    }

    void operator=(const ScopedBinding &that) = delete;
    void operator=(ScopedBinding &&that) = delete;
};

template<>
struct ScopedBinding<void> {
    Scope<> *scope;
    Scope<>::PushToken token;
    ScopedBinding(Scope<> &s, const std::string &n)
        : scope(&s), token(scope->push(n)) {
    }
    ScopedBinding(bool condition, Scope<> &s, const std::string &n)
        : scope(condition ? &s : nullptr),
          token(condition ? scope->push(n) : Scope<>::PushToken{}) {
    }
    ~ScopedBinding() {
        if (scope) {
            scope->pop(token);
        }
    }

    // allow move but not copy
    ScopedBinding(const ScopedBinding &that) = delete;
    ScopedBinding(ScopedBinding &&that) noexcept
        : scope(that.scope),
          token(that.token) {
        // The move constructor must null out scope, so we don't try to pop it
        that.scope = nullptr;
    }

    void operator=(const ScopedBinding &that) = delete;
    void operator=(ScopedBinding &&that) = delete;
};

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_TUPLE_H
#define HALIDE_TUPLE_H

/** \file
 *
 * Defines Tuple - the front-end handle on small arrays of expressions.
 */
#include <vector>


namespace Halide {

class FuncRef;

/** Create a small array of Exprs for defining and calling functions
 * with multiple outputs. */
class Tuple {
private:
    std::vector<Expr> exprs;

public:
    /** The number of elements in the tuple. */
    size_t size() const {
        return exprs.size();
    }

    /** Get a reference to an element. */
    Expr &operator[](size_t x) {
        user_assert(x < exprs.size()) << "Tuple access out of bounds\n";
        return exprs[x];
    }

    /** Get a copy of an element. */
    Expr operator[](size_t x) const {
        user_assert(x < exprs.size()) << "Tuple access out of bounds\n";
        return exprs[x];
    }

    /** Construct a Tuple of a single Expr */
    explicit Tuple(Expr e) {
        exprs.emplace_back(std::move(e));
    }

    /** Construct a Tuple from some Exprs. */
    //@{
    template<typename... Args>
    Tuple(const Expr &a, const Expr &b, Args &&...args) {
        exprs = std::vector<Expr>{a, b, std::forward<Args>(args)...};
    }
    //@}

    /** Construct a Tuple from a vector of Exprs */
    explicit HALIDE_NO_USER_CODE_INLINE Tuple(const std::vector<Expr> &e)
        : exprs(e) {
        user_assert(!e.empty()) << "Tuples must have at least one element\n";
    }

    /** Construct a Tuple from a function reference. */
    Tuple(const FuncRef &);

    /** Treat the tuple as a vector of Exprs */
    const std::vector<Expr> &as_vector() const {
        return exprs;
    }
};

}  // namespace Halide

#endif

namespace Halide {

namespace Internal {
/** Is the expression either an IntImm, a FloatImm, a StringImm, or a
 * Cast of the same, or a Ramp or Broadcast of the same. Doesn't do
 * any constant folding. */
bool is_const(const Expr &e);

/** Is the expression an IntImm, FloatImm of a particular value, or a
 * Cast, or Broadcast of the same. */
bool is_const(const Expr &e, int64_t v);

/** If an expression is an IntImm or a Broadcast of an IntImm, return
 * a its value. Otherwise returns std::nullopt. */
std::optional<int64_t> as_const_int(const Expr &e);

/** If an expression is a UIntImm or a Broadcast of a UIntImm, return
 * its value. Otherwise returns std::nullopt. */
std::optional<uint64_t> as_const_uint(const Expr &e);

/** If an expression is a FloatImm or a Broadcast of a FloatImm,
 * return its value. Otherwise returns std::nullopt. */
std::optional<double> as_const_float(const Expr &e);

/** Is the expression a constant integer power of two. Returns log base two of
 * the expression if it is, or std::nullopt if not. Also returns std::nullopt
 * for non-integer types. */
// @{
std::optional<int> is_const_power_of_two_integer(const Expr &e);
std::optional<int> is_const_power_of_two_integer(uint64_t);
std::optional<int> is_const_power_of_two_integer(int64_t);
// @}

/** Is the expression a const (as defined by is_const), and also
 * strictly greater than zero (in all lanes, if a vector expression) */
bool is_positive_const(const Expr &e);

/** Is the expression a const (as defined by is_const), and also
 * strictly less than zero (in all lanes, if a vector expression) */
bool is_negative_const(const Expr &e);

/** Is the expression an undef */
bool is_undef(const Expr &e);

/** Is the expression a const (as defined by is_const), and also equal
 * to zero (in all lanes, if a vector expression) */
bool is_const_zero(const Expr &e);

/** Is the expression a const (as defined by is_const), and also equal
 * to one (in all lanes, if a vector expression) */
bool is_const_one(const Expr &e);

/** Is the statement a no-op (which we represent as either an
 * undefined Stmt, or as an Evaluate node of a constant) */
bool is_no_op(const Stmt &s);

/** Does the expression
 * 1) Take on the same value no matter where it appears in a Stmt, and
 * 2) Evaluating it has no side-effects
 */
bool is_pure(const Expr &e);

/** Construct an immediate of the given type from any numeric C++ type. */
// @{
Expr make_const(Type t, int64_t val);
Expr make_const(Type t, uint64_t val);
Expr make_const(Type t, double val);
inline Expr make_const(Type t, int32_t val) {
    return make_const(t, (int64_t)val);
}
inline Expr make_const(Type t, uint32_t val) {
    return make_const(t, (uint64_t)val);
}
inline Expr make_const(Type t, int16_t val) {
    return make_const(t, (int64_t)val);
}
inline Expr make_const(Type t, uint16_t val) {
    return make_const(t, (uint64_t)val);
}
inline Expr make_const(Type t, int8_t val) {
    return make_const(t, (int64_t)val);
}
inline Expr make_const(Type t, uint8_t val) {
    return make_const(t, (uint64_t)val);
}
inline Expr make_const(Type t, bool val) {
    return make_const(t, (uint64_t)val);
}
inline Expr make_const(Type t, float val) {
    return make_const(t, (double)val);
}
inline Expr make_const(Type t, float16_t val) {
    return make_const(t, (double)val);
}
// @}

/** Construct a unique signed_integer_overflow Expr */
Expr make_signed_integer_overflow(Type type);

/** Check if an expression is a signed_integer_overflow */
bool is_signed_integer_overflow(const Expr &expr);

/** Check if a constant value can be correctly represented as the given type. */
void check_representable(Type t, int64_t val);

/** Construct a boolean constant from a C++ boolean value.
 * May also be a vector if width is given.
 * It is not possible to coerce a C++ boolean to Expr because
 * if we provide such a path then char objects can ambiguously
 * be converted to Halide Expr or to std::string.  The problem
 * is that C++ does not have a real bool type - it is in fact
 * close enough to char that C++ does not know how to distinguish them.
 * make_bool is the explicit coercion. */
Expr make_bool(bool val, int lanes = 1);

/** Construct the representation of zero in the given type */
Expr make_zero(Type t);

/** Construct the representation of one in the given type */
Expr make_one(Type t);

/** Construct the representation of two in the given type */
Expr make_two(Type t);

/** Construct the constant boolean true. May also be a vector of
 * trues, if a lanes argument is given. */
Expr const_true(int lanes = 1);

/** Construct the constant boolean false. May also be a vector of
 * falses, if a lanes argument is given. */
Expr const_false(int lanes = 1);

/** Attempt to cast an expression to a smaller type while provably not losing
 * information. If it can't be done, return an undefined Expr.
 *
 * Optionally accepts a scope giving the constant bounds of any variables, and a
 * map that gives the constant bounds of exprs already analyzed to avoid redoing
 * work across many calls to lossless_cast. It is not safe to use this optional
 * map in contexts where the same Expr object may take on a different value. For
 * example: (let x = 4 in some_expr_object) + (let x = 5 in
 * the_same_expr_object)).  It is safe to use it after uniquify_variable_names
 * has been run. */
Expr lossless_cast(Type t, Expr e,
                   const Scope<ConstantInterval> &scope = Scope<ConstantInterval>::empty_scope(),
                   std::map<Expr, ConstantInterval, ExprCompare> *cache = nullptr);

/** Attempt to negate x without introducing new IR and without overflow.
 * If it can't be done, return an undefined Expr. */
Expr lossless_negate(const Expr &x);

/** Coerce the two expressions to have the same type, using C-style
 * casting rules. For the purposes of casting, a boolean type is
 * UInt(1). We use the following procedure:
 *
 * If the types already match, do nothing.
 *
 * Then, if one type is a vector and the other is a scalar, the scalar
 * is broadcast to match the vector width, and we continue.
 *
 * Then, if one type is floating-point and the other is not, the
 * non-float is cast to the floating-point type, and we're done.
 *
 * Then, if both types are unsigned ints, the one with fewer bits is
 * cast to match the one with more bits and we're done.
 *
 * Then, if both types are signed ints, the one with fewer bits is
 * cast to match the one with more bits and we're done.
 *
 * Finally, if one type is an unsigned int and the other type is a signed
 * int, both are cast to a signed int with the greater of the two
 * bit-widths. For example, matching an Int(8) with a UInt(16) results
 * in an Int(16).
 *
 */
void match_types(Expr &a, Expr &b);

/** Asserts that both expressions are integer types and are either
 * both signed or both unsigned. If one argument is scalar and the
 * other a vector, the scalar is broadcasted to have the same number
 * of lanes as the vector. If one expression is of narrower type than
 * the other, it is widened to the bit width of the wider. */
void match_types_bitwise(Expr &a, Expr &b, const char *op_name);

/** Halide's vectorizable transcendentals. */
// @{
Expr halide_log(const Expr &a);
Expr halide_exp(const Expr &a);
Expr halide_erf(const Expr &a);
// @}

/** Raise an expression to an integer power by repeatedly multiplying
 * it by itself. */
Expr raise_to_integer_power(Expr a, int64_t b);

/** Split a boolean condition into vector of ANDs. If 'cond' is undefined,
 * return an empty vector. */
void split_into_ands(const Expr &cond, std::vector<Expr> &result);

/** A builder to help create Exprs representing halide_buffer_t
 * structs (e.g. foo.buffer) via calls to halide_buffer_init. Fill out
 * the fields and then call build. The resulting Expr will be a call
 * to halide_buffer_init with the struct members as arguments. If the
 * buffer_memory field is undefined, it uses a call to alloca to make
 * some stack memory for the buffer. If the shape_memory field is
 * undefined, it similarly uses stack memory for the shape. If the
 * shape_memory field is null, it uses the dim field already in the
 * buffer. Other unitialized fields will take on a value of zero in
 * the constructed buffer. */
struct BufferBuilder {
    Expr buffer_memory, shape_memory;
    Expr host, device, device_interface;
    Type type;
    int dimensions = 0;
    std::vector<Expr> mins, extents, strides;
    Expr host_dirty, device_dirty;
    Expr build() const;
};

/** If e is a ramp expression with stride, default 1, return the base,
 * otherwise undefined. */
Expr strided_ramp_base(const Expr &e, int stride = 1);

/** Implementations of division and mod that are specific to Halide.
 * Use these implementations; do not use native C division or mod to
 * simplify Halide expressions. Halide division and modulo satisify
 * the Euclidean definition of division for integers a and b:
 *
 /code
 when b != 0, (a/b)*b + a%b = a
 0 <= a%b < |b|
 /endcode
 *
 * Additionally, mod by zero returns zero, and div by zero returns
 * zero. This makes mod and div total functions.
 */
// @{
template<typename T>
inline T mod_imp(T a, T b) {
    Type t = type_of<T>();
    if (!t.is_float() && b == 0) {
        return 0;
    } else if (t.is_int()) {
        int64_t ia = a;
        int64_t ib = b;
        int64_t a_neg = ia >> 63;
        int64_t b_neg = ib >> 63;
        int64_t b_zero = (ib == 0) ? -1 : 0;
        ia -= a_neg;
        int64_t r = ia % (ib | b_zero);
        r += (a_neg & ((ib ^ b_neg) + ~b_neg));
        r &= ~b_zero;
        return r;
    } else {
        return a % b;
    }
}

template<typename T>
inline T div_imp(T a, T b) {
    Type t = type_of<T>();
    if (!t.is_float() && b == 0) {
        return (T)0;
    } else if (t.is_int()) {
        // Do it as 64-bit
        int64_t ia = a;
        int64_t ib = b;
        int64_t a_neg = ia >> 63;
        int64_t b_neg = ib >> 63;
        int64_t b_zero = (ib == 0) ? -1 : 0;
        ib -= b_zero;
        ia -= a_neg;
        int64_t q = ia / ib;
        q += a_neg & (~b_neg - b_neg);
        q &= ~b_zero;
        return (T)q;
    } else {
        return a / b;
    }
}
// @}

// Special cases for float, double.
template<>
inline float mod_imp<float>(float a, float b) {
    float f = a - b * (floorf(a / b));
    // The remainder has the same sign as b.
    return f;
}
template<>
inline double mod_imp<double>(double a, double b) {
    double f = a - b * (std::floor(a / b));
    return f;
}

template<>
inline float div_imp<float>(float a, float b) {
    return a / b;
}
template<>
inline double div_imp<double>(double a, double b) {
    return a / b;
}

/** Return an Expr that is identical to the input Expr, but with
 * all calls to likely() and likely_if_innermost() removed. */
Expr remove_likelies(const Expr &e);

/** Return a Stmt that is identical to the input Stmt, but with
 * all calls to likely() and likely_if_innermost() removed. */
Stmt remove_likelies(const Stmt &s);

/** Return an Expr that is identical to the input Expr, but with
 * all calls to promise_clamped() and unsafe_promise_clamped() removed. */
Expr remove_promises(const Expr &e);

/** Return a Stmt that is identical to the input Stmt, but with
 * all calls to promise_clamped() and unsafe_promise_clamped() removed. */
Stmt remove_promises(const Stmt &s);

/** If the expression is a tag helper call, remove it and return
 * the tagged expression. If not, returns the expression. */
Expr unwrap_tags(const Expr &e);

template<typename T>
struct is_printable_arg {
    static constexpr bool value = std::is_convertible_v<T, const char *> ||
                                  std::is_convertible_v<T, Halide::Expr>;
};

template<typename... Args>
struct all_are_printable_args : meta_and<is_printable_arg<Args>...> {};

template<typename... Args>
inline constexpr bool all_are_printable_args_v = all_are_printable_args<Args...>::value;

// Secondary args to print can be Exprs or const char *
inline HALIDE_NO_USER_CODE_INLINE void collect_print_args(std::vector<Expr> &args) {
}

template<typename... Args>
inline HALIDE_NO_USER_CODE_INLINE void collect_print_args(std::vector<Expr> &args, const char *arg, Args &&...more_args) {
    args.emplace_back(std::string(arg));
    collect_print_args(args, std::forward<Args>(more_args)...);
}

template<typename... Args>
inline HALIDE_NO_USER_CODE_INLINE void collect_print_args(std::vector<Expr> &args, Expr arg, Args &&...more_args) {
    args.push_back(std::move(arg));
    collect_print_args(args, std::forward<Args>(more_args)...);
}

Expr requirement_failed_error(Expr condition, const std::vector<Expr> &args);

Expr memoize_tag_helper(Expr result, const std::vector<Expr> &cache_key_values);

/** Reset the counters used for random-number seeds in random_float/int/uint.
 * (Note that the counters are incremented for each call, even if a seed is passed in.)
 * This is used for multitarget compilation to ensure that each subtarget gets
 * the same sequence of random numbers. */
void reset_random_counters();

}  // namespace Internal

/** Cast an expression to the halide type corresponding to the C++ type T. */
template<typename T>
inline Expr cast(Expr a) {
    return cast(type_of<T>(), std::move(a));
}

/** Cast an expression to a new type. */
Expr cast(Type t, Expr a);

/** Return the sum of two expressions, doing any necessary type
 * coercion using \ref Internal::match_types */
Expr operator+(Expr a, Expr b);

/** Add an expression and a constant integer. Coerces the type of the
 * integer to match the type of the expression. Errors if the integer
 * cannot be represented in the type of the expression. */
// @{
Expr operator+(Expr a, int b);

/** Add a constant integer and an expression. Coerces the type of the
 * integer to match the type of the expression. Errors if the integer
 * cannot be represented in the type of the expression. */
Expr operator+(int a, Expr b);

/** Modify the first expression to be the sum of two expressions,
 * without changing its type. This casts the second argument to match
 * the type of the first. */
Expr &operator+=(Expr &a, Expr b);

/** Return the difference of two expressions, doing any necessary type
 * coercion using \ref Internal::match_types */
Expr operator-(Expr a, Expr b);

/** Subtracts a constant integer from an expression. Coerces the type of the
 * integer to match the type of the expression. Errors if the integer
 * cannot be represented in the type of the expression. */
Expr operator-(Expr a, int b);

/** Subtracts an expression from a constant integer. Coerces the type
 * of the integer to match the type of the expression. Errors if the
 * integer cannot be represented in the type of the expression. */
Expr operator-(int a, Expr b);

/** Return the negative of the argument. Does no type casting, so more
 * formally: return that number which when added to the original,
 * yields zero of the same type. For unsigned integers the negative is
 * still an unsigned integer. E.g. in UInt(8), the negative of 56 is
 * 200, because 56 + 200 == 0 */
Expr operator-(Expr a);

/** Modify the first expression to be the difference of two expressions,
 * without changing its type. This casts the second argument to match
 * the type of the first. */
Expr &operator-=(Expr &a, Expr b);

/** Return the product of two expressions, doing any necessary type
 * coercion using \ref Internal::match_types */
Expr operator*(Expr a, Expr b);

/** Multiply an expression and a constant integer. Coerces the type of the
 * integer to match the type of the expression. Errors if the integer
 * cannot be represented in the type of the expression. */
Expr operator*(Expr a, int b);

/** Multiply a constant integer and an expression. Coerces the type of
 * the integer to match the type of the expression. Errors if the
 * integer cannot be represented in the type of the expression. */
Expr operator*(int a, Expr b);

/** Modify the first expression to be the product of two expressions,
 * without changing its type. This casts the second argument to match
 * the type of the first. */
Expr &operator*=(Expr &a, Expr b);

/** Return the ratio of two expressions, doing any necessary type
 * coercion using \ref Internal::match_types. Note that integer
 * division in Halide is not the same as integer division in C-like
 * languages in two ways.
 *
 * First, signed integer division in Halide rounds according to the
 * sign of the denominator. This means towards minus infinity for
 * positive denominators, and towards positive infinity for negative
 * denominators. This is unlike C, which rounds towards zero. This
 * decision ensures that upsampling expressions like f(x/2, y/2) don't
 * have funny discontinuities when x and y cross zero.
 *
 * Second, division by zero returns zero instead of faulting. For
 * types where overflow is defined behavior, division of the largest
 * negative signed integer by -1 returns the larged negative signed
 * integer for the type (i.e. it wraps). This ensures that a division
 * operation can never have a side-effect, which is helpful in Halide
 * because scheduling directives can expand the domain of computation
 * of a Func, potentially introducing new zero-division.
 */
Expr operator/(Expr a, Expr b);

/** Modify the first expression to be the ratio of two expressions,
 * without changing its type. This casts the second argument to match
 * the type of the first. Note that signed integer division in Halide
 * rounds towards minus infinity, unlike C, which rounds towards
 * zero. */
Expr &operator/=(Expr &a, Expr b);

/** Divides an expression by a constant integer. Coerces the type
 * of the integer to match the type of the expression. Errors if the
 * integer cannot be represented in the type of the expression. */
Expr operator/(Expr a, int b);

/** Divides a constant integer by an expression. Coerces the type
 * of the integer to match the type of the expression. Errors if the
 * integer cannot be represented in the type of the expression. */
Expr operator/(int a, Expr b);

/** Return the first argument reduced modulo the second, doing any
 * necessary type coercion using \ref Internal::match_types. There are
 * two key differences between C-like languages and Halide for the
 * modulo operation, which complement the way division works.
 *
 * First, the result is never negative, so x % 2 is always zero or
 * one, unlike in C-like languages. x % -2 is equivalent, and is also
 * always zero or one. Second, mod by zero evaluates to zero (unlike
 * in C, where it faults). This makes modulo, like division, a
 * side-effect-free operation. */
Expr operator%(Expr a, Expr b);

/** Mods an expression by a constant integer. Coerces the type
 * of the integer to match the type of the expression. Errors if the
 * integer cannot be represented in the type of the expression. */
Expr operator%(Expr a, int b);

/** Mods a constant integer by an expression. Coerces the type
 * of the integer to match the type of the expression. Errors if the
 * integer cannot be represented in the type of the expression. */
Expr operator%(int a, Expr b);

/** Return a boolean expression that tests whether the first argument
 * is greater than the second, after doing any necessary type coercion
 * using \ref Internal::match_types */
Expr operator>(Expr a, Expr b);

/** Return a boolean expression that tests whether an expression is
 * greater than a constant integer. Coerces the integer to the type of
 * the expression. Errors if the integer is not representable in that
 * type. */
Expr operator>(Expr a, int b);

/** Return a boolean expression that tests whether a constant integer is
 * greater than an expression. Coerces the integer to the type of
 * the expression. Errors if the integer is not representable in that
 * type. */
Expr operator>(int a, Expr b);

/** Return a boolean expression that tests whether the first argument
 * is less than the second, after doing any necessary type coercion
 * using \ref Internal::match_types */
Expr operator<(Expr a, Expr b);

/** Return a boolean expression that tests whether an expression is
 * less than a constant integer. Coerces the integer to the type of
 * the expression. Errors if the integer is not representable in that
 * type. */
Expr operator<(Expr a, int b);

/** Return a boolean expression that tests whether a constant integer is
 * less than an expression. Coerces the integer to the type of
 * the expression. Errors if the integer is not representable in that
 * type. */
Expr operator<(int a, Expr b);

/** Return a boolean expression that tests whether the first argument
 * is less than or equal to the second, after doing any necessary type
 * coercion using \ref Internal::match_types */
Expr operator<=(Expr a, Expr b);

/** Return a boolean expression that tests whether an expression is
 * less than or equal to a constant integer. Coerces the integer to
 * the type of the expression. Errors if the integer is not
 * representable in that type. */
Expr operator<=(Expr a, int b);

/** Return a boolean expression that tests whether a constant integer
 * is less than or equal to an expression. Coerces the integer to the
 * type of the expression. Errors if the integer is not representable
 * in that type. */
Expr operator<=(int a, Expr b);

/** Return a boolean expression that tests whether the first argument
 * is greater than or equal to the second, after doing any necessary
 * type coercion using \ref Internal::match_types */
Expr operator>=(Expr a, Expr b);

/** Return a boolean expression that tests whether an expression is
 * greater than or equal to a constant integer. Coerces the integer to
 * the type of the expression. Errors if the integer is not
 * representable in that type. */
Expr operator>=(const Expr &a, int b);

/** Return a boolean expression that tests whether a constant integer
 * is greater than or equal to an expression. Coerces the integer to the
 * type of the expression. Errors if the integer is not representable
 * in that type. */
Expr operator>=(int a, const Expr &b);

/** Return a boolean expression that tests whether the first argument
 * is equal to the second, after doing any necessary type coercion
 * using \ref Internal::match_types */
Expr operator==(Expr a, Expr b);

/** Return a boolean expression that tests whether an expression is
 * equal to a constant integer. Coerces the integer to the type of the
 * expression. Errors if the integer is not representable in that
 * type. */
Expr operator==(Expr a, int b);

/** Return a boolean expression that tests whether a constant integer
 * is equal to an expression. Coerces the integer to the type of the
 * expression. Errors if the integer is not representable in that
 * type. */
Expr operator==(int a, Expr b);

/** Return a boolean expression that tests whether the first argument
 * is not equal to the second, after doing any necessary type coercion
 * using \ref Internal::match_types */
Expr operator!=(Expr a, Expr b);

/** Return a boolean expression that tests whether an expression is
 * not equal to a constant integer. Coerces the integer to the type of
 * the expression. Errors if the integer is not representable in that
 * type. */
Expr operator!=(Expr a, int b);

/** Return a boolean expression that tests whether a constant integer
 * is not equal to an expression. Coerces the integer to the type of
 * the expression. Errors if the integer is not representable in that
 * type. */
Expr operator!=(int a, Expr b);

/** Returns the logical and of the two arguments */
Expr operator&&(Expr a, Expr b);

/** Logical and of an Expr and a bool. Either returns the Expr or an
 * Expr representing false, depending on the bool. */
// @{
Expr operator&&(Expr a, bool b);
Expr operator&&(bool a, Expr b);
// @}

/** Returns the logical or of the two arguments */
Expr operator||(Expr a, Expr b);

/** Logical or of an Expr and a bool. Either returns the Expr or an
 * Expr representing true, depending on the bool. */
// @{
Expr operator||(Expr a, bool b);
Expr operator||(bool a, Expr b);
// @}

/** Returns the logical not the argument */
Expr operator!(Expr a);

/** Returns an expression representing the greater of the two
 * arguments, after doing any necessary type coercion using
 * \ref Internal::match_types. Vectorizes cleanly on most platforms
 * (with the exception of integer types on x86 without SSE4). */
Expr max(Expr a, Expr b);

/** Returns an expression representing the greater of an expression
 * and a constant integer.  The integer is coerced to the type of the
 * expression. Errors if the integer is not representable as that
 * type. Vectorizes cleanly on most platforms (with the exception of
 * integer types on x86 without SSE4). */
Expr max(Expr a, int b);

/** Returns an expression representing the greater of a constant
 * integer and an expression. The integer is coerced to the type of
 * the expression. Errors if the integer is not representable as that
 * type. Vectorizes cleanly on most platforms (with the exception of
 * integer types on x86 without SSE4). */
Expr max(int a, Expr b);

inline Expr max(float a, Expr b) {
    return max(Expr(a), std::move(b));
}
inline Expr max(Expr a, float b) {
    return max(std::move(a), Expr(b));
}

/** Returns an expression representing the greater of an expressions
 * vector, after doing any necessary type coersion using
 * \ref Internal::match_types. Vectorizes cleanly on most platforms
 * (with the exception of integer types on x86 without SSE4).
 * The expressions are folded from right ie. max(.., max(.., ..)).
 * The arguments can be any mix of types but must all be convertible to Expr. */
template<typename A, typename B, typename C, typename... Rest,
         std::enable_if_t<Internal::all_are_convertible<Expr, Rest...>::value> * = nullptr>
inline Expr max(A &&a, B &&b, C &&c, Rest &&...rest) {
    return max(std::forward<A>(a), max(std::forward<B>(b), std::forward<C>(c), std::forward<Rest>(rest)...));
}

Expr min(Expr a, Expr b);

/** Returns an expression representing the lesser of an expression
 * and a constant integer.  The integer is coerced to the type of the
 * expression. Errors if the integer is not representable as that
 * type. Vectorizes cleanly on most platforms (with the exception of
 * integer types on x86 without SSE4). */
Expr min(Expr a, int b);

/** Returns an expression representing the lesser of a constant
 * integer and an expression. The integer is coerced to the type of
 * the expression. Errors if the integer is not representable as that
 * type. Vectorizes cleanly on most platforms (with the exception of
 * integer types on x86 without SSE4). */
Expr min(int a, Expr b);

inline Expr min(float a, Expr b) {
    return min(Expr(a), std::move(b));
}
inline Expr min(Expr a, float b) {
    return min(std::move(a), Expr(b));
}

/** Returns an expression representing the lesser of an expressions
 * vector, after doing any necessary type coersion using
 * \ref Internal::match_types. Vectorizes cleanly on most platforms
 * (with the exception of integer types on x86 without SSE4).
 * The expressions are folded from right ie. min(.., min(.., ..)).
 * The arguments can be any mix of types but must all be convertible to Expr. */
template<typename A, typename B, typename C, typename... Rest,
         std::enable_if_t<Internal::all_are_convertible<Expr, Rest...>::value> * = nullptr>
inline Expr min(A &&a, B &&b, C &&c, Rest &&...rest) {
    return min(std::forward<A>(a), min(std::forward<B>(b), std::forward<C>(c), std::forward<Rest>(rest)...));
}

/** Operators on floats treats those floats as Exprs. Making these
 * explicit prevents implicit float->int casts that might otherwise
 * occur. */
// @{
inline Expr operator+(Expr a, float b) {
    return std::move(a) + Expr(b);
}
inline Expr operator+(float a, Expr b) {
    return Expr(a) + std::move(b);
}
inline Expr operator-(Expr a, float b) {
    return std::move(a) - Expr(b);
}
inline Expr operator-(float a, Expr b) {
    return Expr(a) - std::move(b);
}
inline Expr operator*(Expr a, float b) {
    return std::move(a) * Expr(b);
}
inline Expr operator*(float a, Expr b) {
    return Expr(a) * std::move(b);
}
inline Expr operator/(Expr a, float b) {
    return std::move(a) / Expr(b);
}
inline Expr operator/(float a, Expr b) {
    return Expr(a) / std::move(b);
}
inline Expr operator%(Expr a, float b) {
    return std::move(a) % Expr(b);
}
inline Expr operator%(float a, Expr b) {
    return Expr(a) % std::move(b);
}
inline Expr operator>(Expr a, float b) {
    return std::move(a) > Expr(b);
}
inline Expr operator>(float a, Expr b) {
    return Expr(a) > std::move(b);
}
inline Expr operator<(Expr a, float b) {
    return std::move(a) < Expr(b);
}
inline Expr operator<(float a, Expr b) {
    return Expr(a) < std::move(b);
}
inline Expr operator>=(Expr a, float b) {
    return std::move(a) >= Expr(b);
}
inline Expr operator>=(float a, Expr b) {
    return Expr(a) >= std::move(b);
}
inline Expr operator<=(Expr a, float b) {
    return std::move(a) <= Expr(b);
}
inline Expr operator<=(float a, Expr b) {
    return Expr(a) <= std::move(b);
}
inline Expr operator==(Expr a, float b) {
    return std::move(a) == Expr(b);
}
inline Expr operator==(float a, Expr b) {
    return Expr(a) == std::move(b);
}
inline Expr operator!=(Expr a, float b) {
    return std::move(a) != Expr(b);
}
inline Expr operator!=(float a, Expr b) {
    return Expr(a) != std::move(b);
}
// @}

/** Clamps an expression to lie within the given bounds. The bounds
 * are type-cast to match the expression. Vectorizes as well as min/max. */
Expr clamp(Expr a, const Expr &min_val, const Expr &max_val);

/** Returns the absolute value of a signed integer or floating-point
 * expression. Vectorizes cleanly. Unlike in C, abs of a signed
 * integer returns an unsigned integer of the same bit width. This
 * means that abs of the most negative integer doesn't overflow. */
Expr abs(Expr a);

/** Return the absolute difference between two values. Vectorizes
 * cleanly. Returns an unsigned value of the same bit width. There are
 * various ways to write this yourself, but they contain numerous
 * gotchas and don't always compile to good code, so use this
 * instead. */
Expr absd(Expr a, Expr b);

/** Returns an expression similar to the ternary operator in C, except
 * that it always evaluates all arguments. If the first argument is
 * true, then return the second, else return the third. Typically
 * vectorizes cleanly, but benefits from SSE41 or newer on x86. */
Expr select(Expr condition, Expr true_value, Expr false_value);

/** A multi-way variant of select similar to a switch statement in C,
 * which can accept multiple conditions and values in pairs. Evaluates
 * to the first value for which the condition is true. Returns the
 * final value if all conditions are false. */
template<typename... Args,
         std::enable_if_t<Internal::all_are_convertible<Expr, Args...>::value> * = nullptr>
inline Expr select(Expr c0, Expr v0, Expr c1, Expr v1, Args &&...args) {
    return select(std::move(c0), std::move(v0), select(std::move(c1), std::move(v1), std::forward<Args>(args)...));
}

/** Equivalent of ternary select(), but taking/returning tuples. If the condition is
 * a Tuple, it must match the size of the true and false Tuples. */
// @{
Tuple select(const Tuple &condition, const Tuple &true_value, const Tuple &false_value);
Tuple select(const Expr &condition, const Tuple &true_value, const Tuple &false_value);
// @}

/** Equivalent of multiway select(), but taking/returning tuples. If the condition is
 * a Tuple, it must match the size of the true and false Tuples. */
// @{
template<typename... Args>
inline Tuple select(const Tuple &c0, const Tuple &v0, const Tuple &c1, const Tuple &v1, Args &&...args) {
    return select(c0, v0, select(c1, v1, std::forward<Args>(args)...));
}
template<typename... Args>
inline Tuple select(const Expr &c0, const Tuple &v0, const Expr &c1, const Tuple &v1, Args &&...args) {
    return select(c0, v0, select(c1, v1, std::forward<Args>(args)...));
}
// @}

/** select applied to FuncRefs (e.g. select(x < 100, f(x), g(x))) is assumed to
 * return an Expr. A runtime error is produced if this is applied to
 * tuple-valued Funcs. In that case you should explicitly cast the second and
 * third args to Tuple to remove the ambiguity. */
// @{
Expr select(const Expr &condition, const FuncRef &true_value, const FuncRef &false_value);
template<typename... Args>
inline Expr select(const Expr &c0, const FuncRef &v0, const Expr &c1, const FuncRef &v1, Args &&...args) {
    return select(c0, v0, select(c1, v1, std::forward<Args>(args)...));
}
// @}

/** Oftentimes we want to pack a list of expressions with the same type
 * into a channel dimension, e.g.,
 * img(x, y, c) = select(c == 0, 100, // Red
 *                       c == 1, 50,  // Green
 *                               25); // Blue
 * This is tedious when the list is long. The following function
 * provide convinent syntax that allow one to write:
 * img(x, y, c) = mux(c, {100, 50, 25});
 *
 * As with the select equivalent, if the first argument (the index) is
 * out of range, the expression evaluates to the last value.
 */
// @{
Expr mux(const Expr &id, const std::initializer_list<Expr> &values);
Expr mux(const Expr &id, const std::vector<Expr> &values);
Expr mux(const Expr &id, const Tuple &values);
Expr mux(const Expr &id, const std::initializer_list<FuncRef> &values);
Tuple mux(const Expr &id, const std::initializer_list<Tuple> &values);
Tuple mux(const Expr &id, const std::vector<Tuple> &values);
// @}

/** Return the sine of a floating-point expression. If the argument is
 * not floating-point, it is cast to Float(32). Does not vectorize
 * well. */
Expr sin(Expr x);

/** Return the arcsine of a floating-point expression. If the argument
 * is not floating-point, it is cast to Float(32). Does not vectorize
 * well. */
Expr asin(Expr x);

/** Return the cosine of a floating-point expression. If the argument
 * is not floating-point, it is cast to Float(32). Does not vectorize
 * well. */
Expr cos(Expr x);

/** Return the arccosine of a floating-point expression. If the
 * argument is not floating-point, it is cast to Float(32). Does not
 * vectorize well. */
Expr acos(Expr x);

/** Return the tangent of a floating-point expression. If the argument
 * is not floating-point, it is cast to Float(32). Does not vectorize
 * well. */
Expr tan(Expr x);

/** Return the arctangent of a floating-point expression. If the
 * argument is not floating-point, it is cast to Float(32). Does not
 * vectorize well. */
Expr atan(Expr x);

/** Return the angle of a floating-point gradient. If the argument is
 * not floating-point, it is cast to Float(32). Does not vectorize
 * well. */
Expr atan2(Expr y, Expr x);

/** Return the hyperbolic sine of a floating-point expression.  If the
 *  argument is not floating-point, it is cast to Float(32). Does not
 *  vectorize well. */
Expr sinh(Expr x);

/** Return the hyperbolic arcsinhe of a floating-point expression.  If
 * the argument is not floating-point, it is cast to Float(32). Does
 * not vectorize well. */
Expr asinh(Expr x);

/** Return the hyperbolic cosine of a floating-point expression.  If
 * the argument is not floating-point, it is cast to Float(32). Does
 * not vectorize well. */
Expr cosh(Expr x);

/** Return the hyperbolic arccosine of a floating-point expression.
 * If the argument is not floating-point, it is cast to
 * Float(32). Does not vectorize well. */
Expr acosh(Expr x);

/** Return the hyperbolic tangent of a floating-point expression.  If
 * the argument is not floating-point, it is cast to Float(32). Does
 * not vectorize well. */
Expr tanh(Expr x);

/** Return the hyperbolic arctangent of a floating-point expression.
 * If the argument is not floating-point, it is cast to
 * Float(32). Does not vectorize well. */
Expr atanh(Expr x);

/** Return the square root of a floating-point expression. If the
 * argument is not floating-point, it is cast to Float(32). Typically
 * vectorizes cleanly. */
Expr sqrt(Expr x);

/** Return the square root of the sum of the squares of two
 * floating-point expressions. If the argument is not floating-point,
 * it is cast to Float(32). Vectorizes cleanly. */
Expr hypot(const Expr &x, const Expr &y);

/** Return the exponential of a floating-point expression. If the
 * argument is not floating-point, it is cast to Float(32). For
 * Float(64) arguments, this calls the system exp function, and does
 * not vectorize well. For Float(32) arguments, this function is
 * vectorizable, does the right thing for extremely small or extremely
 * large inputs, and is accurate up to the last bit of the
 * mantissa. Vectorizes cleanly. */
Expr exp(Expr x);

/** Return the logarithm of a floating-point expression. If the
 * argument is not floating-point, it is cast to Float(32). For
 * Float(64) arguments, this calls the system log function, and does
 * not vectorize well. For Float(32) arguments, this function is
 * vectorizable, does the right thing for inputs <= 0 (returns -inf or
 * nan), and is accurate up to the last bit of the
 * mantissa. Vectorizes cleanly. */
Expr log(Expr x);

/** Return one floating point expression raised to the power of
 * another. The type of the result is given by the type of the first
 * argument. If the first argument is not a floating-point type, it is
 * cast to Float(32). For Float(32), cleanly vectorizable, and
 * accurate up to the last few bits of the mantissa. Gets worse when
 * approaching overflow. Vectorizes cleanly. */
Expr pow(Expr x, Expr y);

/** Evaluate the error function erf. Only available for
 * Float(32). Accurate up to the last three bits of the
 * mantissa. Vectorizes cleanly. */
Expr erf(const Expr &x);

/** Fast vectorizable approximation to some trigonometric functions for
 * Float(32).  Absolute approximation error is less than 1e-5. Slow on x86 if
 * you don't have at least sse 4.1. */
// @{
Expr fast_sin(const Expr &x);
Expr fast_cos(const Expr &x);
// @}

/** Fast approximate cleanly vectorizable log for Float(32). Returns
 * nonsense for x <= 0.0f. Accurate up to the last 5 bits of the
 * mantissa. Vectorizes cleanly. Slow on x86 if you don't
 * have at least sse 4.1. */
Expr fast_log(const Expr &x);

/** Fast approximate cleanly vectorizable exp for Float(32). Returns
 * nonsense for inputs that would overflow or underflow. Typically
 * accurate up to the last 5 bits of the mantissa. Gets worse when
 * approaching overflow. Vectorizes cleanly. Slow on x86 if you don't
 * have at least sse 4.1. */
Expr fast_exp(const Expr &x);

/** Fast approximate cleanly vectorizable pow for Float(32). Returns
 * nonsense for x < 0.0f. Accurate up to the last 5 bits of the
 * mantissa for typical exponents. Gets worse when approaching
 * overflow. Vectorizes cleanly. Slow on x86 if you don't
 * have at least sse 4.1. */
Expr fast_pow(Expr x, Expr y);

/** Fast approximate inverse for Float(32). Corresponds to the rcpps
 * instruction on x86, and the vrecpe instruction on ARM. Vectorizes
 * cleanly. Note that this can produce slightly different results
 * across different implementations of the same architecture (e.g. AMD vs Intel),
 * even when strict_float is enabled. */
Expr fast_inverse(Expr x);

/** Fast approximate inverse square root for Float(32). Corresponds to
 * the rsqrtps instruction on x86, and the vrsqrte instruction on
 * ARM. Vectorizes cleanly. Note that this can produce slightly different results
 * across different implementations of the same architecture (e.g. AMD vs Intel),
 * even when strict_float is enabled. */
Expr fast_inverse_sqrt(Expr x);

/** Return the greatest whole number less than or equal to a
 * floating-point expression. If the argument is not floating-point,
 * it is cast to Float(32). The return value is still in floating
 * point, despite being a whole number. Vectorizes cleanly. */
Expr floor(Expr x);

/** Return the least whole number greater than or equal to a
 * floating-point expression. If the argument is not floating-point,
 * it is cast to Float(32). The return value is still in floating
 * point, despite being a whole number. Vectorizes cleanly. */
Expr ceil(Expr x);

/** Return the whole number closest to a floating-point expression. If the
 * argument is not floating-point, it is cast to Float(32). The return value is
 * still in floating point, despite being a whole number. On ties, we round
 * towards the nearest even integer. Note that this is not the same as
 * std::round in C, which rounds away from zero. On platforms without a native
 * instruction for this, it is emulated, and may be more expensive than
 * cast<int>(x + 0.5f) or similar. */
Expr round(Expr x);

/** Return the integer part of a floating-point expression. If the argument is
 * not floating-point, it is cast to Float(32). The return value is still in
 * floating point, despite being a whole number. Vectorizes cleanly. */
Expr trunc(Expr x);

/** Returns true if the argument is a Not a Number (NaN). Requires a
 * floating point argument.  Vectorizes cleanly.
 * Note that the Expr passed in will be evaluated in strict_float mode,
 * regardless of whether strict_float mode is enabled in the current Target. */
Expr is_nan(Expr x);

/** Returns true if the argument is Inf or -Inf. Requires a
 * floating point argument.  Vectorizes cleanly.
 * Note that the Expr passed in will be evaluated in strict_float mode,
 * regardless of whether strict_float mode is enabled in the current Target. */
Expr is_inf(Expr x);

/** Returns true if the argument is a finite value (ie, neither NaN nor Inf).
 * Requires a floating point argument.  Vectorizes cleanly.
 * Note that the Expr passed in will be evaluated in strict_float mode,
 * regardless of whether strict_float mode is enabled in the current Target. */
Expr is_finite(Expr x);

/** Return the fractional part of a floating-point expression. If the argument
 *  is not floating-point, it is cast to Float(32). The return value has the
 *  same sign as the original expression. Vectorizes cleanly. */
Expr fract(const Expr &x);

/** Reinterpret the bits of one value as another type. */
Expr reinterpret(Type t, Expr e);

template<typename T>
Expr reinterpret(Expr e) {
    return reinterpret(type_of<T>(), std::move(e));
}

/** Return the bitwise and of two expressions (which need not have the
 * same type).  The result type is the wider of the two expressions.
 * Only integral types are allowed and both expressions must be signed
 * or both must be unsigned. */
Expr operator&(Expr x, Expr y);

/** Return the bitwise and of an expression and an integer. The type
 * of the result is the type of the expression argument. */
// @{
Expr operator&(Expr x, int y);
Expr operator&(int x, Expr y);
// @}

/** Return the bitwise or of two expressions (which need not have the
 * same type).  The result type is the wider of the two expressions.
 * Only integral types are allowed and both expressions must be signed
 * or both must be unsigned. */
Expr operator|(Expr x, Expr y);

/** Return the bitwise or of an expression and an integer. The type of
 * the result is the type of the expression argument. */
// @{
Expr operator|(Expr x, int y);
Expr operator|(int x, Expr y);
// @}

/** Return the bitwise xor of two expressions (which need not have the
 * same type).  The result type is the wider of the two expressions.
 * Only integral types are allowed and both expressions must be signed
 * or both must be unsigned. */
Expr operator^(Expr x, Expr y);

/** Return the bitwise xor of an expression and an integer. The type
 * of the result is the type of the expression argument. */
// @{
Expr operator^(Expr x, int y);
Expr operator^(int x, Expr y);
// @}

/** Return the bitwise not of an expression. */
Expr operator~(Expr x);

/** Shift the bits of an integer value left. This is actually less
 * efficient than multiplying by 2^n, because Halide's optimization
 * passes understand multiplication, and will compile it to
 * shifting. This operator is only for if you really really need bit
 * shifting (e.g. because the exponent is a run-time parameter). The
 * type of the result is equal to the type of the first argument. Both
 * arguments must have integer type. */
// @{
Expr operator<<(Expr x, Expr y);
Expr operator<<(Expr x, int y);
// @}

/** Shift the bits of an integer value right. Does sign extension for
 * signed integers. This is less efficient than dividing by a power of
 * two. Halide's definition of division (always round to negative
 * infinity) means that all divisions by powers of two get compiled to
 * bit-shifting, and Halide's optimization routines understand
 * division and can work with it. The type of the result is equal to
 * the type of the first argument. Both arguments must have integer
 * type. */
// @{
Expr operator>>(Expr x, Expr y);
Expr operator>>(Expr x, int y);
// @}

/** Linear interpolate between the two values according to a weight.
 * \param zero_val The result when weight is 0
 * \param one_val The result when weight is 1
 * \param weight The interpolation amount
 *
 * Both zero_val and one_val must have the same type. All types are
 * supported, including bool.
 *
 * The weight is treated as its own type and must be float or an
 * unsigned integer type. It is scaled to the bit-size of the type of
 * x and y if they are integer, or converted to float if they are
 * float. Integer weights are converted to float via division by the
 * full-range value of the weight's type. Floating-point weights used
 * to interpolate between integer values must be between 0.0f and
 * 1.0f, and an error may be signaled if it is not provably so. (clamp
 * operators can be added to provide proof. Currently an error is only
 * signalled for constant weights.)
 *
 * For integer linear interpolation, out of range values cannot be
 * represented. In particular, weights that are conceptually less than
 * 0 or greater than 1.0 are not representable. As such the result is
 * always between x and y (inclusive of course). For lerp with
 * floating-point values and floating-point weight, the full range of
 * a float is valid, however underflow and overflow can still occur.
 *
 * Ordering is not required between zero_val and one_val:
 *     lerp(42, 69, .5f) == lerp(69, 42, .5f) == 56
 *
 * Results for integer types are for exactly rounded arithmetic. As
 * such, there are cases where 16-bit and float differ because 32-bit
 * floating-point (float) does not have enough precision to produce
 * the exact result. (Likely true for 32-bit integer
 * vs. double-precision floating-point as well.)
 *
 * At present, double precision and 64-bit integers are not supported.
 *
 * Generally, lerp will vectorize as if it were an operation on a type
 * twice the bit size of the inferred type for x and y.
 *
 * Some examples:
 * \code
 *
 *     // Since Halide does not have direct type delcarations, casts
 *     // below are used to indicate the types of the parameters.
 *     // Such casts not required or expected in actual code where types
 *     // are inferred.
 *
 *     lerp(cast<float>(x), cast<float>(y), cast<float>(w)) ->
 *       x * (1.0f - w) + y * w
 *
 *     lerp(cast<uint8_t>(x), cast<uint8_t>(y), cast<uint8_t>(w)) ->
 *       cast<uint8_t>(cast<uint8_t>(x) * (1.0f - cast<uint8_t>(w) / 255.0f) +
 *                     cast<uint8_t>(y) * cast<uint8_t>(w) / 255.0f + .5f)
 *
 *     // Note addition in Halide promoted uint8_t + int8_t to int16_t already,
 *     // the outer cast is added for clarity.
 *     lerp(cast<uint8_t>(x), cast<int8_t>(y), cast<uint8_t>(w)) ->
 *       cast<int16_t>(cast<uint8_t>(x) * (1.0f - cast<uint8_t>(w) / 255.0f) +
 *                     cast<int8_t>(y) * cast<uint8_t>(w) / 255.0f + .5f)
 *
 *     lerp(cast<int8_t>(x), cast<int8_t>(y), cast<float>(w)) ->
 *       cast<int8_t>(cast<int8_t>(x) * (1.0f - cast<float>(w)) +
 *                    cast<int8_t>(y) * cast<uint8_t>(w))
 *
 * \endcode
 * */
Expr lerp(Expr zero_val, Expr one_val, Expr weight);

/** Count the number of set bits in an expression. */
Expr popcount(Expr x);

/** Count the number of leading zero bits in an expression. If the expression is
 * zero, the result is the number of bits in the type. */
Expr count_leading_zeros(Expr x);

/** Count the number of trailing zero bits in an expression. If the expression is
 * zero, the result is the number of bits in the type. */
Expr count_trailing_zeros(Expr x);

/** Divide two integers, rounding towards zero. This is the typical
 * behavior of most hardware architectures, which differs from
 * Halide's division operator, which is Euclidean (rounds towards
 * -infinity). Will throw a runtime error if y is zero, or if y is -1
 * and x is the minimum signed integer. */
Expr div_round_to_zero(Expr x, Expr y);

/** Compute the remainder of dividing two integers, when division is
 * rounding toward zero. This is the typical behavior of most hardware
 * architectures, which differs from Halide's mod operator, which is
 * Euclidean (produces the remainder when division rounds towards
 * -infinity). Will throw a runtime error if y is zero. */
Expr mod_round_to_zero(Expr x, Expr y);

/** Return a random variable representing a uniformly distributed
 * float in the half-open interval [0.0f, 1.0f). For random numbers of
 * other types, use lerp with a random float as the last parameter.
 *
 * Optionally takes a seed.
 *
 * Note that:
 \code
 Expr x = random_float();
 Expr y = x + x;
 \endcode
 *
 * is very different to
 *
 \code
 Expr y = random_float() + random_float();
 \endcode
 *
 * The first doubles a random variable, and the second adds two
 * independent random variables.
 *
 * A given random variable takes on a unique value that depends
 * deterministically on the pure variables of the function they belong
 * to, the identity of the function itself, and which definition of
 * the function it is used in. They are, however, shared across tuple
 * elements.
 *
 * This function vectorizes cleanly.
 */
Expr random_float(Expr seed = Expr());

/** Return a random variable representing a uniformly distributed
 * unsigned 32-bit integer. See \ref random_float. Vectorizes cleanly. */
Expr random_uint(Expr seed = Expr());

/** Return a random variable representing a uniformly distributed
 * 32-bit integer. See \ref random_float. Vectorizes cleanly. */
Expr random_int(Expr seed = Expr());

/** Create an Expr that prints out its value whenever it is
 * evaluated. It also prints out everything else in the arguments
 * list, separated by spaces. This can include string literals. */
//@{
Expr print(const std::vector<Expr> &values);

template<typename... Args>
inline HALIDE_NO_USER_CODE_INLINE Expr print(Expr a, Args &&...args) {
    std::vector<Expr> collected_args = {std::move(a)};
    Internal::collect_print_args(collected_args, std::forward<Args>(args)...);
    return print(collected_args);
}
//@}

/** Create an Expr that prints whenever it is evaluated, provided that
 * the condition is true. */
// @{
Expr print_when(Expr condition, const std::vector<Expr> &values);

template<typename... Args>
inline HALIDE_NO_USER_CODE_INLINE Expr print_when(Expr condition, Expr a, Args &&...args) {
    std::vector<Expr> collected_args = {std::move(a)};
    Internal::collect_print_args(collected_args, std::forward<Args>(args)...);
    return print_when(std::move(condition), collected_args);
}

// @}

/** Create an Expr that that guarantees a precondition.
 * If 'condition' is true, the return value is equal to the first Expr.
 * If 'condition' is false, halide_error() is called, and the return value
 * is arbitrary. Any additional arguments after the first Expr are stringified
 * and passed as a user-facing message to halide_error(), similar to print().
 *
 * Note that this essentially *always* inserts a runtime check into the
 * generated code (except when the condition can be proven at compile time);
 * as such, it should be avoided inside inner loops, except for debugging
 * or testing purposes. Note also that it does not vectorize cleanly (vector
 * values will be scalarized for the check).
 *
 * However, using this to make assertions about (say) input values
 * can be useful, both in terms of correctness and (potentially) in terms
 * of code generation, e.g.
 \code
 Param<int> p;
 Expr y = require(p > 0, p);
 \endcode
 * will allow the optimizer to assume positive, nonzero values for y.
 */
// @{
Expr require(Expr condition, const std::vector<Expr> &values);

template<typename... Args>
inline HALIDE_NO_USER_CODE_INLINE Expr require(Expr condition, Expr value, Args &&...args) {
    std::vector<Expr> collected_args = {std::move(value)};
    Internal::collect_print_args(collected_args, std::forward<Args>(args)...);
    return require(std::move(condition), collected_args);
}
// @}

/** Return an undef value of the given type. Halide skips stores that
 * depend on undef values, so you can use this to mean "do not modify
 * this memory location". This is an escape hatch that can be used for
 * several things:
 *
 * You can define a reduction with no pure step, by setting the pure
 * step to undef. Do this only if you're confident that the update
 * steps are sufficient to correctly fill in the domain.
 *
 * For a tuple-valued reduction, you can write an update step that
 * only updates some tuple elements.
 *
 * You can define single-stage pipeline that only has update steps,
 * and depends on the values already in the output buffer.
 *
 * Use this feature with great caution, as you can use it to load from
 * uninitialized memory.
 */
Expr undef(Type t);

template<typename T>
inline Expr undef() {
    return undef(type_of<T>());
}

namespace Internal {

/** Return an expression that should never be evaluated. Expressions
 * that depend on unreachabale values are also unreachable, and
 * statements that execute unreachable expressions are also considered
 * unreachable. */
Expr unreachable(Type t = Int(32));

template<typename T>
inline Expr unreachable() {
    return unreachable(type_of<T>());
}

}  // namespace Internal

/** Control the values used in the memoization cache key for memoize.
 * Normally parameters and other external dependencies are
 * automatically inferred and added to the cache key. The memoize_tag
 * operator allows computing one expression and using either the
 * computed value, or one or more other expressions in the cache key
 * instead of the parameter dependencies of the computation. The
 * single argument version is completely safe in that the cache key
 * will use the actual computed value -- it is difficult or imposible
 * to produce erroneous caching this way. The more-than-one argument
 * version allows generating cache keys that do not uniquely identify
 * the computation and thus can result in caching errors.
 *
 * A potential use for the single argument version is to handle a
 * floating-point parameter that is quantized to a small
 * integer. Mutliple values of the float will produce the same integer
 * and moving the caching to using the integer for the key is more
 * efficient.
 *
 * The main use for the more-than-one argument version is to provide
 * cache key information for Handles and ImageParams, which otherwise
 * are not allowed inside compute_cached operations. E.g. when passing
 * a group of parameters to an external array function via a Handle,
 * memoize_tag can be used to isolate the actual values used by that
 * computation. If an ImageParam is a constant image with a persistent
 * digest, memoize_tag can be used to key computations using that image
 * on the digest. */
// @{
template<typename... Args>
inline HALIDE_NO_USER_CODE_INLINE Expr memoize_tag(Expr result, Args &&...args) {
    std::vector<Expr> collected_args{std::forward<Args>(args)...};
    return Internal::memoize_tag_helper(std::move(result), collected_args);
}
// @}

/** Expressions tagged with this intrinsic are considered to be part
 * of the steady state of some loop with a nasty beginning and end
 * (e.g. a boundary condition). When Halide encounters likely
 * intrinsics, it splits the containing loop body into three, and
 * tries to simplify down all conditions that lead to the likely. For
 * example, given the expression: select(x < 1, bar, x > 10, bar,
 * likely(foo)), Halide will split the loop over x into portions where
 * x < 1, 1 <= x <= 10, and x > 10.
 *
 * You're unlikely to want to call this directly. You probably want to
 * use the boundary condition helpers in the BoundaryConditions
 * namespace instead.
 */
Expr likely(Expr e);

/** Equivalent to likely, but only triggers a loop partitioning if
 * found in an innermost loop. */
Expr likely_if_innermost(Expr e);

/** Cast an expression to the halide type corresponding to the C++
 * type T. As part of the cast, clamp to the minimum and maximum
 * values of the result type. */
template<typename T>
Expr saturating_cast(Expr e) {
    return saturating_cast(type_of<T>(), std::move(e));
}

/** Cast an expression to a new type, clamping to the minimum and
 * maximum values of the result type. */
Expr saturating_cast(Type t, Expr e);

/** Makes a best effort attempt to preserve IEEE floating-point
 * semantics in evaluating an expression. May not be implemented for
 * all backends. (E.g. it is difficult to do this for C++ code
 * generation as it depends on the compiler flags used to compile the
 * generated code. */
Expr strict_float(const Expr &e);

/** Create an Expr that that promises another Expr is clamped but do
 * not generate code to check the assertion or modify the value. No
 * attempt is made to prove the bound at compile time. (If it is
 * proved false as a result of something else, an error might be
 * generated, but it is also possible the compiler will crash.) The
 * promised bound is used in bounds inference so it will allow
 * satisfying bounds checks as well as possibly aiding optimization.
 *
 * unsafe_promise_clamped returns its first argument, the Expr 'value'
 *
 * This is a very easy way to make Halide generate erroneous code if
 * the bound promises is not kept. Use sparingly when there is no
 * other way to convey the information to the compiler and it is
 * required for a valuable optimization.
 *
 * Unsafe promises can be checked by turning on
 * Target::CheckUnsafePromises. This is intended for debugging only.
 */
Expr unsafe_promise_clamped(const Expr &value, const Expr &min, const Expr &max);

namespace Internal {
/**
 * FOR INTERNAL USE ONLY.
 *
 * An entirely unchecked version of unsafe_promise_clamped, used
 * inside the compiler as an annotation of the known bounds of an Expr
 * when it has proved something is bounded and wants to record that
 * fact for later passes (notably bounds inference) to exploit. This
 * gets introduced by GuardWithIf tail strategies, because the bounds
 * machinery has a hard time exploiting if statement conditions.
 *
 * Unlike unsafe_promise_clamped, this expression is
 * context-dependent, because 'value' might be statically bounded at
 * some point in the IR (e.g. due to a containing if statement), but
 * not elsewhere.
 *
 * This intrinsic always evaluates to its first argument. If this value is
 * used by a side-effecting operation and it is outside the range specified
 * by its second and third arguments, behavior is undefined. The compiler can
 * therefore assume that the value is within the range given and optimize
 * accordingly. Note that this permits promise_clamped to evaluate to
 * something outside of the range, provided that this value is not used.
 *
 * Note that this produces an intrinsic that is marked as 'pure' and thus is
 * allowed to be hoisted, etc.; thus, extra care must be taken with its use.
 **/
Expr promise_clamped(const Expr &value, const Expr &min, const Expr &max);
}  // namespace Internal

/** Scatter and gather are used for update definition which must store
 * multiple values to distinct locations at the same time. The
 * multiple expressions on the right-hand-side are bundled together
 * into a "gather", which must match a "scatter" the the same number
 * of arguments on the left-hand-size. For example, to store the
 * values 1 and 2 to the locations (x, y, 3) and (x, y, 4),
 * respectively:
 *
\code
f(x, y, scatter(3, 4)) = gather(1, 2);
\endcode
 *
 * The result of gather or scatter can be treated as an
 * expression. Any containing operations on it can be assumed to
 * distribute over the elements. If two gather expressions are
 * combined with an arithmetic operator (e.g. added), they combine
 * element-wise. The following example stores the values 2 * x, 2 * y,
 * and 2 * c to the locations (x + 1, y, c), (x, y + 3, c), and (x, y,
 * c + 2) respectively:
 *
\code
f(x + scatter(1, 0, 0), y + scatter(0, 3, 0), c + scatter(0, 0, 2)) = 2 * gather(x, y, c);
\endcode
*
* Repeated values in the scatter cause multiple stores to the same
* location. The stores happen in order from left to right, so the
* rightmost value wins. The following code is equivalent to f(x) = 5
*
\code
f(scatter(x, x)) = gather(3, 5);
\endcode
*
* Gathers are most useful for algorithms which require in-place
* swapping or permutation of multiple elements, or other kinds of
* in-place mutations that require loading multiple inputs, doing some
* operations to them jointly, then storing them again. The following
* update definition swaps the values of f at locations 3 and 5 if an
* input parameter p is true:
*
\code
f(scatter(3, 5)) = f(select(p, gather(5, 3), gather(3, 5)));
\endcode
*
* For more examples of the use of scatter and gather, see
* test/correctness/multiple_scatter.cpp
*
* It is not currently possible to use scatter and gather to write an
* update definition in which the *number* of values loaded or stored
* varies, as the size of the scatter/gather packet must be fixed a
* compile-time. A workaround is to make the unwanted extra operations
* a redundant copy of the last operation, which will be
* dead-code-eliminated by the compiler. For example, the following
* update definition swaps the values at locations 3 and 5 when the
* parameter p is true, and rotates the values at locations 1, 2, and 3
* when it is false. The load from 3 and store to 5 will be redundantly
* repeated:
*
\code
f(select(p, scatter(3, 5, 5), scatter(1, 2, 3))) = f(select(p, gather(5, 3, 3), gather(2, 3, 1)));
\endcode
*
* Note that in the p == true case, we redundantly load from 3 and write
* to 5 twice.
*/
//@{
Expr scatter(const std::vector<Expr> &args);
Expr gather(const std::vector<Expr> &args);

template<typename... Args>
Expr scatter(const Expr &e, Args &&...args) {
    return scatter({e, std::forward<Args>(args)...});
}

template<typename... Args>
Expr gather(const Expr &e, Args &&...args) {
    return gather({e, std::forward<Args>(args)...});
}
// @}

/** Extract a contiguous subsequence of the bits of 'e', starting at the bit
 * index given by 'lsb', where zero is the least-significant bit, returning a
 * value of type 't'. Any out-of-range bits requested are filled with zeros.
 *
 * extract_bits is especially useful when one wants to load a small vector of a
 * wide type, and treat it as a larger vector of a smaller type. For example,
 * loading a vector of 32 uint8 values from a uint32 Func can be done as
 * follows:
\code
f8(x) = extract_bits<uint8_t>(f32(x/4), 8*(x%4));
f8.align_bounds(x, 4).vectorize(x, 32);
\endcode
 * Note that the align_bounds call is critical so that the narrow Exprs are
 * aligned to the wider Exprs. This makes the x%4 term collapse to a
 * constant. If f8 is an output Func, then constraining the min value of x to be
 * a known multiple of four would also be sufficient, e.g. via:
\code
f8.output_buffer().dim(0).set_min(0);
\endcode
 *
 * See test/correctness/extract_concat_bits.cpp for a complete example. */
// @{
Expr extract_bits(Type t, const Expr &e, const Expr &lsb);

template<typename T>
Expr extract_bits(const Expr &e, const Expr &lsb) {
    return extract_bits(type_of<T>(), e, lsb);
}
// @}

/** Given a number of Exprs of the same type, concatenate their bits producing a
 * single Expr of the same type code of the input but with more bits. The
 * number of arguments must be a power of two.
 *
 * concat_bits is especially useful when one wants to treat a Func containing
 * values of a narrow type as a Func containing fewer values of a wider
 * type. For example, the following code reinterprets vectors of 32 uint8 values
 * as a vector of 8 uint32s:
 *
\code
f32(x) = concat_bits({f8(4*x), f8(4*x + 1), f8(4*x + 2), f8(4*x + 3)});
f32.vectorize(x, 8);
\endcode
 *
 * See test/correctness/extract_concat_bits.cpp for a complete example.
 */
Expr concat_bits(const std::vector<Expr> &e);

/** Below is a collection of intrinsics for fixed-point programming. Most of
 * them can be expressed via other means, but this is more natural for some, as
 * it avoids ghost widened intermediates that don't (or shouldn't) actually show
 * up in codegen, and doesn't rely on pattern-matching inside the compiler to
 * succeed to get good instruction selection.
 *
 * The semantics of each call are defined in terms of a non-existent 'widen' and
 * 'narrow' operators, which stand in for casts that double or halve the
 * bit-width of a type respectively.
 */

/** Compute a + widen(b). */
Expr widen_right_add(Expr a, Expr b);

/** Compute a * widen(b). */
Expr widen_right_mul(Expr a, Expr b);

/** Compute a - widen(b). */
Expr widen_right_sub(Expr a, Expr b);

/** Compute widen(a) + widen(b). */
Expr widening_add(Expr a, Expr b);

/** Compute widen(a) * widen(b). a and b may have different signedness, in which
 * case the result is signed. */
Expr widening_mul(Expr a, Expr b);

/** Compute widen(a) - widen(b). The result is always signed. */
Expr widening_sub(Expr a, Expr b);

/** Compute widen(a) << b. */
//@{
Expr widening_shift_left(Expr a, Expr b);
Expr widening_shift_left(Expr a, int b);
//@}

/** Compute widen(a) >> b. */
//@{
Expr widening_shift_right(Expr a, Expr b);
Expr widening_shift_right(Expr a, int b);
//@}

/** Compute saturating_narrow(widening_add(a, (1 >> min(b, 0)) / 2) << b).
 * When b is positive indicating a left shift, the rounding term is zero. */
//@{
Expr rounding_shift_left(Expr a, Expr b);
Expr rounding_shift_left(Expr a, int b);
//@}

/** Compute saturating_narrow(widening_add(a, (1 << max(b, 0)) / 2) >> b).
 * When b is negative indicating a left shift, the rounding term is zero. */
//@{
Expr rounding_shift_right(Expr a, Expr b);
Expr rounding_shift_right(Expr a, int b);
//@}

/** Compute saturating_narrow(widen(a) + widen(b)) */
Expr saturating_add(Expr a, Expr b);

/** Compute saturating_narrow(widen(a) - widen(b)) */
Expr saturating_sub(Expr a, Expr b);

/** Compute narrow((widen(a) + widen(b)) / 2) */
Expr halving_add(Expr a, Expr b);

/** Compute narrow((widen(a) + widen(b) + 1) / 2) */
Expr rounding_halving_add(Expr a, Expr b);

/** Compute narrow((widen(a) - widen(b)) / 2) */
Expr halving_sub(Expr a, Expr b);

/** Compute saturating_narrow(shift_right(widening_mul(a, b), q)) */
//@{
Expr mul_shift_right(Expr a, Expr b, Expr q);
Expr mul_shift_right(Expr a, Expr b, int q);
//@}

/** Compute saturating_narrow(rounding_shift_right(widening_mul(a, b), q)) */
//@{
Expr rounding_mul_shift_right(Expr a, Expr b, Expr q);
Expr rounding_mul_shift_right(Expr a, Expr b, int q);
//@}

/** Return a boolean Expr for the corresponding field of the Target
 * being used during lowering; they can be useful in writing library
 * code without having to plumb a Target through call sites, so that you
 * can do things like
 \code
    Expr e = select(target_arch_is(Target::ARM), something, something_else);
 \endcode
 * Note that this doesn't do any checking at runtime to verify that the Target
 * is valid for the current hardware configuration.
 */
//@{
Expr target_arch_is(Target::Arch arch);
Expr target_os_is(Target::OS os);
Expr target_has_feature(Target::Feature feat);
//@}

/** Return the bit width of the Target used during lowering; this can be useful
 * in writing library code without having to plumb a Target through call sites,
 * so that you can do things like
 \code
    Expr e = select(target_bits() == 32, something, something_else);
 \endcode
 * Note that this doesn't do any checking at runtime to verify that the Target
 * is valid for the current hardware configuration.
 */
Expr target_bits();

/** Return the natural vector width for the given Type for the Target
 * being used during lowering; this can be useful in writing library
 * code without having to plumb a Target through call sites, so that you
 * can do things like
 \code
    f.vectorize(x, target_natural_vector_size(Float(32)));
 \endcode
 * Note that this doesn't do any checking at runtime to verify that the Target
 * is valid for the current hardware configuration.
 */
//@{
Expr target_natural_vector_size(Type t);
template<typename data_t>
Expr target_natural_vector_size() {
    return target_natural_vector_size(type_of<data_t>());
}
//@}

}  // namespace Halide

#endif
#ifndef HALIDE_REALIZATION_H
#define HALIDE_REALIZATION_H

#include <cstdint>
#include <vector>


/** \file
 *
 * Defines Realization - a vector of Buffer for use in pipelines with multiple outputs.
 */

namespace Halide {

/** A Realization is a vector of references to existing Buffer objects.
 *  A pipeline with multiple outputs realize to a Realization.  */
class Realization {
private:
    std::vector<Buffer<void>> images;

public:
    /** The number of images in the Realization. */
    size_t size() const;

    /** Get a const reference to one of the images. */
    const Buffer<void> &operator[](size_t x) const;

    /** Get a reference to one of the images. */
    Buffer<void> &operator[](size_t x);

    /** Single-element realizations are implicitly castable to Buffers. */
    template<typename T, int Dims>
    operator Buffer<T, Dims>() const {
        user_assert(images.size() == 1) << "Cannot cast Realization with "
                                        << images.size() << " elements to a Buffer";
        return images[0];
    }

    /** Construct a Realization that acts as a reference to a single
     * existing Buffer. The element type of the Buffer may not be
     * const. */
    // @{
    explicit Realization(const Buffer<void> &e);
    explicit Realization(Buffer<void> &&e);
    // @}

    /** Construct a Realization that refers to the buffers in an
     * existing vector of Buffer<>. The element type of the Buffer(s) may not be
     * const */
    // @{
    explicit Realization(const std::vector<Buffer<void>> &e);
    explicit Realization(std::vector<Buffer<void>> &&e);
    // This ctor allows us to avoid ambiguity when the vector is specified as
    // a braced literal, e.g. `Realization({first, second})`
    explicit Realization(std::initializer_list<Buffer<void>> e)
        : Realization(std::vector<Buffer<void>>{e}) {
    }
    // @}

    /** Call device_sync() for all Buffers in the Realization.
     * If one of the calls returns an error, subsequent Buffers won't have
     * device_sync called; thus callers should consider a nonzero return
     * code to mean that potentially all of the Buffers are in an indeterminate
     * state of sync.
     * Calling this explicitly should rarely be necessary, except for profiling. */
    int device_sync(void *ctx = nullptr);
};

}  // namespace Halide

#endif

namespace Halide {

struct Argument;
class Callable;
class Func;
struct PipelineContents;

/** Special the Autoscheduler to be used (if any), along with arbitrary
 * additional arguments specific to the given Autoscheduler.
 *
 * The 'name' field specifies the type of Autoscheduler
 * to be used (e.g. Adams2019, Mullapudi2016). If this is an empty string,
 * no autoscheduling will be done; if not, it must be the name of a known Autoscheduler.
 *
 * At this time, well-known autoschedulers include:
 *  "Mullapudi2016" -- heuristics-based; the first working autoscheduler; currently built in to libHalide
 *                     see http://graphics.cs.cmu.edu/projects/halidesched/
 *  "Adams2019"     -- aka "the ML autoscheduler"; currently located in apps/autoscheduler
 *                     see https://halide-lang.org/papers/autoscheduler2019.html
 *  "Li2018"        -- aka "the gradient autoscheduler"; currently located in apps/gradient_autoscheduler.
 *                     see https://people.csail.mit.edu/tzumao/gradient_halide
 *
 * The key/value pairs in 'extra' are defined on a per-autoscheduler basis.
 * An autoscheduler can have any number of required or optional keys.
 */
struct AutoschedulerParams {
    std::string name;
    std::map<std::string, std::string> extra;

    AutoschedulerParams() = default;
    /*not-explicit*/ AutoschedulerParams(const std::string &name)
        : name(name) {
    }
    AutoschedulerParams(const std::string &name, const std::map<std::string, std::string> &extra)
        : name(name), extra(extra) {
    }

    std::string to_string() const;
};

namespace Internal {
class IRMutator;
struct JITCache;
struct JITCallArgs;
}  // namespace Internal

/**
 * Used to determine if the output printed to file should be as a normal string
 * or as an HTML file which can be opened in a browerser and manipulated via JS and CSS.*/
enum StmtOutputFormat {
    Text,
    HTML
};

namespace {
// Helper for deleting custom lowering passes. In the header so that
// it goes in user code on windows, where you can have multiple heaps.
template<typename T>
void delete_lowering_pass(T *pass) {
    delete pass;
}
}  // namespace

/** A custom lowering pass. See Pipeline::add_custom_lowering_pass. */
struct CustomLoweringPass {
    Internal::IRMutator *pass;
    std::function<void()> deleter;
};

struct JITExtern;

struct AutoSchedulerResults {
    Target target;                             // Target specified to the autoscheduler
    AutoschedulerParams autoscheduler_params;  // The autoscheduler used, along with its params
    std::string schedule_source;               // The C++ source code of the generated schedule
    std::vector<uint8_t> featurization;        // The featurization of the pipeline (if any)
};

class Pipeline;

using AutoSchedulerFn = std::function<void(const Pipeline &, const Target &, const AutoschedulerParams &, AutoSchedulerResults *outputs)>;

/** A class representing a Halide pipeline. Constructed from the Func
 * or Funcs that it outputs. */
class Pipeline {
public:
    struct RealizationArg {
        // Only one of the following may be non-null
        Realization *r{nullptr};
        halide_buffer_t *buf{nullptr};
        std::unique_ptr<std::vector<Buffer<>>> buffer_list;

        RealizationArg(Realization &r)
            : r(&r) {
        }
        RealizationArg(Realization &&r)
            : r(&r) {
        }
        RealizationArg(halide_buffer_t *buf)
            : buf(buf) {
        }
        template<typename T, int Dims>
        RealizationArg(Runtime::Buffer<T, Dims> &dst)
            : buf(dst.raw_buffer()) {
        }
        template<typename T, int Dims>
        HALIDE_NO_USER_CODE_INLINE RealizationArg(Buffer<T, Dims> &dst)
            : buf(dst.raw_buffer()) {
        }
        template<typename T, int Dims, typename... Args,
                 typename = std::enable_if_t<Internal::all_are_convertible<Buffer<>, Args...>::value>>
        RealizationArg(Buffer<T, Dims> &a, Args &&...args)
            : buffer_list(std::make_unique<std::vector<Buffer<>>>(std::initializer_list<Buffer<>>{a, std::forward<Args>(args)...})) {
        }
        RealizationArg(RealizationArg &&from) = default;

        size_t size() const {
            if (r != nullptr) {
                return r->size();
            } else if (buffer_list) {
                return buffer_list->size();
            }
            return 1;
        }
    };

private:
    Internal::IntrusivePtr<PipelineContents> contents;

    void prepare_jit_call_arguments(RealizationArg &output, const Target &target,
                                    JITUserContext **user_context, bool is_bounds_inference, Internal::JITCallArgs &args_result);

    static std::vector<Internal::JITModule> make_externs_jit_module(const Target &target,
                                                                    std::map<std::string, JITExtern> &externs_in_out);

    static std::map<std::string, AutoSchedulerFn> &get_autoscheduler_map();

    static AutoSchedulerFn find_autoscheduler(const std::string &autoscheduler_name);

    int call_jit_code(const Internal::JITCallArgs &args);

    // Get the value of contents->jit_target, but reality-check that the contents
    // sensibly match the value. Return Target() if not jitted.
    Target get_compiled_jit_target() const;

    static Internal::JITCache compile_jit_cache(const Module &module,
                                                std::vector<Argument> args,
                                                const std::vector<Internal::Function> &outputs,
                                                const std::map<std::string, JITExtern> &jit_externs,
                                                const Target &target_arg);

public:
    /** Make an undefined Pipeline object. */
    Pipeline();

    /** Make a pipeline that computes the given Func. Schedules the
     * Func compute_root(). */
    Pipeline(const Func &output);

    /** Make a pipeline that computes the givens Funcs as
     * outputs. Schedules the Funcs compute_root(). */
    Pipeline(const std::vector<Func> &outputs);

    /** Make a pipeline from deserialization. */
    Pipeline(const std::vector<Func> &outputs, const std::vector<Internal::Stmt> &requirements);

    std::vector<Argument> infer_arguments(const Internal::Stmt &body);

    /** Get the Funcs this pipeline outputs. */
    std::vector<Func> outputs() const;

    /** Get the requirements of this pipeline. */
    std::vector<Internal::Stmt> requirements() const;

    /** Generate a schedule for the pipeline using the specified autoscheduler. */
    AutoSchedulerResults apply_autoscheduler(const Target &target,
                                             const AutoschedulerParams &autoscheduler_params) const;

    /** Add a new the autoscheduler method with the given name. Does not affect the current default autoscheduler.
     * It is an error to call this with the same name multiple times. */
    static void add_autoscheduler(const std::string &autoscheduler_name, const AutoSchedulerFn &autoscheduler);

    /** Return handle to the index-th Func within the pipeline based on the
     * topological order. */
    Func get_func(size_t index);

    /** Compile and generate multiple target files with single call.
     * Deduces target files based on filenames specified in
     * output_files map.
     */
    void compile_to(const std::map<OutputFileType, std::string> &output_files,
                    const std::vector<Argument> &args,
                    const std::string &fn_name,
                    const Target &target);

    /** Statically compile a pipeline to llvm bitcode, with the given
     * filename (which should probably end in .bc), type signature,
     * and C function name. If you're compiling a pipeline with a
     * single output Func, see also Func::compile_to_bitcode. */
    void compile_to_bitcode(const std::string &filename,
                            const std::vector<Argument> &args,
                            const std::string &fn_name,
                            const Target &target = get_target_from_environment());

    /** Statically compile a pipeline to llvm assembly, with the given
     * filename (which should probably end in .ll), type signature,
     * and C function name. If you're compiling a pipeline with a
     * single output Func, see also Func::compile_to_llvm_assembly. */
    void compile_to_llvm_assembly(const std::string &filename,
                                  const std::vector<Argument> &args,
                                  const std::string &fn_name,
                                  const Target &target = get_target_from_environment());

    /** Statically compile a pipeline with multiple output functions to an
     * object file, with the given filename (which should probably end in
     * .o or .obj), type signature, and C function name (which defaults to
     * the same name as this halide function. You probably don't want to
     * use this directly; call compile_to_static_library or compile_to_file instead. */
    void compile_to_object(const std::string &filename,
                           const std::vector<Argument> &,
                           const std::string &fn_name,
                           const Target &target = get_target_from_environment());

    /** Emit a header file with the given filename for a pipeline. The
     * header will define a function with the type signature given by
     * the second argument, and a name given by the third. You don't
     * actually have to have defined any of these functions yet to
     * call this. You probably don't want to use this directly; call
     * compile_to_static_library or compile_to_file instead. */
    void compile_to_header(const std::string &filename,
                           const std::vector<Argument> &,
                           const std::string &fn_name,
                           const Target &target = get_target_from_environment());

    /** Statically compile a pipeline to text assembly equivalent to
     * the object file generated by compile_to_object. This is useful
     * for checking what Halide is producing without having to
     * disassemble anything, or if you need to feed the assembly into
     * some custom toolchain to produce an object file. */
    void compile_to_assembly(const std::string &filename,
                             const std::vector<Argument> &args,
                             const std::string &fn_name,
                             const Target &target = get_target_from_environment());

    /** Statically compile a pipeline to C source code. This is useful
     * for providing fallback code paths that will compile on many
     * platforms. Vectorization will fail, and parallelization will
     * produce serial code. */
    void compile_to_c(const std::string &filename,
                      const std::vector<Argument> &,
                      const std::string &fn_name,
                      const Target &target = get_target_from_environment());

    /** Write out an internal representation of lowered code. Useful
     * for analyzing and debugging scheduling. Can emit html or plain
     * text. */
    void compile_to_lowered_stmt(const std::string &filename,
                                 const std::vector<Argument> &args,
                                 StmtOutputFormat fmt = Text,
                                 const Target &target = get_target_from_environment());

    /** Write out a conceptual representation of lowered code, before any parallel loop
     * get factored out into separate functions, or GPU loops are offloaded to kernel code.r
     * Useful for analyzing and debugging scheduling. Can emit html or plain text. */
    void compile_to_conceptual_stmt(const std::string &filename,
                                    const std::vector<Argument> &args,
                                    StmtOutputFormat fmt = Text,
                                    const Target &target = get_target_from_environment());

    /** Write out the loop nests specified by the schedule for this
     * Pipeline's Funcs. Helpful for understanding what a schedule is
     * doing. */
    void print_loop_nest();

    /** Compile to object file and header pair, with the given
     * arguments. */
    void compile_to_file(const std::string &filename_prefix,
                         const std::vector<Argument> &args,
                         const std::string &fn_name,
                         const Target &target = get_target_from_environment());

    /** Compile to static-library file and header pair, with the given
     * arguments. */
    void compile_to_static_library(const std::string &filename_prefix,
                                   const std::vector<Argument> &args,
                                   const std::string &fn_name,
                                   const Target &target = get_target_from_environment());

    /** Compile to static-library file and header pair once for each target;
     * each resulting function will be considered (in order) via halide_can_use_target_features()
     * at runtime, with the first appropriate match being selected for subsequent use.
     * This is typically useful for specializations that may vary unpredictably by machine
     * (e.g., SSE4.1/AVX/AVX2 on x86 desktop machines).
     * All targets must have identical arch-os-bits.
     */
    void compile_to_multitarget_static_library(const std::string &filename_prefix,
                                               const std::vector<Argument> &args,
                                               const std::vector<Target> &targets);

    /** Like compile_to_multitarget_static_library(), except that the object files
     * are all output as object files (rather than bundled into a static library).
     *
     * `suffixes` is an optional list of strings to use for as the suffix for each object
     * file. If nonempty, it must be the same length as `targets`. (If empty, Target::to_string()
     * will be used for each suffix.)
     *
     * Note that if `targets.size()` > 1, the wrapper code (to select the subtarget)
     * will be generated with the filename `${filename_prefix}_wrapper.o`
     *
     * Note that if `targets.size()` > 1 and `no_runtime` is not specified, the runtime
     * will be generated with the filename `${filename_prefix}_runtime.o`
     */
    void compile_to_multitarget_object_files(const std::string &filename_prefix,
                                             const std::vector<Argument> &args,
                                             const std::vector<Target> &targets,
                                             const std::vector<std::string> &suffixes);

    /** Create an internal representation of lowered code as a self
     * contained Module suitable for further compilation. */
    Module compile_to_module(const std::vector<Argument> &args,
                             const std::string &fn_name,
                             const Target &target = get_target_from_environment(),
                             LinkageType linkage_type = LinkageType::ExternalPlusMetadata);

    /** Eagerly jit compile the function to machine code. This
     * normally happens on the first call to realize. If you're
     * running your halide pipeline inside time-sensitive code and
     * wish to avoid including the time taken to compile a pipeline,
     * then you can call this ahead of time. Default is to use the Target
     * returned from Halide::get_jit_target_from_environment()
     */
    void compile_jit(const Target &target = get_jit_target_from_environment());

    /** Eagerly jit compile the function to machine code and return a callable
     * struct that behaves like a function pointer. The calling convention
     * will exactly match that of an AOT-compiled version of this Func
     * with the same Argument list.
     */
    Callable compile_to_callable(const std::vector<Argument> &args,
                                 const Target &target = get_jit_target_from_environment());

    /** Install a set of external C functions or Funcs to satisfy
     * dependencies introduced by HalideExtern and define_extern
     * mechanisms. These will be used by calls to realize,
     * infer_bounds, and compile_jit. */
    void set_jit_externs(const std::map<std::string, JITExtern> &externs);

    /** Return the map of previously installed externs. Is an empty
     * map unless set otherwise. */
    const std::map<std::string, JITExtern> &get_jit_externs();

    /** Get a struct containing the currently set custom functions
     * used by JIT. This can be mutated. Changes will take effect the
     * next time this Pipeline is realized. */
    JITHandlers &jit_handlers();

    /** Add a custom pass to be used during lowering. It is run after
     * all other lowering passes. Can be used to verify properties of
     * the lowered Stmt, instrument it with extra code, or otherwise
     * modify it. The Func takes ownership of the pass, and will call
     * delete on it when the Func goes out of scope. So don't pass a
     * stack object, or share pass instances between multiple
     * Funcs. */
    template<typename T>
    void add_custom_lowering_pass(T *pass) {
        // Template instantiate a custom deleter for this type, then
        // wrap in a lambda. The custom deleter lives in user code, so
        // that deletion is on the same heap as construction (I hate Windows).
        add_custom_lowering_pass(pass, [pass]() { delete_lowering_pass<T>(pass); });
    }

    /** Add a custom pass to be used during lowering, with the
     * function that will be called to delete it also passed in. Set
     * it to nullptr if you wish to retain ownership of the object. */
    void add_custom_lowering_pass(Internal::IRMutator *pass, std::function<void()> deleter);

    /** Remove all previously-set custom lowering passes */
    void clear_custom_lowering_passes();

    /** Get the custom lowering passes. */
    const std::vector<CustomLoweringPass> &custom_lowering_passes();

    /** See Func::realize */
    Realization realize(std::vector<int32_t> sizes = {}, const Target &target = Target());

    /** Same as above, but takes a custom user-provided context to be
     * passed to runtime functions. A nullptr context is legal, and is
     * equivalent to calling the variant of realize that does not take
     * a context. */
    Realization realize(JITUserContext *context,
                        std::vector<int32_t> sizes = {},
                        const Target &target = Target());

    /** Evaluate this Pipeline into an existing allocated buffer or
     * buffers. If the buffer is also one of the arguments to the
     * function, strange things may happen, as the pipeline isn't
     * necessarily safe to run in-place. The realization should
     * contain one Buffer per tuple component per output Func. For
     * each individual output Func, all Buffers must have the same
     * shape, but the shape can vary across the different output
     * Funcs. This form of realize does *not* automatically copy data
     * back from the GPU. */
    void realize(RealizationArg output, const Target &target = Target());

    /** Same as above, but takes a custom user-provided context to be
     * passed to runtime functions. A nullptr context is legal, and
     * is equivalent to calling the variant of realize that does not
     * take a context. */
    void realize(JITUserContext *context,
                 RealizationArg output,
                 const Target &target = Target());

    /** For a given size of output, or a given set of output buffers,
     * determine the bounds required of all unbound ImageParams
     * referenced. Communicates the result by allocating new buffers
     * of the appropriate size and binding them to the unbound
     * ImageParams. */
    // @{
    void infer_input_bounds(const std::vector<int32_t> &sizes,
                            const Target &target = get_jit_target_from_environment());
    void infer_input_bounds(RealizationArg output,
                            const Target &target = get_jit_target_from_environment());
    // @}

    /** Variants of infer_inputs_bounds that take a custom user context */
    // @{
    void infer_input_bounds(JITUserContext *context,
                            const std::vector<int32_t> &sizes,
                            const Target &target = get_jit_target_from_environment());
    void infer_input_bounds(JITUserContext *context,
                            RealizationArg output,
                            const Target &target = get_jit_target_from_environment());
    // @}

    /** Infer the arguments to the Pipeline, sorted into a canonical order:
     * all buffers (sorted alphabetically by name), followed by all non-buffers
     * (sorted alphabetically by name).
     This lets you write things like:
     \code
     pipeline.compile_to_assembly("/dev/stdout", pipeline.infer_arguments());
     \endcode
     */
    std::vector<Argument> infer_arguments();

    /** Check if this pipeline object is defined. That is, does it
     * have any outputs? */
    bool defined() const;

    /** Invalidate any internal cached state, e.g. because Funcs have
     * been rescheduled. */
    void invalidate_cache();

    /** Add a top-level precondition to the generated pipeline,
     * expressed as a boolean Expr. The Expr may depend on parameters
     * only, and may not call any Func or use a Var. If the condition
     * is not true at runtime, the pipeline will call halide_error
     * with the remaining arguments, and return
     * halide_error_code_requirement_failed. Requirements are checked
     * in the order added. */
    // @{
    void add_requirement(const Expr &condition, const std::vector<Expr> &error_args);

    template<typename... Args,
             typename = std::enable_if_t<Internal::all_are_printable_args<Args...>::value>>
    HALIDE_NO_USER_CODE_INLINE void add_requirement(const Expr &condition, Args &&...error_args) {
        std::vector<Expr> collected_args;
        Internal::collect_print_args(collected_args, std::forward<Args>(error_args)...);
        add_requirement(condition, collected_args);
    }
    // @}

    /** Generate begin_pipeline and end_pipeline tracing calls for this pipeline. */
    void trace_pipeline();

private:
    std::string generate_function_name() const;
};

struct ExternSignature {
private:
    Type ret_type_;  // Only meaningful if is_void_return is false; must be default value otherwise
    bool is_void_return_{false};
    std::vector<Type> arg_types_;

public:
    ExternSignature() = default;

    ExternSignature(const Type &ret_type, bool is_void_return, const std::vector<Type> &arg_types)
        : ret_type_(ret_type),
          is_void_return_(is_void_return),
          arg_types_(arg_types) {
        internal_assert(!(is_void_return && ret_type != Type()));
    }

    template<typename RT, typename... Args>
    explicit ExternSignature(RT (*f)(Args... args))
        : ret_type_(type_of<RT>()),
          is_void_return_(std::is_void_v<RT>),
          arg_types_({type_of<Args>()...}) {
    }

    const Type &ret_type() const {
        internal_assert(!is_void_return());
        return ret_type_;
    }

    bool is_void_return() const {
        return is_void_return_;
    }

    const std::vector<Type> &arg_types() const {
        return arg_types_;
    }

    friend std::ostream &operator<<(std::ostream &stream, const ExternSignature &sig) {
        if (sig.is_void_return_) {
            stream << "void";
        } else {
            stream << sig.ret_type_;
        }
        stream << " (*)(";
        bool comma = false;
        for (const auto &t : sig.arg_types_) {
            if (comma) {
                stream << ", ";
            }
            stream << t;
            comma = true;
        }
        stream << ")";
        return stream;
    }
};

struct ExternCFunction {
private:
    void *address_{nullptr};
    ExternSignature signature_;

public:
    ExternCFunction() = default;

    ExternCFunction(void *address, const ExternSignature &signature)
        : address_(address), signature_(signature) {
    }

    template<typename RT, typename... Args>
    ExternCFunction(RT (*f)(Args... args))
        : ExternCFunction((void *)f, ExternSignature(f)) {
    }

    void *address() const {
        return address_;
    }
    const ExternSignature &signature() const {
        return signature_;
    }
};

struct JITExtern {
private:
    // Note that exactly one of pipeline_ and extern_c_function_
    // can be set in a given JITExtern instance.
    Pipeline pipeline_;
    ExternCFunction extern_c_function_;

public:
    explicit JITExtern(Pipeline pipeline);
    explicit JITExtern(const Func &func);
    explicit JITExtern(const ExternCFunction &extern_c_function);

    template<typename RT, typename... Args>
    explicit JITExtern(RT (*f)(Args... args))
        : JITExtern(ExternCFunction(f)) {
    }

    const Pipeline &pipeline() const {
        return pipeline_;
    }
    const ExternCFunction &extern_c_function() const {
        return extern_c_function_;
    }
};

}  // namespace Halide

#endif
#ifndef HALIDE_RDOM_H
#define HALIDE_RDOM_H

/** \file
 * Defines the front-end syntax for reduction domains and reduction
 * variables.
 */

#include <iostream>
#include <string>
#include <utility>
#include <vector>


namespace Halide {

template<typename T, int Dims>
class Buffer;
class OutputImageParam;

/** A reduction variable represents a single dimension of a reduction
 * domain (RDom). Don't construct them directly, instead construct an
 * RDom, and use RDom::operator[] to get at the variables. For
 * single-dimensional reduction domains, you can just cast a
 * single-dimensional RDom to an RVar. */
class RVar {
    std::string _name;
    Internal::ReductionDomain _domain;
    int _index = -1;

    const Internal::ReductionVariable &_var() const {
        const auto &d = _domain.domain();
        internal_assert(_index >= 0 && _index < (int)d.size());
        return d.at(_index);
    }

public:
    /** An empty reduction variable. */
    RVar()
        : _name(Internal::unique_name('r')) {
    }

    /** Construct an RVar with the given name */
    explicit RVar(const std::string &n)
        : _name(n) {
    }

    /** Construct a reduction variable with the given name and
     * bounds. Must be a member of the given reduction domain. */
    RVar(Internal::ReductionDomain domain, int index)
        : _domain(std::move(domain)), _index(index) {
    }

    /** The minimum value that this variable will take on */
    Expr min() const;

    /** The number that this variable will take on. The maximum value
     * of this variable will be min() + extent() - 1 */
    Expr extent() const;

    /** The reduction domain this is associated with. */
    Internal::ReductionDomain domain() const {
        return _domain;
    }

    /** The name of this reduction variable */
    const std::string &name() const;

    /** Reduction variables can be used as expressions. */
    operator Expr() const;
};

/** A multi-dimensional domain over which to iterate. Used when
 * defining functions with update definitions.
 *
 * An reduction is a function with a two-part definition. It has an
 * initial value, which looks much like a pure function, and an update
 * definition, which may refer to some RDom. Evaluating such a
 * function first initializes it over the required domain (which is
 * inferred based on usage), and then runs update rule for all points
 * in the RDom. For example:
 *
 \code
 Func f;
 Var x;
 RDom r(0, 10);
 f(x) = x; // the initial value
 f(r) = f(r) * 2;
 Buffer<int> result = f.realize({10});
 \endcode
 *
 * This function creates a single-dimensional buffer of size 10, in
 * which element x contains the value x*2. Internally, first the
 * initialization rule fills in x at every site, and then the update
 * definition doubles every site.
 *
 * One use of reductions is to build a function recursively (pure
 * functions in halide cannot be recursive). For example, this
 * function fills in an array with the first 20 fibonacci numbers:
 *
 \code
 Func f;
 Var x;
 RDom r(2, 18);
 f(x) = 1;
 f(r) = f(r-1) + f(r-2);
 \endcode
 *
 * Another use of reductions is to perform scattering operations, as
 * unlike a pure function declaration, the left-hand-side of an update
 * definition may contain general expressions:
 *
 \code
 ImageParam input(UInt(8), 2);
 Func histogram;
 Var x;
 RDom r(input); // Iterate over all pixels in the input
 histogram(x) = 0;
 histogram(input(r.x, r.y)) = histogram(input(r.x, r.y)) + 1;
 \endcode
 *
 * An update definition may also be multi-dimensional. This example
 * computes a summed-area table by first summing horizontally and then
 * vertically:
 *
 \code
 ImageParam input(Float(32), 2);
 Func sum_x, sum_y;
 Var x, y;
 RDom r(input);
 sum_x(x, y)     = input(x, y);
 sum_x(r.x, r.y) = sum_x(r.x, r.y) + sum_x(r.x-1, r.y);
 sum_y(x, y)     = sum_x(x, y);
 sum_y(r.x, r.y) = sum_y(r.x, r.y) + sum_y(r.x, r.y-1);
 \endcode
 *
 * You can also mix pure dimensions with reduction variables. In the
 * previous example, note that there's no need for the y coordinate in
 * sum_x to be traversed serially. The sum within each row is entirely
 * independent. The rows could be computed in parallel, or in a
 * different order, without changing the meaning. Therefore, we can
 * instead write this definition as follows:
 *
 \code
 ImageParam input(Float(32), 2);
 Func sum_x, sum_y;
 Var x, y;
 RDom r(input);
 sum_x(x, y)   = input(x, y);
 sum_x(r.x, y) = sum_x(r.x, y) + sum_x(r.x-1, y);
 sum_y(x, y)   = sum_x(x, y);
 sum_y(x, r.y) = sum_y(x, r.y) + sum_y(x, r.y-1);
 \endcode
 *
 * This lets us schedule it more flexibly. You can now parallelize the
 * update step of sum_x over y by calling:
 \code
 sum_x.update().parallel(y).
 \endcode
 *
 * Note that calling sum_x.parallel(y) only parallelizes the
 * initialization step, and not the update step! Scheduling the update
 * step of a reduction must be done using the handle returned by
 * \ref Func::update(). This code parallelizes both the initialization
 * step and the update step:
 *
 \code
 sum_x.parallel(y);
 sum_x.update().parallel(y);
 \endcode
 *
 * When you mix reduction variables and pure dimensions, the reduction
 * domain is traversed outermost. That is, for each point in the
 * reduction domain, the inferred pure domain is traversed in its
 * entirety. For the above example, this means that sum_x walks down
 * the columns, and sum_y walks along the rows. This may not be
 * cache-coherent. You may try reordering these dimensions using the
 * schedule, but Halide will return an error if it decides that this
 * risks changing the meaning of your function. The solution lies in
 * clever scheduling. If we say:
 *
 \code
 sum_x.compute_at(sum_y, y);
 \endcode
 *
 * Then the sum in x is computed only as necessary for each scanline
 * of the sum in y. This not only results in sum_x walking along the
 * rows, it also improves the locality of the entire pipeline.
 */
class RDom {
    Internal::ReductionDomain dom;

    void init_vars(const std::string &name);

    void validate_min_extent(const Expr &min, const Expr &extent);
    void initialize_from_region(const Region &region, std::string name = "");

    template<typename... Args>
    HALIDE_NO_USER_CODE_INLINE void initialize_from_region(Region &region, const Expr &min, const Expr &extent, Args &&...args) {
        validate_min_extent(min, extent);
        region.emplace_back(min, extent);
        initialize_from_region(region, std::forward<Args>(args)...);
    }

public:
    /** Construct an undefined reduction domain. */
    RDom() = default;

    /** Construct a multi-dimensional reduction domain with the given name. If the name
     * is left blank, a unique one is auto-generated. */
    // @{
    HALIDE_NO_USER_CODE_INLINE RDom(const Region &region, std::string name = "") {
        initialize_from_region(region, std::move(name));
    }

    template<typename... Args>
    HALIDE_NO_USER_CODE_INLINE RDom(const Expr &min, const Expr &extent, Args &&...args) {
        // This should really just be a delegating constructor, but I couldn't make
        // that work with variadic template unpacking in visual studio 2013
        Region region;
        initialize_from_region(region, min, extent, std::forward<Args>(args)...);
    }
    // @}

    /** Construct a reduction domain that iterates over all points in
     * a given Buffer or ImageParam. Has the same dimensionality as
     * the argument. */
    // @{
    RDom(const Buffer<void, -1> &);
    RDom(const OutputImageParam &);
    template<typename T, int Dims>
    HALIDE_NO_USER_CODE_INLINE RDom(const Buffer<T, Dims> &im)
        : RDom(Buffer<void, -1>(im)) {
    }
    // @}

    /** Construct a reduction domain that wraps an Internal ReductionDomain object. */
    RDom(const Internal::ReductionDomain &d);

    /** Get at the internal reduction domain object that this wraps. */
    Internal::ReductionDomain domain() const {
        return dom;
    }

    /** Check if this reduction domain is non-null */
    bool defined() const {
        return dom.defined();
    }

    /** Compare two reduction domains for equality of reference */
    bool same_as(const RDom &other) const {
        return dom.same_as(other.dom);
    }

    /** Get the dimensionality of a reduction domain */
    int dimensions() const;

    /** Get at one of the dimensions of the reduction domain */
    RVar operator[](int) const;

    /** Single-dimensional reduction domains can be used as RVars directly. */
    operator RVar() const;

    /** Single-dimensional reduction domains can be also be used as Exprs directly. */
    operator Expr() const;

    /** Add a predicate to the RDom. An RDom may have multiple
     * predicates associated with it. An update definition that uses
     * an RDom only iterates over the subset points in the domain for
     * which all of its predicates are true. The predicate expression
     * obeys the same rules as the expressions used on the
     * right-hand-side of the corresponding update definition. It may
     * refer to the RDom's variables and free variables in the Func's
     * update definition. It may include calls to other Funcs, or make
     * recursive calls to the same Func. This permits iteration over
     * non-rectangular domains, or domains with sizes that vary with
     * some free variable, or domains with shapes determined by some
     * other Func.
     *
     * Note that once RDom is used in the update definition of some
     * Func, no new predicates can be added to the RDom.
     *
     * Consider a simple example:
     \code
     RDom r(0, 20, 0, 20);
     r.where(r.x < r.y);
     r.where(r.x == 10);
     r.where(r.y > 13);
     f(r.x, r.y) += 1;
     \endcode
     * This is equivalent to:
     \code
     for (int r.y = 0; r.y < 20; r.y++) {
       if (r.y > 13) {
         for (int r.x = 0; r.x < 20; r.x++) {
           if (r.x == 10) {
             if (r.x < r.y) {
               f[r.x, r.y] += 1;
             }
           }
         }
       }
     }
     \endcode
     *
     * Where possible Halide restricts the range of the containing for
     * loops to avoid the cases where the predicate is false so that
     * the if statement can be removed entirely. The case above would
     * be further simplified into:
     *
     \code
     for (int r.y = 14; r.y < 20; r.y++) {
       f[10, r.y] += 1;
     }
     \endcode
     *
     * In general, the predicates that we can simplify away by
     * restricting loop ranges are inequalities that compare an inner
     * Var or RVar to some expression in outer Vars or RVars.
     *
     * You can also pack multiple conditions into one predicate like so:
     *
     \code
     RDom r(0, 20, 0, 20);
     r.where((r.x < r.y) && (r.x == 10) && (r.y > 13));
     f(r.x, r.y) += 1;
     \endcode
     *
     */
    void where(Expr predicate);

    /** Direct access to the first four dimensions of the reduction
     * domain. Some of these variables may be undefined if the
     * reduction domain has fewer than four dimensions. */
    // @{
    RVar x, y, z, w;
    // @}
};

/** Emit an RVar in a human-readable form */
std::ostream &operator<<(std::ostream &stream, const RVar &);

/** Emit an RDom in a human-readable form. */
std::ostream &operator<<(std::ostream &stream, const RDom &);
}  // namespace Halide

#endif
#ifndef HALIDE_VAR_H
#define HALIDE_VAR_H

/** \file
 * Defines the Var - the front-end variable
 */
#include <string>
#include <vector>


namespace Halide {

/** A Halide variable, to be used when defining functions. It is just
 * a name, and can be reused in places where no name conflict will
 * occur. It can be used in the left-hand-side of a function
 * definition, or as an Expr. As an Expr, it always has type
 * Int(32). */
class Var {
    /* The expression representing the Var. Guaranteed to be an
     * Internal::Variable of type Int(32). Created once on
     * construction of the Var to avoid making a fresh Expr every time
     * the Var is used in a context in which it will be converted to
     * one. */
    Expr e;

public:
    /** Construct a Var with the given name. Unlike Funcs, this will be treated
     * as the same Var as another other Var with the same name, including
     * implicit Vars. */
    Var(const std::string &n);

    /** Construct a Var with an automatically-generated unique name. */
    Var();

    /** Get the name of a Var */
    const std::string &name() const;

    /** Test if two Vars are the same. This simply compares the names. */
    bool same_as(const Var &other) const {
        return name() == other.name();
    }

    /** Implicit var constructor. Implicit variables are injected
     * automatically into a function call if the number of arguments
     * to the function are fewer than its dimensionality and a
     * placeholder ("_") appears in its argument list. Defining a
     * function to equal an expression containing implicit variables
     * similarly appends those implicit variables, in the same order,
     * to the left-hand-side of the definition where the placeholder
     * ('_') appears.
     *
     * For example, consider the definition:
     *
     \code
     Func f, g;
     Var x, y;
     f(x, y) = 3;
     \endcode
     *
     * A call to f with the placeholder symbol _
     * will have implicit arguments injected automatically, so f(2, _)
     * is equivalent to f(2, _0), where _0 = ImplicitVar<0>(), and f(_)
     * (and indeed f when cast to an Expr) is equivalent to f(_0, _1).
     * The following definitions are all equivalent, differing only in the
     * variable names.
     *
     \code
     g(_) = f*3;
     g(_) = f(_)*3;
     g(x, _) = f(x, _)*3;
     g(x, y) = f(x, y)*3;
     \endcode
     *
     * These are expanded internally as follows:
     *
     \code
     g(_0, _1) = f(_0, _1)*3;
     g(_0, _1) = f(_0, _1)*3;
     g(x, _0) = f(x, _0)*3;
     g(x, y) = f(x, y)*3;
     \endcode
     *
     * The following, however, defines g as four dimensional:
     \code
     g(x, y, _) = f*3;
     \endcode
     *
     * It is equivalent to:
     *
     \code
     g(x, y, _0, _1) = f(_0, _1)*3;
     \endcode
     *
     * Expressions requiring differing numbers of implicit variables
     * can be combined. The left-hand-side of a definition injects
     * enough implicit variables to cover all of them:
     *
     \code
     Func h;
     h(x) = x*3;
     g(x) = h + (f + f(x)) * f(x, y);
     \endcode
     *
     * expands to:
     *
     \code
     Func h;
     h(x) = x*3;
     g(x, _0, _1) = h(_0) + (f(_0, _1) + f(x, _0)) * f(x, y);
     \endcode
     *
     * The first ten implicits, _0 through _9, are predeclared in this
     * header and can be used for scheduling. They should never be
     * used as arguments in a declaration or used in a call.
     *
     * While it is possible to use Var::implicit or the predeclared
     * implicits to create expressions that can be treated as small
     * anonymous functions (e.g. Func(_0 + _1)) this is considered
     * poor style. Instead use \ref lambda.
     */
    static Var implicit(int n);

    /** Return whether a variable name is of the form for an implicit argument.
     */
    //{
    static bool is_implicit(const std::string &name);
    bool is_implicit() const {
        return is_implicit(name());
    }
    //}

    /** Return the argument index for a placeholder argument given its
     *  name. Returns 0 for _0, 1 for _1, etc. Returns -1 if
     *  the variable is not of implicit form.
     */
    //{
    static int implicit_index(const std::string &name) {
        return is_implicit(name) ? atoi(name.c_str() + 1) : -1;
    }
    int implicit_index() const {
        return implicit_index(name());
    }
    //}

    /** Test if a var is the placeholder variable _ */
    //{
    static bool is_placeholder(const std::string &name) {
        return name == "_";
    }
    bool is_placeholder() const {
        return is_placeholder(name());
    }
    //}

    /** A Var can be treated as an Expr of type Int(32) */
    operator const Expr &() const {
        return e;
    }

    /** A Var that represents the location outside the outermost loop. */
    static Var outermost() {
        return Var("__outermost");
    }
};

template<int N = -1>
struct ImplicitVar {
    Var to_var() const {
        if (N >= 0) {
            return Var::implicit(N);
        } else {
            return Var("_");
        }
    }

    operator Var() const {
        return to_var();
    }
    operator Expr() const {
        return to_var();
    }
};

/** A placeholder variable for inferred arguments. See \ref Var::implicit */
static constexpr ImplicitVar<> _;

/** The first ten implicit Vars for use in scheduling. See \ref Var::implicit */
// @{
static constexpr ImplicitVar<0> _0;
static constexpr ImplicitVar<1> _1;
static constexpr ImplicitVar<2> _2;
static constexpr ImplicitVar<3> _3;
static constexpr ImplicitVar<4> _4;
static constexpr ImplicitVar<5> _5;
static constexpr ImplicitVar<6> _6;
static constexpr ImplicitVar<7> _7;
static constexpr ImplicitVar<8> _8;
static constexpr ImplicitVar<9> _9;
// @}

namespace Internal {

/** Make a list of unique arguments for definitions with unnamed
    arguments. */
std::vector<Var> make_argument_list(int dimensionality);

}  // namespace Internal

}  // namespace Halide

#endif

#include <map>
#include <utility>

namespace Halide {

class OutputImageParam;

/** A class that can represent Vars or RVars. Used for reorder calls
 * which can accept a mix of either. */
struct VarOrRVar {
    VarOrRVar(const std::string &n, bool r)
        : var(n), rvar(n), is_rvar(r) {
    }
    VarOrRVar(const Var &v)
        : var(v), is_rvar(false) {
    }
    VarOrRVar(const RVar &r)
        : rvar(r), is_rvar(true) {
    }
    VarOrRVar(const RDom &r)
        : rvar(RVar(r)), is_rvar(true) {
    }
    template<int N>
    VarOrRVar(const ImplicitVar<N> &u)
        : var(u), is_rvar(false) {
    }

    const std::string &name() const {
        if (is_rvar) {
            return rvar.name();
        } else {
            return var.name();
        }
    }

    Var var;
    RVar rvar;
    bool is_rvar;
};

class ImageParam;

namespace Internal {
struct AssociativeOp;
class Function;
struct Split;
struct StorageDim;
}  // namespace Internal

/** A single definition of a Func. May be a pure or update definition. */
class Stage {
    /** Reference to the Function this stage (or definition) belongs to. */
    Internal::Function function;
    Internal::Definition definition;
    /** Indicate which stage the definition belongs to (0 for initial
     * definition, 1 for first update, etc.). */
    size_t stage_index;
    /** Pure Vars of the Function (from the init definition). */
    std::vector<Var> dim_vars;

    void set_dim_type(const VarOrRVar &var, Internal::ForType t);
    void set_dim_device_api(const VarOrRVar &var, DeviceAPI device_api);
    void split(const std::string &old, const std::string &outer, const std::string &inner,
               const Expr &factor, bool exact, TailStrategy tail);
    void remove(const std::string &var);

    const std::vector<Internal::StorageDim> &storage_dims() const {
        return function.schedule().storage_dims();
    }

    Stage &compute_with(LoopLevel loop_level, const std::map<std::string, LoopAlignStrategy> &align);

    std::pair<std::vector<Internal::Split>, std::vector<Internal::Split>>
    rfactor_validate_args(const std::vector<std::pair<RVar, Var>> &preserved, const Internal::AssociativeOp &prover_result);

public:
    Stage(Internal::Function f, Internal::Definition d, size_t stage_index)
        : function(std::move(f)), definition(std::move(d)), stage_index(stage_index) {
        internal_assert(definition.defined());

        dim_vars.reserve(function.args().size());
        for (const auto &arg : function.args()) {
            dim_vars.emplace_back(arg);
        }
        internal_assert(definition.args().size() == dim_vars.size());
    }

    /** Return the current StageSchedule associated with this Stage. For
     * introspection only: to modify schedule, use the Func interface. */
    const Internal::StageSchedule &get_schedule() const {
        return definition.schedule();
    }

    /** Return a string describing the current var list taking into
     * account all the splits, reorders, and tiles. */
    std::string dump_argument_list() const;

    /** Return the name of this stage, e.g. "f.update(2)" */
    std::string name() const;

    /** Calling rfactor() on an associative update definition a Func will split
     * the update into an intermediate which computes the partial results and
     * replaces the current update definition with a new definition which merges
     * the partial results. If called on a init/pure definition, this will
     * throw an error. rfactor() will automatically infer the associative reduction
     * operator and identity of the operator. If it can't prove the operation
     * is associative or if it cannot find an identity for that operator, this
     * will throw an error. In addition, commutativity of the operator is required
     * if rfactor() is called on the inner dimension but excluding the outer
     * dimensions.
     *
     * rfactor() takes as input 'preserved', which is a list of <RVar, Var> pairs.
     * The rvars not listed in 'preserved' are removed from the original Func and
     * are lifted to the intermediate Func. The remaining rvars (the ones in
     * 'preserved') are made pure in the intermediate Func. The intermediate Func's
     * update definition inherits all scheduling directives (e.g. split,fuse, etc.)
     * applied to the original Func's update definition. The loop order of the
     * intermediate Func's update definition is the same as the original, although
     * the RVars in 'preserved' are replaced by the new pure Vars. The loop order of the
     * intermediate Func's init definition from innermost to outermost is the args'
     * order of the original Func's init definition followed by the new pure Vars.
     *
     * The intermediate Func also inherits storage order from the original Func
     * with the new pure Vars added to the outermost.
     *
     * For example, f.update(0).rfactor({{r.y, u}}) would rewrite a pipeline like this:
     \code
     f(x, y) = 0;
     f(x, y) += g(r.x, r.y);
     \endcode
     * into a pipeline like this:
     \code
     f_intm(x, y, u) = 0;
     f_intm(x, y, u) += g(r.x, u);

     f(x, y) = 0;
     f(x, y) += f_intm(x, y, r.y);
     \endcode
     *
     * This has a variety of uses. You can use it to split computation of an associative reduction:
     \code
     f(x, y) = 10;
     RDom r(0, 96);
     f(x, y) = max(f(x, y), g(x, y, r.x));
     f.update(0).split(r.x, rxo, rxi, 8).reorder(y, x).parallel(x);
     f.update(0).rfactor({{rxo, u}}).compute_root().parallel(u).update(0).parallel(u);
     \endcode
     *
     *, which is equivalent to:
     \code
     parallel for u = 0 to 11:
       for y:
         for x:
           f_intm(x, y, u) = -inf
     parallel for x:
       for y:
         parallel for u = 0 to 11:
           for rxi = 0 to 7:
             f_intm(x, y, u) = max(f_intm(x, y, u), g(8*u + rxi))
     for y:
       for x:
         f(x, y) = 10
     parallel for x:
       for y:
         for rxo = 0 to 11:
           f(x, y) = max(f(x, y), f_intm(x, y, rxo))
     \endcode
     *
     */
    // @{
    Func rfactor(const std::vector<std::pair<RVar, Var>> &preserved);
    Func rfactor(const RVar &r, const Var &v);
    // @}

    /** Schedule the iteration over this stage to be fused with another
     * stage 's' from outermost loop to a given LoopLevel. 'this' stage will
     * be computed AFTER 's' in the innermost fused dimension. There should not
     * be any dependencies between those two fused stages. If either of the
     * stages being fused is a stage of an extern Func, this will throw an error.
     *
     * Note that the two stages that are fused together should have the same
     * exact schedule from the outermost to the innermost fused dimension, and
     * the stage we are calling compute_with on should not have specializations,
     * e.g. f2.compute_with(f1, x) is allowed only if f2 has no specializations.
     *
     * Also, if a producer is desired to be computed at the fused loop level,
     * the function passed to the compute_at() needs to be the "parent". Consider
     * the following code:
     \code
     input(x, y) = x + y;
     f(x, y) = input(x, y);
     f(x, y) += 5;
     g(x, y) = x - y;
     g(x, y) += 10;
     f.compute_with(g, y);
     f.update().compute_with(g.update(), y);
     \endcode
     *
     * To compute 'input' at the fused loop level at dimension y, we specify
     * input.compute_at(g, y) instead of input.compute_at(f, y) since 'g' is
     * the "parent" for this fused loop (i.e. 'g' is computed first before 'f'
     * is computed). On the other hand, to compute 'input' at the innermost
     * dimension of 'f', we specify input.compute_at(f, x) instead of
     * input.compute_at(g, x) since the x dimension of 'f' is not fused
     * (only the y dimension is).
     *
     * Given the constraints, this has a variety of uses. Consider the
     * following code:
     \code
     f(x, y) = x + y;
     g(x, y) = x - y;
     h(x, y) = f(x, y) + g(x, y);
     f.compute_root();
     g.compute_root();
     f.split(x, xo, xi, 8);
     g.split(x, xo, xi, 8);
     g.compute_with(f, xo);
     \endcode
     *
     * This is equivalent to:
     \code
     for y:
       for xo:
         for xi:
           f(8*xo + xi) = (8*xo + xi) + y
         for xi:
           g(8*xo + xi) = (8*xo + xi) - y
     for y:
       for x:
         h(x, y) = f(x, y) + g(x, y)
     \endcode
     *
     * The size of the dimensions of the stages computed_with do not have
     * to match. Consider the following code where 'g' is half the size of 'f':
     \code
     Image<int> f_im(size, size), g_im(size/2, size/2);
     input(x, y) = x + y;
     f(x, y) = input(x, y);
     g(x, y) = input(2*x, 2*y);
     g.compute_with(f, y);
     input.compute_at(f, y);
     Pipeline({f, g}).realize({f_im, g_im});
     \endcode
     *
     * This is equivalent to:
     \code
     for y = 0 to size-1:
       for x = 0 to size-1:
         input(x, y) = x + y;
       for x = 0 to size-1:
         f(x, y) = input(x, y)
       for x = 0 to size/2-1:
         if (y < size/2-1):
           g(x, y) = input(2*x, 2*y)
     \endcode
     *
     * 'align' specifies how the loop iteration of each dimension of the
     * two stages being fused should be aligned in the fused loop nests
     * (see LoopAlignStrategy for options). Consider the following loop nests:
     \code
     for z = f_min_z to f_max_z:
       for y = f_min_y to f_max_y:
         for x = f_min_x to f_max_x:
           f(x, y, z) = x + y + z
     for z = g_min_z to g_max_z:
       for y = g_min_y to g_max_y:
         for x = g_min_x to g_max_x:
           g(x, y, z) = x - y - z
     \endcode
     *
     * If no alignment strategy is specified, the following loop nest will be
     * generated:
     \code
     for z = min(f_min_z, g_min_z) to max(f_max_z, g_max_z):
       for y = min(f_min_y, g_min_y) to max(f_max_y, g_max_y):
         for x = f_min_x to f_max_x:
           if (f_min_z <= z <= f_max_z):
             if (f_min_y <= y <= f_max_y):
               f(x, y, z) = x + y + z
         for x = g_min_x to g_max_x:
           if (g_min_z <= z <= g_max_z):
             if (g_min_y <= y <= g_max_y):
               g(x, y, z) = x - y - z
     \endcode
     *
     * Instead, these alignment strategies:
     \code
     g.compute_with(f, y, {{z, LoopAlignStrategy::AlignStart}, {y, LoopAlignStrategy::AlignEnd}});
     \endcode
     * will produce the following loop nest:
     \code
     f_loop_min_z = f_min_z
     f_loop_max_z = max(f_max_z, (f_min_z - g_min_z) + g_max_z)
     for z = f_min_z to f_loop_max_z:
       f_loop_min_y = min(f_min_y, (f_max_y - g_max_y) + g_min_y)
       f_loop_max_y = f_max_y
       for y = f_loop_min_y to f_loop_max_y:
         for x = f_min_x to f_max_x:
           if (f_loop_min_z <= z <= f_loop_max_z):
             if (f_loop_min_y <= y <= f_loop_max_y):
               f(x, y, z) = x + y + z
         for x = g_min_x to g_max_x:
           g_shift_z = g_min_z - f_loop_min_z
           g_shift_y = g_max_y - f_loop_max_y
           if (g_min_z <= (z + g_shift_z) <= g_max_z):
             if (g_min_y <= (y + g_shift_y) <= g_max_y):
               g(x, y + g_shift_y, z + g_shift_z) = x - (y + g_shift_y) - (z + g_shift_z)
     \endcode
     *
     * LoopAlignStrategy::AlignStart on dimension z will shift the loop iteration
     * of 'g' at dimension z so that its starting value matches that of 'f'.
     * Likewise, LoopAlignStrategy::AlignEnd on dimension y will shift the loop
     * iteration of 'g' at dimension y so that its end value matches that of 'f'.
     */
    // @{
    Stage &compute_with(LoopLevel loop_level, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
    Stage &compute_with(LoopLevel loop_level, LoopAlignStrategy align = LoopAlignStrategy::Auto);
    Stage &compute_with(const Stage &s, const VarOrRVar &var, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
    Stage &compute_with(const Stage &s, const VarOrRVar &var, LoopAlignStrategy align = LoopAlignStrategy::Auto);
    // @}

    /** Scheduling calls that control how the domain of this stage is
     * traversed. See the documentation for Func for the meanings. */
    // @{

    Stage &split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
    Stage &fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused);
    Stage &serial(const VarOrRVar &var);
    Stage &parallel(const VarOrRVar &var);
    Stage &vectorize(const VarOrRVar &var);
    Stage &unroll(const VarOrRVar &var);
    Stage &parallel(const VarOrRVar &var, const Expr &task_size, TailStrategy tail = TailStrategy::Auto);
    Stage &vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
    Stage &unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
    Stage &partition(const VarOrRVar &var, Partition partition_policy);
    Stage &never_partition_all();
    Stage &never_partition(const std::vector<VarOrRVar> &vars);
    Stage &always_partition_all();
    Stage &always_partition(const std::vector<VarOrRVar> &vars);

    Stage &tile(const VarOrRVar &x, const VarOrRVar &y,
                const VarOrRVar &xo, const VarOrRVar &yo,
                const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor,
                TailStrategy tail = TailStrategy::Auto);
    Stage &tile(const VarOrRVar &x, const VarOrRVar &y,
                const VarOrRVar &xi, const VarOrRVar &yi,
                const Expr &xfactor, const Expr &yfactor,
                TailStrategy tail = TailStrategy::Auto);
    Stage &tile(const std::vector<VarOrRVar> &previous,
                const std::vector<VarOrRVar> &outers,
                const std::vector<VarOrRVar> &inners,
                const std::vector<Expr> &factors,
                const std::vector<TailStrategy> &tails);
    Stage &tile(const std::vector<VarOrRVar> &previous,
                const std::vector<VarOrRVar> &outers,
                const std::vector<VarOrRVar> &inners,
                const std::vector<Expr> &factors,
                TailStrategy tail = TailStrategy::Auto);
    Stage &tile(const std::vector<VarOrRVar> &previous,
                const std::vector<VarOrRVar> &inners,
                const std::vector<Expr> &factors,
                TailStrategy tail = TailStrategy::Auto);
    Stage &reorder(const std::vector<VarOrRVar> &vars);

    template<typename... Args>
    HALIDE_NO_USER_CODE_INLINE std::enable_if_t<Internal::all_are_convertible<VarOrRVar, Args...>::value, Stage &>
    reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args) {
        std::vector<VarOrRVar> collected_args{x, y, std::forward<Args>(args)...};
        return reorder(collected_args);
    }

    template<typename... Args>
    HALIDE_NO_USER_CODE_INLINE std::enable_if_t<Internal::all_are_convertible<VarOrRVar, Args...>::value, Stage &>
    never_partition(const VarOrRVar &x, Args &&...args) {
        std::vector<VarOrRVar> collected_args{x, std::forward<Args>(args)...};
        return never_partition(collected_args);
    }

    template<typename... Args>
    HALIDE_NO_USER_CODE_INLINE std::enable_if_t<Internal::all_are_convertible<VarOrRVar, Args...>::value, Stage &>
    always_partition(const VarOrRVar &x, Args &&...args) {
        std::vector<VarOrRVar> collected_args{x, std::forward<Args>(args)...};
        return always_partition(collected_args);
    }

    Stage &rename(const VarOrRVar &old_name, const VarOrRVar &new_name);
    Stage specialize(const Expr &condition);
    void specialize_fail(const std::string &message);

    Stage &gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
    Stage &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
    Stage &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);

    Stage &gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);

    Stage &gpu_single_thread(DeviceAPI device_api = DeviceAPI::Default_GPU);

    Stage &gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
    Stage &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
    Stage &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api = DeviceAPI::Default_GPU);

    Stage &gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
    Stage &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y,
               const VarOrRVar &thread_x, const VarOrRVar &thread_y,
               DeviceAPI device_api = DeviceAPI::Default_GPU);
    Stage &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z,
               const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z,
               DeviceAPI device_api = DeviceAPI::Default_GPU);

    Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size,
                    TailStrategy tail = TailStrategy::Auto,
                    DeviceAPI device_api = DeviceAPI::Default_GPU);

    Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size,
                    TailStrategy tail = TailStrategy::Auto,
                    DeviceAPI device_api = DeviceAPI::Default_GPU);
    Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
                    const VarOrRVar &bx, const VarOrRVar &by,
                    const VarOrRVar &tx, const VarOrRVar &ty,
                    const Expr &x_size, const Expr &y_size,
                    TailStrategy tail = TailStrategy::Auto,
                    DeviceAPI device_api = DeviceAPI::Default_GPU);

    Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
                    const VarOrRVar &tx, const VarOrRVar &ty,
                    const Expr &x_size, const Expr &y_size,
                    TailStrategy tail = TailStrategy::Auto,
                    DeviceAPI device_api = DeviceAPI::Default_GPU);

    Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
                    const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz,
                    const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
                    const Expr &x_size, const Expr &y_size, const Expr &z_size,
                    TailStrategy tail = TailStrategy::Auto,
                    DeviceAPI device_api = DeviceAPI::Default_GPU);
    Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
                    const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
                    const Expr &x_size, const Expr &y_size, const Expr &z_size,
                    TailStrategy tail = TailStrategy::Auto,
                    DeviceAPI device_api = DeviceAPI::Default_GPU);

    Stage &allow_race_conditions();
    Stage &atomic(bool override_associativity_test = false);

    Stage &hexagon(const VarOrRVar &x = Var::outermost());

    Stage &prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
                    PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
    Stage &prefetch(const Parameter &param, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
                    PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
    template<typename T>
    Stage &prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
                    PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
        return prefetch(image.parameter(), at, from, std::move(offset), strategy);
    }
    // @}

    /** Get the Vars and RVars of this definition, from innermost out, with
     * splits applied. This represents all the potentially-valid compute_at
     * sites for this Stage. The RVars returned will be symbolic and not tied to
     * a particular reduction domain, like the naked RVar objects used as split
     * outputs. Note that this list by default will end with the sentinel
     * Var::outermost. */
    std::vector<VarOrRVar> split_vars() const;

    /** Assert that this stage has intentionally been given no schedule, and
     * suppress the warning about unscheduled update definitions that would
     * otherwise fire. This counts as a schedule, so calling this twice on the
     * same Stage will fail the assertion. */
    void unscheduled();
};

// For backwards compatibility, keep the ScheduleHandle name.
typedef Stage ScheduleHandle;

class FuncTupleElementRef;

/** A fragment of front-end syntax of the form f(x, y, z), where x, y,
 * z are Vars or Exprs. If could be the left hand side of a definition or
 * an update definition, or it could be a call to a function. We don't know
 * until we see how this object gets used.
 */
class FuncRef {
    Internal::Function func;
    int implicit_placeholder_pos;
    int implicit_count;
    std::vector<Expr> args;
    std::vector<Expr> args_with_implicit_vars(const std::vector<Expr> &e) const;

    /** Helper for function update by Tuple. If the function does not
     * already have a pure definition, init_val will be used as RHS of
     * each tuple element in the initial function definition. */
    template<typename BinaryOp>
    Stage func_ref_update(const Tuple &e, int init_val);

    /** Helper for function update by Expr. If the function does not
     * already have a pure definition, init_val will be used as RHS in
     * the initial function definition. */
    template<typename BinaryOp>
    Stage func_ref_update(const Expr &e, int init_val);

public:
    FuncRef(const Internal::Function &, const std::vector<Expr> &,
            int placeholder_pos = -1, int count = 0);
    FuncRef(Internal::Function, const std::vector<Var> &,
            int placeholder_pos = -1, int count = 0);

    /** Use this as the left-hand-side of a definition or an update definition
     * (see \ref RDom).
     */
    Stage operator=(const Expr &);

    /** Use this as the left-hand-side of a definition or an update definition
     * for a Func with multiple outputs. */
    Stage operator=(const Tuple &);

    /** Define a stage that adds the given expression to this Func. If the
     * expression refers to some RDom, this performs a sum reduction of the
     * expression over the domain. If the function does not already have a
     * pure definition, this sets it to zero.
     */
    // @{
    Stage operator+=(const Expr &);
    Stage operator+=(const Tuple &);
    Stage operator+=(const FuncRef &);
    // @}

    /** Define a stage that adds the negative of the given expression to this
     * Func. If the expression refers to some RDom, this performs a sum reduction
     * of the negative of the expression over the domain. If the function does
     * not already have a pure definition, this sets it to zero.
     */
    // @{
    Stage operator-=(const Expr &);
    Stage operator-=(const Tuple &);
    Stage operator-=(const FuncRef &);
    // @}

    /** Define a stage that multiplies this Func by the given expression. If the
     * expression refers to some RDom, this performs a product reduction of the
     * expression over the domain. If the function does not already have a pure
     * definition, this sets it to 1.
     */
    // @{
    Stage operator*=(const Expr &);
    Stage operator*=(const Tuple &);
    Stage operator*=(const FuncRef &);
    // @}

    /** Define a stage that divides this Func by the given expression.
     * If the expression refers to some RDom, this performs a product
     * reduction of the inverse of the expression over the domain. If the
     * function does not already have a pure definition, this sets it to 1.
     */
    // @{
    Stage operator/=(const Expr &);
    Stage operator/=(const Tuple &);
    Stage operator/=(const FuncRef &);
    // @}

    /* Override the usual assignment operator, so that
     * f(x, y) = g(x, y) defines f.
     */
    Stage operator=(const FuncRef &);

    /** Use this as a call to the function, and not the left-hand-side
     * of a definition. Only works for single-output Funcs. */
    operator Expr() const;

    /** When a FuncRef refers to a function that provides multiple
     * outputs, you can access each output as an Expr using
     * operator[].
     */
    FuncTupleElementRef operator[](int) const;

    /** How many outputs does the function this refers to produce. */
    size_t size() const;

    /** Is this FuncRef syntactically equivalent to another one? */
    bool equivalent_to(const FuncRef &other) const;

    /** What function is this calling? */
    Internal::Function function() const {
        return func;
    }
};

/** Explicit overloads of min and max for FuncRef. These exist to
 * disambiguate calls to min on FuncRefs when a user has pulled both
 * Halide::min and std::min into their namespace. */
// @{
inline Expr min(const FuncRef &a, const FuncRef &b) {
    return min(Expr(a), Expr(b));
}
inline Expr max(const FuncRef &a, const FuncRef &b) {
    return max(Expr(a), Expr(b));
}
// @}

/** A fragment of front-end syntax of the form f(x, y, z)[index], where x, y,
 * z are Vars or Exprs. If could be the left hand side of an update
 * definition, or it could be a call to a function. We don't know
 * until we see how this object gets used.
 */
class FuncTupleElementRef {
    FuncRef func_ref;
    std::vector<Expr> args;  // args to the function
    int idx;                 // Index to function outputs

    /** Helper function that generates a Tuple where element at 'idx' is set
     * to 'e' and the rests are undef. */
    Tuple values_with_undefs(const Expr &e) const;

public:
    FuncTupleElementRef(const FuncRef &ref, const std::vector<Expr> &args, int idx);

    /** Use this as the left-hand-side of an update definition of Tuple
     * component 'idx' of a Func (see \ref RDom). The function must
     * already have an initial definition.
     */
    Stage operator=(const Expr &e);

    /** Define a stage that adds the given expression to Tuple component 'idx'
     * of this Func. The other Tuple components are unchanged. If the expression
     * refers to some RDom, this performs a sum reduction of the expression over
     * the domain. The function must already have an initial definition.
     */
    Stage operator+=(const Expr &e);

    /** Define a stage that adds the negative of the given expression to Tuple
     * component 'idx' of this Func. The other Tuple components are unchanged.
     * If the expression refers to some RDom, this performs a sum reduction of
     * the negative of the expression over the domain. The function must already
     * have an initial definition.
     */
    Stage operator-=(const Expr &e);

    /** Define a stage that multiplies Tuple component 'idx' of this Func by
     * the given expression. The other Tuple components are unchanged. If the
     * expression refers to some RDom, this performs a product reduction of
     * the expression over the domain. The function must already have an
     * initial definition.
     */
    Stage operator*=(const Expr &e);

    /** Define a stage that divides Tuple component 'idx' of this Func by
     * the given expression. The other Tuple components are unchanged.
     * If the expression refers to some RDom, this performs a product
     * reduction of the inverse of the expression over the domain. The function
     * must already have an initial definition.
     */
    Stage operator/=(const Expr &e);

    /* Override the usual assignment operator, so that
     * f(x, y)[index] = g(x, y) defines f.
     */
    Stage operator=(const FuncRef &e);

    /** Use this as a call to Tuple component 'idx' of a Func, and not the
     * left-hand-side of a definition. */
    operator Expr() const;

    /** What function is this calling? */
    Internal::Function function() const {
        return func_ref.function();
    }

    /** Return index to the function outputs. */
    int index() const {
        return idx;
    }
};

namespace Internal {
class IRMutator;
}  // namespace Internal

/** Helper class for identifying purpose of an Expr passed to memoize.
 */
class EvictionKey {
protected:
    Expr key;
    friend class Func;

public:
    explicit EvictionKey(const Expr &expr = Expr())
        : key(expr) {
    }
};

/** A halide function. This class represents one stage in a Halide
 * pipeline, and is the unit by which we schedule things. By default
 * they are aggressively inlined, so you are encouraged to make lots
 * of little functions, rather than storing things in Exprs. */
class Func {

    /** A handle on the internal halide function that this
     * represents */
    Internal::Function func;

    /** When you make a reference to this function with fewer
     * arguments than it has dimensions, the argument list is bulked
     * up with 'implicit' vars with canonical names. This lets you
     * pass around partially applied Halide functions. */
    // @{
    std::pair<int, int> add_implicit_vars(std::vector<Var> &) const;
    std::pair<int, int> add_implicit_vars(std::vector<Expr> &) const;
    // @}

    /** The imaging pipeline that outputs this Func alone. */
    Pipeline pipeline_;

    /** Get the imaging pipeline that outputs this Func alone,
     * creating it (and freezing the Func) if necessary. */
    Pipeline pipeline();

    // Helper function for recursive reordering support
    Func &reorder_storage(const std::vector<Var> &dims, size_t start);

    void invalidate_cache();

public:
    /** Declare a new undefined function with the given name */
    explicit Func(const std::string &name);

    /** Declare a new undefined function with the given name.
     * The function will be constrained to represent Exprs of required_type.
     * If required_dims is not AnyDims, the function will be constrained to exactly
     * that many dimensions. */
    explicit Func(const Type &required_type, int required_dims, const std::string &name);

    /** Declare a new undefined function with the given name.
     * If required_types is not empty, the function will be constrained to represent
     * Tuples of the same arity and types. (If required_types is empty, there is no constraint.)
     * If required_dims is not AnyDims, the function will be constrained to exactly
     * that many dimensions. */
    explicit Func(const std::vector<Type> &required_types, int required_dims, const std::string &name);

    /** Declare a new undefined function with an
     * automatically-generated unique name */
    Func();

    /** Declare a new function with an automatically-generated unique
     * name, and define it to return the given expression (which may
     * not contain free variables). */
    explicit Func(const Expr &e);

    /** Construct a new Func to wrap an existing, already-defined
     * Function object. */
    explicit Func(Internal::Function f);

    /** Construct a new Func to wrap a Buffer. */
    template<typename T, int Dims>
    HALIDE_NO_USER_CODE_INLINE explicit Func(Buffer<T, Dims> &im)
        : Func() {
        (*this)(_) = im(_);
    }

    /** Evaluate this function over some rectangular domain and return
     * the resulting buffer or buffers. Performs compilation if the
     * Func has not previously been realized and compile_jit has not
     * been called. If the final stage of the pipeline is on the GPU,
     * data is copied back to the host before being returned. The
     * returned Realization should probably be instantly converted to
     * a Buffer class of the appropriate type. That is, do this:
     *
     \code
     f(x) = sin(x);
     Buffer<float> im = f.realize(...);
     \endcode
     *
     * If your Func has multiple values, because you defined it using
     * a Tuple, then casting the result of a realize call to a buffer
     * or image will produce a run-time error. Instead you should do the
     * following:
     *
     \code
     f(x) = Tuple(x, sin(x));
     Realization r = f.realize(...);
     Buffer<int> im0 = r[0];
     Buffer<float> im1 = r[1];
     \endcode
     *
     * In Halide formal arguments of a computation are specified using
     * Param<T> and ImageParam objects in the expressions defining the
     * computation. Note that this method is not thread-safe, in that
     * Param<T> and ImageParam are globals shared by all threads; to call
     * jitted code in a thread-safe manner, use compile_to_callable() instead.
     *
     \code
     Param<int32> p(42);
     ImageParam img(Int(32), 1);
     f(x) = img(x) + p;

     Buffer<int32_t) arg_img(10, 10);
     <fill in arg_img...>

     Target t = get_jit_target_from_environment();
     Buffer<int32_t> result = f.realize({10, 10}, t);
     \endcode
     *
     * Alternatively, an initializer list can be used
     * directly in the realize call to pass this information:
     *
     \code
     Param<int32> p(42);
     ImageParam img(Int(32), 1);
     f(x) = img(x) + p;

     Buffer<int32_t) arg_img(10, 10);
     <fill in arg_img...>

     Target t = get_jit_target_from_environment();
     Buffer<int32_t> result = f.realize({10, 10}, t, { { p, 17 }, { img, arg_img } });
     \endcode
     *
     * If the Func cannot be realized into a buffer of the given size
     * due to scheduling constraints on scattering update definitions,
     * it will be realized into a larger buffer of the minimum size
     * possible, and a cropped view at the requested size will be
     * returned. It is thus not safe to assume the returned buffers
     * are contiguous in memory. This behavior can be disabled with
     * the NoBoundsQuery target flag, in which case an error about
     * writing out of bounds on the output buffer will trigger
     * instead.
     *
     */
    Realization realize(std::vector<int32_t> sizes = {}, const Target &target = Target());

    /** Same as above, but takes a custom user-provided context to be
     * passed to runtime functions. This can be used to pass state to
     * runtime overrides in a thread-safe manner. A nullptr context is
     * legal, and is equivalent to calling the variant of realize
     * that does not take a context. */
    Realization realize(JITUserContext *context,
                        std::vector<int32_t> sizes = {},
                        const Target &target = Target());

    /** Evaluate this function into an existing allocated buffer or
     * buffers. If the buffer is also one of the arguments to the
     * function, strange things may happen, as the pipeline isn't
     * necessarily safe to run in-place. If you pass multiple buffers,
     * they must have matching sizes. This form of realize does *not*
     * automatically copy data back from the GPU. */
    void realize(Pipeline::RealizationArg outputs, const Target &target = Target());

    /** Same as above, but takes a custom user-provided context to be
     * passed to runtime functions. This can be used to pass state to
     * runtime overrides in a thread-safe manner. A nullptr context is
     * legal, and is equivalent to calling the variant of realize
     * that does not take a context. */
    void realize(JITUserContext *context,
                 Pipeline::RealizationArg outputs,
                 const Target &target = Target());

    /** For a given size of output, or a given output buffer,
     * determine the bounds required of all unbound ImageParams
     * referenced. Communicates the result by allocating new buffers
     * of the appropriate size and binding them to the unbound
     * ImageParams.
     */
    // @{
    void infer_input_bounds(const std::vector<int32_t> &sizes,
                            const Target &target = get_jit_target_from_environment());
    void infer_input_bounds(Pipeline::RealizationArg outputs,
                            const Target &target = get_jit_target_from_environment());
    // @}

    /** Versions of infer_input_bounds that take a custom user context
     * to pass to runtime functions. */
    // @{
    void infer_input_bounds(JITUserContext *context,
                            const std::vector<int32_t> &sizes,
                            const Target &target = get_jit_target_from_environment());
    void infer_input_bounds(JITUserContext *context,
                            Pipeline::RealizationArg outputs,
                            const Target &target = get_jit_target_from_environment());
    // @}
    /** Statically compile this function to llvm bitcode, with the
     * given filename (which should probably end in .bc), type
     * signature, and C function name (which defaults to the same name
     * as this halide function */
    //@{
    void compile_to_bitcode(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
                            const Target &target = get_target_from_environment());
    void compile_to_bitcode(const std::string &filename, const std::vector<Argument> &,
                            const Target &target = get_target_from_environment());
    // @}

    /** Statically compile this function to llvm assembly, with the
     * given filename (which should probably end in .ll), type
     * signature, and C function name (which defaults to the same name
     * as this halide function */
    //@{
    void compile_to_llvm_assembly(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
                                  const Target &target = get_target_from_environment());
    void compile_to_llvm_assembly(const std::string &filename, const std::vector<Argument> &,
                                  const Target &target = get_target_from_environment());
    // @}

    /** Statically compile this function to an object file, with the
     * given filename (which should probably end in .o or .obj), type
     * signature, and C function name (which defaults to the same name
     * as this halide function. You probably don't want to use this
     * directly; call compile_to_static_library or compile_to_file instead. */
    //@{
    void compile_to_object(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
                           const Target &target = get_target_from_environment());
    void compile_to_object(const std::string &filename, const std::vector<Argument> &,
                           const Target &target = get_target_from_environment());
    // @}

    /** Emit a header file with the given filename for this
     * function. The header will define a function with the type
     * signature given by the second argument, and a name given by the
     * third. The name defaults to the same name as this halide
     * function. You don't actually have to have defined this function
     * yet to call this. You probably don't want to use this directly;
     * call compile_to_static_library or compile_to_file instead. */
    void compile_to_header(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name = "",
                           const Target &target = get_target_from_environment());

    /** Statically compile this function to text assembly equivalent
     * to the object file generated by compile_to_object. This is
     * useful for checking what Halide is producing without having to
     * disassemble anything, or if you need to feed the assembly into
     * some custom toolchain to produce an object file (e.g. iOS) */
    //@{
    void compile_to_assembly(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
                             const Target &target = get_target_from_environment());
    void compile_to_assembly(const std::string &filename, const std::vector<Argument> &,
                             const Target &target = get_target_from_environment());
    // @}

    /** Statically compile this function to C source code. This is
     * useful for providing fallback code paths that will compile on
     * many platforms. Vectorization will fail, and parallelization
     * will produce serial code. */
    void compile_to_c(const std::string &filename,
                      const std::vector<Argument> &,
                      const std::string &fn_name = "",
                      const Target &target = get_target_from_environment());

    /** Write out an internal representation of lowered code. Useful
     * for analyzing and debugging scheduling. Can emit html or plain
     * text. */
    void compile_to_lowered_stmt(const std::string &filename,
                                 const std::vector<Argument> &args,
                                 StmtOutputFormat fmt = Text,
                                 const Target &target = get_target_from_environment());

    /** Write out a conceptual representation of lowered code, before any parallel loop
     * get factored out into separate functions, or GPU loops are offloaded to kernel code.r
     * Useful for analyzing and debugging scheduling. Can emit html or plain text. */
    void compile_to_conceptual_stmt(const std::string &filename,
                                    const std::vector<Argument> &args,
                                    StmtOutputFormat fmt = Text,
                                    const Target &target = get_target_from_environment());

    /** Write out the loop nests specified by the schedule for this
     * Function. Helpful for understanding what a schedule is
     * doing. */
    void print_loop_nest();

    /** Compile to object file and header pair, with the given
     * arguments. The name defaults to the same name as this halide
     * function.
     */
    void compile_to_file(const std::string &filename_prefix, const std::vector<Argument> &args,
                         const std::string &fn_name = "",
                         const Target &target = get_target_from_environment());

    /** Compile to static-library file and header pair, with the given
     * arguments. The name defaults to the same name as this halide
     * function.
     */
    void compile_to_static_library(const std::string &filename_prefix, const std::vector<Argument> &args,
                                   const std::string &fn_name = "",
                                   const Target &target = get_target_from_environment());

    /** Compile to static-library file and header pair once for each target;
     * each resulting function will be considered (in order) via halide_can_use_target_features()
     * at runtime, with the first appropriate match being selected for subsequent use.
     * This is typically useful for specializations that may vary unpredictably by machine
     * (e.g., SSE4.1/AVX/AVX2 on x86 desktop machines).
     * All targets must have identical arch-os-bits.
     */
    void compile_to_multitarget_static_library(const std::string &filename_prefix,
                                               const std::vector<Argument> &args,
                                               const std::vector<Target> &targets);

    /** Like compile_to_multitarget_static_library(), except that the object files
     * are all output as object files (rather than bundled into a static library).
     *
     * `suffixes` is an optional list of strings to use for as the suffix for each object
     * file. If nonempty, it must be the same length as `targets`. (If empty, Target::to_string()
     * will be used for each suffix.)
     *
     * Note that if `targets.size()` > 1, the wrapper code (to select the subtarget)
     * will be generated with the filename `${filename_prefix}_wrapper.o`
     *
     * Note that if `targets.size()` > 1 and `no_runtime` is not specified, the runtime
     * will be generated with the filename `${filename_prefix}_runtime.o`
     */
    void compile_to_multitarget_object_files(const std::string &filename_prefix,
                                             const std::vector<Argument> &args,
                                             const std::vector<Target> &targets,
                                             const std::vector<std::string> &suffixes);

    /** Store an internal representation of lowered code as a self
     * contained Module suitable for further compilation. */
    Module compile_to_module(const std::vector<Argument> &args, const std::string &fn_name = "",
                             const Target &target = get_target_from_environment());

    /** Compile and generate multiple target files with single call.
     * Deduces target files based on filenames specified in
     * output_files map.
     */
    void compile_to(const std::map<OutputFileType, std::string> &output_files,
                    const std::vector<Argument> &args,
                    const std::string &fn_name,
                    const Target &target = get_target_from_environment());

    /** Eagerly jit compile the function to machine code. This
     * normally happens on the first call to realize. If you're
     * running your halide pipeline inside time-sensitive code and
     * wish to avoid including the time taken to compile a pipeline,
     * then you can call this ahead of time. Default is to use the Target
     * returned from Halide::get_jit_target_from_environment()
     */
    void compile_jit(const Target &target = get_jit_target_from_environment());

    /** Get a struct containing the currently set custom functions
     * used by JIT. This can be mutated. Changes will take effect the
     * next time this Func is realized. */
    JITHandlers &jit_handlers();

    /** Eagerly jit compile the function to machine code and return a callable
     * struct that behaves like a function pointer. The calling convention
     * will exactly match that of an AOT-compiled version of this Func
     * with the same Argument list.
     */
    Callable compile_to_callable(const std::vector<Argument> &args,
                                 const Target &target = get_jit_target_from_environment());

    /** Add a custom pass to be used during lowering. It is run after
     * all other lowering passes. Can be used to verify properties of
     * the lowered Stmt, instrument it with extra code, or otherwise
     * modify it. The Func takes ownership of the pass, and will call
     * delete on it when the Func goes out of scope. So don't pass a
     * stack object, or share pass instances between multiple
     * Funcs. */
    template<typename T>
    void add_custom_lowering_pass(T *pass) {
        // Template instantiate a custom deleter for this type, then
        // wrap in a lambda. The custom deleter lives in user code, so
        // that deletion is on the same heap as construction (I hate Windows).
        add_custom_lowering_pass(pass, [pass]() { delete_lowering_pass<T>(pass); });
    }

    /** Add a custom pass to be used during lowering, with the
     * function that will be called to delete it also passed in. Set
     * it to nullptr if you wish to retain ownership of the object. */
    void add_custom_lowering_pass(Internal::IRMutator *pass, std::function<void()> deleter);

    /** Remove all previously-set custom lowering passes */
    void clear_custom_lowering_passes();

    /** Get the custom lowering passes. */
    const std::vector<CustomLoweringPass> &custom_lowering_passes();

    /** When this function is compiled, include code that dumps its
     * values to a file after it is realized, for the purpose of
     * debugging.
     *
     * If filename ends in ".tif" or ".tiff" (case insensitive) the file
     * is in TIFF format and can be read by standard tools. Oherwise, the
     * file format is as follows:
     *
     * All data is in the byte-order of the target platform.  First, a
     * 20 byte-header containing four 32-bit ints, giving the extents
     * of the first four dimensions.  Dimensions beyond four are
     * folded into the fourth.  Then, a fifth 32-bit int giving the
     * data type of the function. The typecodes are given by: float =
     * 0, double = 1, uint8_t = 2, int8_t = 3, uint16_t = 4, int16_t =
     * 5, uint32_t = 6, int32_t = 7, uint64_t = 8, int64_t = 9. The
     * data follows the header, as a densely packed array of the given
     * size and the given type. If given the extension .tmp, this file
     * format can be natively read by the program ImageStack. */
    void debug_to_file(const std::string &filename);

    /** The name of this function, either given during construction,
     * or automatically generated. */
    const std::string &name() const;

    /** Get the pure arguments. */
    std::vector<Var> args() const;

    /** The right-hand-side value of the pure definition of this
     * function. Causes an error if there's no pure definition, or if
     * the function is defined to return multiple values. */
    Expr value() const;

    /** The values returned by this function. An error if the function
     * has not been been defined. Returns a Tuple with one element for
     * functions defined to return a single value. */
    Tuple values() const;

    /** Does this function have at least a pure definition. */
    bool defined() const;

    /** Get the left-hand-side of the update definition. An empty
     * vector if there's no update definition. If there are
     * multiple update definitions for this function, use the
     * argument to select which one you want. */
    const std::vector<Expr> &update_args(int idx = 0) const;

    /** Get the right-hand-side of an update definition. An error if
     * there's no update definition. If there are multiple
     * update definitions for this function, use the argument to
     * select which one you want. */
    Expr update_value(int idx = 0) const;

    /** Get the right-hand-side of an update definition for
     * functions that returns multiple values. An error if there's no
     * update definition. Returns a Tuple with one element for
     * functions that return a single value. */
    Tuple update_values(int idx = 0) const;

    /** Get the RVars of the reduction domain for an update definition, if there is
     * one. */
    std::vector<RVar> rvars(int idx = 0) const;

    /** Does this function have at least one update definition? */
    bool has_update_definition() const;

    /** How many update definitions does this function have? */
    int num_update_definitions() const;

    /** Is this function an external stage? That is, was it defined
     * using define_extern? */
    bool is_extern() const;

    /** Add an extern definition for this Func. This lets you define a
     * Func that represents an external pipeline stage. You can, for
     * example, use it to wrap a call to an extern library such as
     * fftw. */
    // @{
    void define_extern(const std::string &function_name,
                       const std::vector<ExternFuncArgument> &params, Type t,
                       int dimensionality,
                       NameMangling mangling = NameMangling::Default,
                       DeviceAPI device_api = DeviceAPI::Host) {
        define_extern(function_name, params, t,
                      Internal::make_argument_list(dimensionality), mangling,
                      device_api);
    }

    void define_extern(const std::string &function_name,
                       const std::vector<ExternFuncArgument> &params,
                       const std::vector<Type> &types, int dimensionality,
                       NameMangling mangling = NameMangling::Default,
                       DeviceAPI device_api = DeviceAPI::Host) {
        define_extern(function_name, params, types,
                      Internal::make_argument_list(dimensionality), mangling,
                      device_api);
    }

    void define_extern(const std::string &function_name,
                       const std::vector<ExternFuncArgument> &params, Type t,
                       const std::vector<Var> &arguments,
                       NameMangling mangling = NameMangling::Default,
                       DeviceAPI device_api = DeviceAPI::Host) {
        define_extern(function_name, params, std::vector<Type>{t}, arguments,
                      mangling, device_api);
    }

    void define_extern(const std::string &function_name,
                       const std::vector<ExternFuncArgument> &params,
                       const std::vector<Type> &types,
                       const std::vector<Var> &arguments,
                       NameMangling mangling = NameMangling::Default,
                       DeviceAPI device_api = DeviceAPI::Host);
    // @}

    /** Get the type(s) of the outputs of this Func.
     *
     * It is not legal to call type() unless the Func has non-Tuple elements.
     *
     * If the Func isn't yet defined, and was not specified with required types,
     * a runtime error will occur.
     *
     * If the Func isn't yet defined, but *was* specified with required types,
     * the requirements will be returned. */
    // @{
    const Type &type() const;
    const std::vector<Type> &types() const;
    // @}

    /** Get the number of outputs of this Func. Corresponds to the
     * size of the Tuple this Func was defined to return.
     * If the Func isn't yet defined, but was specified with required types,
     * the number of outputs specified in the requirements will be returned. */
    int outputs() const;

    /** Get the name of the extern function called for an extern
     * definition. */
    const std::string &extern_function_name() const;

    /** The dimensionality (number of arguments) of this function.
     * If the Func isn't yet defined, but was specified with required dimensionality,
     * the dimensionality specified in the requirements will be returned. */
    int dimensions() const;

    /** Construct either the left-hand-side of a definition, or a call
     * to a functions that happens to only contain vars as
     * arguments. If the function has already been defined, and fewer
     * arguments are given than the function has dimensions, then
     * enough implicit vars are added to the end of the argument list
     * to make up the difference (see \ref Var::implicit) */
    // @{
    FuncRef operator()(std::vector<Var>) const;

    template<typename... Args>
    HALIDE_NO_USER_CODE_INLINE std::enable_if_t<Internal::all_are_convertible<Var, Args...>::value, FuncRef>
    operator()(Args &&...args) const {
        std::vector<Var> collected_args{std::forward<Args>(args)...};
        return this->operator()(collected_args);
    }
    // @}

    /** Either calls to the function, or the left-hand-side of
     * an update definition (see \ref RDom). If the function has
     * already been defined, and fewer arguments are given than the
     * function has dimensions, then enough implicit vars are added to
     * the end of the argument list to make up the difference. (see
     * \ref Var::implicit)*/
    // @{
    FuncRef operator()(std::vector<Expr>) const;

    template<typename... Args>
    HALIDE_NO_USER_CODE_INLINE std::enable_if_t<Internal::all_are_convertible<Expr, Args...>::value, FuncRef>
    operator()(const Expr &x, Args &&...args) const {
        std::vector<Expr> collected_args{x, std::forward<Args>(args)...};
        return (*this)(collected_args);
    }
    // @}

    /** Creates and returns a new identity Func that wraps this Func. During
     * compilation, Halide replaces all calls to this Func done by 'f'
     * with calls to the wrapper. If this Func is already wrapped for
     * use in 'f', will return the existing wrapper.
     *
     * For example, g.in(f) would rewrite a pipeline like this:
     \code
     g(x, y) = ...
     f(x, y) = ... g(x, y) ...
     \endcode
     * into a pipeline like this:
     \code
     g(x, y) = ...
     g_wrap(x, y) = g(x, y)
     f(x, y) = ... g_wrap(x, y)
     \endcode
     *
     * This has a variety of uses. You can use it to schedule this
     * Func differently in the different places it is used:
     \code
     g(x, y) = ...
     f1(x, y) = ... g(x, y) ...
     f2(x, y) = ... g(x, y) ...
     g.in(f1).compute_at(f1, y).vectorize(x, 8);
     g.in(f2).compute_at(f2, x).unroll(x);
     \endcode
     *
     * You can also use it to stage loads from this Func via some
     * intermediate buffer (perhaps on the stack as in
     * test/performance/block_transpose.cpp, or in shared GPU memory
     * as in test/performance/wrap.cpp). In this we compute the
     * wrapper at tiles of the consuming Funcs like so:
     \code
     g.compute_root()...
     g.in(f).compute_at(f, tiles)...
     \endcode
     *
     * Func::in() can also be used to compute pieces of a Func into a
     * smaller scratch buffer (perhaps on the GPU) and then copy them
     * into a larger output buffer one tile at a time. See
     * apps/interpolate/interpolate.cpp for an example of this. In
     * this case we compute the Func at tiles of its own wrapper:
     \code
     f.in(g).compute_root().gpu_tile(...)...
     f.compute_at(f.in(g), tiles)...
     \endcode
     *
     * A similar use of Func::in() wrapping Funcs with multiple update
     * stages in a pure wrapper. The following code:
     \code
     f(x, y) = x + y;
     f(x, y) += 5;
     g(x, y) = f(x, y);
     f.compute_root();
     \endcode
     *
     * Is equivalent to:
     \code
     for y:
       for x:
         f(x, y) = x + y;
     for y:
       for x:
         f(x, y) += 5
     for y:
       for x:
         g(x, y) = f(x, y)
     \endcode
     * using Func::in(), we can write:
     \code
     f(x, y) = x + y;
     f(x, y) += 5;
     g(x, y) = f(x, y);
     f.in(g).compute_root();
     \endcode
     * which instead produces:
     \code
     for y:
       for x:
         f(x, y) = x + y;
         f(x, y) += 5
         f_wrap(x, y) = f(x, y)
     for y:
       for x:
         g(x, y) = f_wrap(x, y)
     \endcode
     */
    Func in(const Func &f);

    /** Create and return an identity wrapper shared by all the Funcs in
     * 'fs'. If any of the Funcs in 'fs' already have a custom wrapper,
     * this will throw an error. */
    Func in(const std::vector<Func> &fs);

    /** Create and return a global identity wrapper, which wraps all calls to
     * this Func by any other Func. If a global wrapper already exists,
     * returns it. The global identity wrapper is only used by callers for
     * which no custom wrapper has been specified.
     */
    Func in();

    /** Similar to \ref Func::in; however, instead of replacing the call to
     * this Func with an identity Func that refers to it, this replaces the
     * call with a clone of this Func.
     *
     * For example, f.clone_in(g) would rewrite a pipeline like this:
     \code
     f(x, y) = x + y;
     g(x, y) = f(x, y) + 2;
     h(x, y) = f(x, y) - 3;
     \endcode
     * into a pipeline like this:
     \code
     f(x, y) = x + y;
     f_clone(x, y) = x + y;
     g(x, y) = f_clone(x, y) + 2;
     h(x, y) = f(x, y) - 3;
     \endcode
     *
     */
    //@{
    Func clone_in(const Func &f);
    Func clone_in(const std::vector<Func> &fs);
    //@}

    /** Declare that this function should be implemented by a call to
     * halide_buffer_copy with the given target device API. Asserts
     * that the Func has a pure definition which is a simple call to a
     * single input, and no update definitions. The wrapper Funcs
     * returned by in() are suitable candidates. Consumes all pure
     * variables, and rewrites the Func to have an extern definition
     * that calls halide_buffer_copy. */
    Func copy_to_device(DeviceAPI d = DeviceAPI::Default_GPU);

    /** Declare that this function should be implemented by a call to
     * halide_buffer_copy with a NULL target device API. Equivalent to
     * copy_to_device(DeviceAPI::Host). Asserts that the Func has a
     * pure definition which is a simple call to a single input, and
     * no update definitions. The wrapper Funcs returned by in() are
     * suitable candidates. Consumes all pure variables, and rewrites
     * the Func to have an extern definition that calls
     * halide_buffer_copy.
     *
     * Note that if the source Func is already valid in host memory,
     * this compiles to code that does the minimum number of calls to
     * memcpy.
     */
    Func copy_to_host();

    /** Split a dimension into inner and outer subdimensions with the
     * given names, where the inner dimension iterates from 0 to
     * factor-1. The inner and outer subdimensions can then be dealt
     * with using the other scheduling calls. It's ok to reuse the old
     * variable name as either the inner or outer variable. The final
     * argument specifies how the tail should be handled if the split
     * factor does not provably divide the extent. */
    Func &split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail = TailStrategy::Auto);

    /** Join two dimensions into a single fused dimension. The fused dimension
     * covers the product of the extents of the inner and outer dimensions
     * given. The loop type (e.g. parallel, vectorized) of the resulting fused
     * dimension is inherited from the first argument. */
    Func &fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused);

    /** Mark a dimension to be traversed serially. This is the default. */
    Func &serial(const VarOrRVar &var);

    /** Mark a dimension to be traversed in parallel */
    Func &parallel(const VarOrRVar &var);

    /** Split a dimension by the given task_size, and the parallelize the
     * outer dimension. This creates parallel tasks that have size
     * task_size. After this call, var refers to the outer dimension of
     * the split. The inner dimension has a new anonymous name. If you
     * wish to mutate it, or schedule with respect to it, do the split
     * manually. */
    Func &parallel(const VarOrRVar &var, const Expr &task_size, TailStrategy tail = TailStrategy::Auto);

    /** Mark a dimension to be computed all-at-once as a single
     * vector. The dimension should have constant extent -
     * e.g. because it is the inner dimension following a split by a
     * constant factor. For most uses of vectorize you want the two
     * argument form. The variable to be vectorized should be the
     * innermost one. */
    Func &vectorize(const VarOrRVar &var);

    /** Mark a dimension to be completely unrolled. The dimension
     * should have constant extent - e.g. because it is the inner
     * dimension following a split by a constant factor. For most uses
     * of unroll you want the two-argument form. */
    Func &unroll(const VarOrRVar &var);

    /** Split a dimension by the given factor, then vectorize the
     * inner dimension. This is how you vectorize a loop of unknown
     * size. The variable to be vectorized should be the innermost
     * one. After this call, var refers to the outer dimension of the
     * split. 'factor' must be an integer. */
    Func &vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);

    /** Split a dimension by the given factor, then unroll the inner
     * dimension. This is how you unroll a loop of unknown size by
     * some constant factor. After this call, var refers to the outer
     * dimension of the split. 'factor' must be an integer. */
    Func &unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);

    /** Set the loop partition policy. Loop partitioning can be useful to
     * optimize boundary conditions (such as clamp_edge). Loop partitioning
     * splits a for loop into three for loops: a prologue, a steady-state,
     * and an epilogue.
     * The default policy is Auto. */
    Func &partition(const VarOrRVar &var, Partition partition_policy);

    /** Set the loop partition policy to Never for a vector of Vars and
     * RVars. */
    Func &never_partition(const std::vector<VarOrRVar> &vars);

    /** Set the loop partition policy to Never for some number of Vars and RVars. */
    template<typename... Args>
    HALIDE_NO_USER_CODE_INLINE std::enable_if_t<Internal::all_are_convertible<VarOrRVar, Args...>::value, Func &>
    never_partition(const VarOrRVar &x, Args &&...args) {
        std::vector<VarOrRVar> collected_args{x, std::forward<Args>(args)...};
        return never_partition(collected_args);
    }

    /** Set the loop partition policy to Never for all Vars and RVar of the
     * initial definition of the Func. It must be called separately on any
     * update definitions. */
    Func &never_partition_all();

    /** Set the loop partition policy to Always for a vector of Vars and
     * RVars. */
    Func &always_partition(const std::vector<VarOrRVar> &vars);

    /** Set the loop partition policy to Always for some number of Vars and RVars. */
    template<typename... Args>
    HALIDE_NO_USER_CODE_INLINE std::enable_if_t<Internal::all_are_convertible<VarOrRVar, Args...>::value, Func &>
    always_partition(const VarOrRVar &x, Args &&...args) {
        std::vector<VarOrRVar> collected_args{x, std::forward<Args>(args)...};
        return always_partition(collected_args);
    }

    /** Set the loop partition policy to Always for all Vars and RVar of the
     * initial definition of the Func. It must be called separately on any
     * update definitions. */
    Func &always_partition_all();

    /** Statically declare that the range over which a function should
     * be evaluated is given by the second and third arguments. This
     * can let Halide perform some optimizations. E.g. if you know
     * there are going to be 4 color channels, you can completely
     * vectorize the color channel dimension without the overhead of
     * splitting it up. If bounds inference decides that it requires
     * more of this function than the bounds you have stated, a
     * runtime error will occur when you try to run your pipeline. */
    Func &bound(const Var &var, Expr min, Expr extent);

    /** Statically declare the range over which the function will be
     * evaluated in the general case. This provides a basis for the auto
     * scheduler to make trade-offs and scheduling decisions. The auto
     * generated schedules might break when the sizes of the dimensions are
     * very different from the estimates specified. These estimates are used
     * only by the auto scheduler if the function is a pipeline output. */
    Func &set_estimate(const Var &var, const Expr &min, const Expr &extent);

    /** Set (min, extent) estimates for all dimensions in the Func
     * at once; this is equivalent to calling `set_estimate(args()[n], min, extent)`
     * repeatedly, but slightly terser. The size of the estimates vector
     * must match the dimensionality of the Func. */
    Func &set_estimates(const Region &estimates);

    /** Expand the region computed so that the min coordinates is
     * congruent to 'remainder' modulo 'modulus', and the extent is a
     * multiple of 'modulus'. For example, f.align_bounds(x, 2) forces
     * the min and extent realized to be even, and calling
     * f.align_bounds(x, 2, 1) forces the min to be odd and the extent
     * to be even. The region computed always contains the region that
     * would have been computed without this directive, so no
     * assertions are injected.
     */
    Func &align_bounds(const Var &var, Expr modulus, Expr remainder = 0);

    /** Expand the region computed so that the extent is a
     * multiple of 'modulus'. For example, f.align_extent(x, 2) forces
     * the extent realized to be even. The region computed always contains the
     * region that would have been computed without this directive, so no
     * assertions are injected. (This is essentially equivalent to align_bounds(),
     * but always leaving the min untouched.)
     */
    Func &align_extent(const Var &var, Expr modulus);

    /** Bound the extent of a Func's realization, but not its
     * min. This means the dimension can be unrolled or vectorized
     * even when its min is not fixed (for example because it is
     * compute_at tiles of another Func). This can also be useful for
     * forcing a function's allocation to be a fixed size, which often
     * means it can go on the stack. */
    Func &bound_extent(const Var &var, Expr extent);

    /** Split two dimensions at once by the given factors, and then
     * reorder the resulting dimensions to be xi, yi, xo, yo from
     * innermost outwards. This gives a tiled traversal. */
    Func &tile(const VarOrRVar &x, const VarOrRVar &y,
               const VarOrRVar &xo, const VarOrRVar &yo,
               const VarOrRVar &xi, const VarOrRVar &yi,
               const Expr &xfactor, const Expr &yfactor,
               TailStrategy tail = TailStrategy::Auto);

    /** A shorter form of tile, which reuses the old variable names as
     * the new outer dimensions */
    Func &tile(const VarOrRVar &x, const VarOrRVar &y,
               const VarOrRVar &xi, const VarOrRVar &yi,
               const Expr &xfactor, const Expr &yfactor,
               TailStrategy tail = TailStrategy::Auto);

    /** A more general form of tile, which defines tiles of any dimensionality. */
    Func &tile(const std::vector<VarOrRVar> &previous,
               const std::vector<VarOrRVar> &outers,
               const std::vector<VarOrRVar> &inners,
               const std::vector<Expr> &factors,
               const std::vector<TailStrategy> &tails);

    /** The generalized tile, with a single tail strategy to apply to all vars. */
    Func &tile(const std::vector<VarOrRVar> &previous,
               const std::vector<VarOrRVar> &outers,
               const std::vector<VarOrRVar> &inners,
               const std::vector<Expr> &factors,
               TailStrategy tail = TailStrategy::Auto);

    /** Generalized tiling, reusing the previous names as the outer names. */
    Func &tile(const std::vector<VarOrRVar> &previous,
               const std::vector<VarOrRVar> &inners,
               const std::vector<Expr> &factors,
               TailStrategy tail = TailStrategy::Auto);

    /** Reorder variables to have the given nesting order, from
     * innermost out */
    Func &reorder(const std::vector<VarOrRVar> &vars);

    template<typename... Args>
    HALIDE_NO_USER_CODE_INLINE std::enable_if_t<Internal::all_are_convertible<VarOrRVar, Args...>::value, Func &>
    reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args) {
        std::vector<VarOrRVar> collected_args{x, y, std::forward<Args>(args)...};
        return reorder(collected_args);
    }

    /** Get the Vars of the pure definition, with splits applied. This
     * represents all the potentially-valid compute_at sites for this stage of
     * this Func. Note that this, by default, will end with the sentinel
     * Var::outermost. */
    std::vector<Var> split_vars() const;

    /** Rename a dimension. Equivalent to split with a inner size of one. */
    Func &rename(const VarOrRVar &old_name, const VarOrRVar &new_name);

    /** Specify that race conditions are permitted for this Func,
     * which enables parallelizing over RVars even when Halide cannot
     * prove that it is safe to do so. Use this with great caution,
     * and only if you can prove to yourself that this is safe, as it
     * may result in a non-deterministic routine that returns
     * different values at different times or on different machines. */
    Func &allow_race_conditions();

    /** Issue atomic updates for this Func. This allows parallelization
     * on associative RVars. The function throws a compile error when
     * Halide fails to prove associativity. Use override_associativity_test
     * to disable the associativity test if you believe the function is
     * associative or the order of reduction variable execution does not
     * matter.
     * Halide compiles this into hardware atomic operations whenever possible,
     * and falls back to a mutex lock per storage element if it is impossible
     * to atomically update.
     * There are three possible outcomes of the compiled code:
     * atomic add, compare-and-swap loop, and mutex lock.
     * For example:
     *
     * hist(x) = 0;
     * hist(im(r)) += 1;
     * hist.compute_root();
     * hist.update().atomic().parallel();
     *
     * will be compiled to atomic add operations.
     *
     * hist(x) = 0;
     * hist(im(r)) = min(hist(im(r)) + 1, 100);
     * hist.compute_root();
     * hist.update().atomic().parallel();
     *
     * will be compiled to compare-and-swap loops.
     *
     * arg_max() = {0, im(0)};
     * Expr old_index = arg_max()[0];
     * Expr old_max   = arg_max()[1];
     * Expr new_index = select(old_max < im(r), r, old_index);
     * Expr new_max   = max(im(r), old_max);
     * arg_max() = {new_index, new_max};
     * arg_max.compute_root();
     * arg_max.update().atomic().parallel();
     *
     * will be compiled to updates guarded by a mutex lock,
     * since it is impossible to atomically update two different locations.
     *
     * Currently the atomic operation is supported by x86, CUDA, and OpenCL backends.
     * Compiling to other backends results in a compile error.
     * If an operation is compiled into a mutex lock, and is vectorized or is
     * compiled to CUDA or OpenCL, it also results in a compile error,
     * since per-element mutex lock on vectorized operation leads to a
     * deadlock.
     * Vectorization of predicated RVars (through rdom.where()) on CPU
     * is also unsupported yet (see https://github.com/halide/Halide/issues/4298).
     * 8-bit and 16-bit atomics on GPU are also not supported. */
    Func &atomic(bool override_associativity_test = false);

    /** Specialize a Func. This creates a special-case version of the
     * Func where the given condition is true. The most effective
     * conditions are those of the form param == value, and boolean
     * Params. Consider a simple example:
     \code
     f(x) = x + select(cond, 0, 1);
     f.compute_root();
     \endcode
     * This is equivalent to:
     \code
     for (int x = 0; x < width; x++) {
       f[x] = x + (cond ? 0 : 1);
     }
     \endcode
     * Adding the scheduling directive:
     \code
     f.specialize(cond)
     \endcode
     * makes it equivalent to:
     \code
     if (cond) {
       for (int x = 0; x < width; x++) {
         f[x] = x;
       }
     } else {
       for (int x = 0; x < width; x++) {
         f[x] = x + 1;
       }
     }
     \endcode
     * Note that the inner loops have been simplified. In the first
     * path Halide knows that cond is true, and in the second path
     * Halide knows that it is false.
     *
     * The specialized version gets its own schedule, which inherits
     * every directive made about the parent Func's schedule so far
     * except for its specializations. This method returns a handle to
     * the new schedule. If you wish to retrieve the specialized
     * sub-schedule again later, you can call this method with the
     * same condition. Consider the following example of scheduling
     * the specialized version:
     *
     \code
     f(x) = x;
     f.compute_root();
     f.specialize(width > 1).unroll(x, 2);
     \endcode
     * Assuming for simplicity that width is even, this is equivalent to:
     \code
     if (width > 1) {
       for (int x = 0; x < width/2; x++) {
         f[2*x] = 2*x;
         f[2*x + 1] = 2*x + 1;
       }
     } else {
       for (int x = 0; x < width/2; x++) {
         f[x] = x;
       }
     }
     \endcode
     * For this case, it may be better to schedule the un-specialized
     * case instead:
     \code
     f(x) = x;
     f.compute_root();
     f.specialize(width == 1); // Creates a copy of the schedule so far.
     f.unroll(x, 2); // Only applies to the unspecialized case.
     \endcode
     * This is equivalent to:
     \code
     if (width == 1) {
       f[0] = 0;
     } else {
       for (int x = 0; x < width/2; x++) {
         f[2*x] = 2*x;
         f[2*x + 1] = 2*x + 1;
       }
     }
     \endcode
     * This can be a good way to write a pipeline that splits,
     * vectorizes, or tiles, but can still handle small inputs.
     *
     * If a Func has several specializations, the first matching one
     * will be used, so the order in which you define specializations
     * is significant. For example:
     *
     \code
     f(x) = x + select(cond1, a, b) - select(cond2, c, d);
     f.specialize(cond1);
     f.specialize(cond2);
     \endcode
     * is equivalent to:
     \code
     if (cond1) {
       for (int x = 0; x < width; x++) {
         f[x] = x + a - (cond2 ? c : d);
       }
     } else if (cond2) {
       for (int x = 0; x < width; x++) {
         f[x] = x + b - c;
       }
     } else {
       for (int x = 0; x < width; x++) {
         f[x] = x + b - d;
       }
     }
     \endcode
     *
     * Specializations may in turn be specialized, which creates a
     * nested if statement in the generated code.
     *
     \code
     f(x) = x + select(cond1, a, b) - select(cond2, c, d);
     f.specialize(cond1).specialize(cond2);
     \endcode
     * This is equivalent to:
     \code
     if (cond1) {
       if (cond2) {
         for (int x = 0; x < width; x++) {
           f[x] = x + a - c;
         }
       } else {
         for (int x = 0; x < width; x++) {
           f[x] = x + a - d;
         }
       }
     } else {
       for (int x = 0; x < width; x++) {
         f[x] = x + b - (cond2 ? c : d);
       }
     }
     \endcode
     * To create a 4-way if statement that simplifies away all of the
     * ternary operators above, you could say:
     \code
     f.specialize(cond1).specialize(cond2);
     f.specialize(cond2);
     \endcode
     * or
     \code
     f.specialize(cond1 && cond2);
     f.specialize(cond1);
     f.specialize(cond2);
     \endcode
     *
     * Any prior Func which is compute_at some variable of this Func
     * gets separately included in all paths of the generated if
     * statement. The Var in the compute_at call to must exist in all
     * paths, but it may have been generated via a different path of
     * splits, fuses, and renames. This can be used somewhat
     * creatively. Consider the following code:
     \code
     g(x, y) = 8*x;
     f(x, y) = g(x, y) + 1;
     f.compute_root().specialize(cond);
     Var g_loop;
     f.specialize(cond).rename(y, g_loop);
     f.rename(x, g_loop);
     g.compute_at(f, g_loop);
     \endcode
     * When cond is true, this is equivalent to g.compute_at(f,y).
     * When it is false, this is equivalent to g.compute_at(f,x).
     */
    Stage specialize(const Expr &condition);

    /** Add a specialization to a Func that always terminates execution
     * with a call to halide_error(). By itself, this is of limited use,
     * but can be useful to terminate chains of specialize() calls where
     * no "default" case is expected (thus avoiding unnecessary code generation).
     *
     * For instance, say we want to optimize a pipeline to process images
     * in planar and interleaved format; we might typically do something like:
     \code
     ImageParam im(UInt(8), 3);
     Func f = do_something_with(im);
     f.specialize(im.dim(0).stride() == 1).vectorize(x, 8);  // planar
     f.specialize(im.dim(2).stride() == 1).reorder(c, x, y).vectorize(c);  // interleaved
     \endcode
     * This code will vectorize along rows for the planar case, and across pixel
     * components for the interleaved case... but there is an implicit "else"
     * for the unhandled cases, which generates unoptimized code. If we never
     * anticipate passing any other sort of images to this, we code streamline
     * our code by adding specialize_fail():
     \code
     ImageParam im(UInt(8), 3);
     Func f = do_something(im);
     f.specialize(im.dim(0).stride() == 1).vectorize(x, 8);  // planar
     f.specialize(im.dim(2).stride() == 1).reorder(c, x, y).vectorize(c);  // interleaved
     f.specialize_fail("Unhandled image format");
     \endcode
     * Conceptually, this produces codes like:
     \code
     if (im.dim(0).stride() == 1) {
        do_something_planar();
     } else if (im.dim(2).stride() == 1) {
        do_something_interleaved();
     } else {
        halide_error("Unhandled image format");
     }
     \endcode
     *
     * Note that calling specialize_fail() terminates the specialization chain
     * for a given Func; you cannot create new specializations for the Func
     * afterwards (though you can retrieve handles to previous specializations).
     */
    void specialize_fail(const std::string &message);

    /** Tell Halide that the following dimensions correspond to GPU
     * thread indices. This is useful if you compute a producer
     * function within the block indices of a consumer function, and
     * want to control how that function's dimensions map to GPU
     * threads. If the selected target is not an appropriate GPU, this
     * just marks those dimensions as parallel. */
    // @{
    Func &gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
    Func &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
    Func &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
    // @}

    /** The given dimension corresponds to the lanes in a GPU
     * warp. GPU warp lanes are distinguished from GPU threads by the
     * fact that all warp lanes run together in lockstep, which
     * permits lightweight communication of data from one lane to
     * another. */
    Func &gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);

    /** Tell Halide to run this stage using a single gpu thread and
     * block. This is not an efficient use of your GPU, but it can be
     * useful to avoid copy-back for intermediate update stages that
     * touch a very small part of your Func. */
    Func &gpu_single_thread(DeviceAPI device_api = DeviceAPI::Default_GPU);

    /** Tell Halide that the following dimensions correspond to GPU
     * block indices. This is useful for scheduling stages that will
     * run serially within each GPU block. If the selected target is
     * not ptx, this just marks those dimensions as parallel. */
    // @{
    Func &gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
    Func &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
    Func &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
    // @}

    /** Tell Halide that the following dimensions correspond to GPU
     * block indices and thread indices. If the selected target is not
     * ptx, these just mark the given dimensions as parallel. The
     * dimensions are consumed by this call, so do all other
     * unrolling, reordering, etc first. */
    // @{
    Func &gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
    Func &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y,
              const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
    Func &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z,
              const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
    // @}

    /** Short-hand for tiling a domain and mapping the tile indices
     * to GPU block indices and the coordinates within each tile to
     * GPU thread indices. Consumes the variables given, so do all
     * other scheduling first. */
    // @{
    Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size,
                   TailStrategy tail = TailStrategy::Auto,
                   DeviceAPI device_api = DeviceAPI::Default_GPU);

    Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size,
                   TailStrategy tail = TailStrategy::Auto,
                   DeviceAPI device_api = DeviceAPI::Default_GPU);
    Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
                   const VarOrRVar &bx, const VarOrRVar &by,
                   const VarOrRVar &tx, const VarOrRVar &ty,
                   const Expr &x_size, const Expr &y_size,
                   TailStrategy tail = TailStrategy::Auto,
                   DeviceAPI device_api = DeviceAPI::Default_GPU);

    Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
                   const VarOrRVar &tx, const VarOrRVar &ty,
                   const Expr &x_size, const Expr &y_size,
                   TailStrategy tail = TailStrategy::Auto,
                   DeviceAPI device_api = DeviceAPI::Default_GPU);

    Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
                   const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz,
                   const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
                   const Expr &x_size, const Expr &y_size, const Expr &z_size,
                   TailStrategy tail = TailStrategy::Auto,
                   DeviceAPI device_api = DeviceAPI::Default_GPU);
    Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
                   const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
                   const Expr &x_size, const Expr &y_size, const Expr &z_size,
                   TailStrategy tail = TailStrategy::Auto,
                   DeviceAPI device_api = DeviceAPI::Default_GPU);
    // @}

    /** Schedule for execution on Hexagon. When a loop is marked with
     * Hexagon, that loop is executed on a Hexagon DSP. */
    Func &hexagon(const VarOrRVar &x = Var::outermost());

    /** Prefetch data written to or read from a Func or an ImageParam by a
     * subsequent loop iteration, at an optionally specified iteration offset. You may specify
     * specification of different vars for the location of the prefetch() instruction
     * vs. the location that is being prefetched:
     *
     * - the first var specified, 'at', indicates the loop in which the prefetch will be placed
     * - the second var specified, 'from', determines the var used to find the bounds to prefetch
     *   (in conjunction with 'offset')
     *
     * If 'at' and 'from' are distinct vars, then 'from' must be at a nesting level outside 'at.'
     * Note that the value for 'offset' applies only to 'from', not 'at'.
     *
     * The final argument specifies how prefetch of region outside bounds
     * should be handled.
     *
     * For example, consider this pipeline:
     \code
     Func f, g;
     Var x, y, z;
     f(x, y) = x + y;
     g(x, y) = 2 * f(x, y);
     h(x, y) = 3 * f(x, y);
     \endcode
     *
     * The following schedule:
     \code
     f.compute_root();
     g.prefetch(f, x, x, 2, PrefetchBoundStrategy::NonFaulting);
     h.prefetch(f, x, y, 2, PrefetchBoundStrategy::NonFaulting);
     \endcode
     *
     * will inject prefetch call at the innermost loop of 'g' and 'h' and generate
     * the following loop nest:
     \code
     for y = ...
       for x = ...
         f(x, y) = x + y
     for y = ..
       for x = ...
         prefetch(&f[x + 2, y], 1, 16);
         g(x, y) = 2 * f(x, y)
     for y = ..
       for x = ...
         prefetch(&f[x, y + 2], 1, 16);
         h(x, y) = 3 * f(x, y)
     \endcode
     *
     * Note that the 'from' nesting level need not be adjacent to 'at':
     \code
     Func f, g;
     Var x, y, z, w;
     f(x, y, z, w) = x + y + z + w;
     g(x, y, z, w) = 2 * f(x, y, z, w);
     \endcode
     *
     * The following schedule:
     \code
     f.compute_root();
     g.prefetch(f, y, w, 2, PrefetchBoundStrategy::NonFaulting);
     \endcode
     *
     * will produce code that prefetches a tile of data:
     \code
     for w = ...
       for z = ...
         for y = ...
           for x = ...
         f(x, y, z, w) = x + y + z + w
     for w = ...
       for z = ...
         for y = ...
           for x0 = ...
              prefetch(&f[x0, y, z, w + 2], 1, 16);
           for x = ...
             g(x, y, z, w) = 2 * f(x, y, z, w)
     \endcode
     *
     * Note that calling prefetch() with the same var for both 'at' and 'from'
     * is equivalent to calling prefetch() with that var.
     */
    // @{
    Func &prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
                   PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
    Func &prefetch(const Parameter &param, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
                   PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
    template<typename T>
    Func &prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
                   PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
        return prefetch(image.parameter(), at, from, std::move(offset), strategy);
    }
    // @}

    /** Specify how the storage for the function is laid out. These
     * calls let you specify the nesting order of the dimensions. For
     * example, foo.reorder_storage(y, x) tells Halide to use
     * column-major storage for any realizations of foo, without
     * changing how you refer to foo in the code. You may want to do
     * this if you intend to vectorize across y. When representing
     * color images, foo.reorder_storage(c, x, y) specifies packed
     * storage (red, green, and blue values adjacent in memory), and
     * foo.reorder_storage(x, y, c) specifies planar storage (entire
     * red, green, and blue images one after the other in memory).
     *
     * If you leave out some dimensions, those remain in the same
     * positions in the nesting order while the specified variables
     * are reordered around them. */
    // @{
    Func &reorder_storage(const std::vector<Var> &dims);

    Func &reorder_storage(const Var &x, const Var &y);
    template<typename... Args>
    HALIDE_NO_USER_CODE_INLINE std::enable_if_t<Internal::all_are_convertible<Var, Args...>::value, Func &>
    reorder_storage(const Var &x, const Var &y, Args &&...args) {
        std::vector<Var> collected_args{x, y, std::forward<Args>(args)...};
        return reorder_storage(collected_args);
    }
    // @}

    /** Pad the storage extent of a particular dimension of
     * realizations of this function up to be a multiple of the
     * specified alignment. This guarantees that the strides for the
     * dimensions stored outside of dim will be multiples of the
     * specified alignment, where the strides and alignment are
     * measured in numbers of elements.
     *
     * For example, to guarantee that a function foo(x, y, c)
     * representing an image has scanlines starting on offsets
     * aligned to multiples of 16, use foo.align_storage(x, 16). */
    Func &align_storage(const Var &dim, const Expr &alignment);

    /** Store realizations of this function in a circular buffer of a
     * given extent. This is more efficient when the extent of the
     * circular buffer is a power of 2. If the fold factor is too
     * small, or the dimension is not accessed monotonically, the
     * pipeline will generate an error at runtime.
     *
     * The fold_forward option indicates that the new values of the
     * producer are accessed by the consumer in a monotonically
     * increasing order. Folding storage of producers is also
     * supported if the new values are accessed in a monotonically
     * decreasing order by setting fold_forward to false.
     *
     * For example, consider the pipeline:
     \code
     Func f, g;
     Var x, y;
     g(x, y) = x*y;
     f(x, y) = g(x, y) + g(x, y+1);
     \endcode
     *
     * If we schedule f like so:
     *
     \code
     g.compute_at(f, y).store_root().fold_storage(y, 2);
     \endcode
     *
     * Then g will be computed at each row of f and stored in a buffer
     * with an extent in y of 2, alternately storing each computed row
     * of g in row y=0 or y=1.
     */
    Func &fold_storage(const Var &dim, const Expr &extent, bool fold_forward = true);

    /** Compute this function as needed for each unique value of the
     * given var for the given calling function f.
     *
     * For example, consider the simple pipeline:
     \code
     Func f, g;
     Var x, y;
     g(x, y) = x*y;
     f(x, y) = g(x, y) + g(x, y+1) + g(x+1, y) + g(x+1, y+1);
     \endcode
     *
     * If we schedule f like so:
     *
     \code
     g.compute_at(f, x);
     \endcode
     *
     * Then the C code equivalent to this pipeline will look like this
     *
     \code

     int f[height][width];
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x++) {
             int g[2][2];
             g[0][0] = x*y;
             g[0][1] = (x+1)*y;
             g[1][0] = x*(y+1);
             g[1][1] = (x+1)*(y+1);
             f[y][x] = g[0][0] + g[1][0] + g[0][1] + g[1][1];
         }
     }

     \endcode
     *
     * The allocation and computation of g is within f's loop over x,
     * and enough of g is computed to satisfy all that f will need for
     * that iteration. This has excellent locality - values of g are
     * used as soon as they are computed, but it does redundant
     * work. Each value of g ends up getting computed four times. If
     * we instead schedule f like so:
     *
     \code
     g.compute_at(f, y);
     \endcode
     *
     * The equivalent C code is:
     *
     \code
     int f[height][width];
     for (int y = 0; y < height; y++) {
         int g[2][width+1];
         for (int x = 0; x < width; x++) {
             g[0][x] = x*y;
             g[1][x] = x*(y+1);
         }
         for (int x = 0; x < width; x++) {
             f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
         }
     }
     \endcode
     *
     * The allocation and computation of g is within f's loop over y,
     * and enough of g is computed to satisfy all that f will need for
     * that iteration. This does less redundant work (each point in g
     * ends up being evaluated twice), but the locality is not quite
     * as good, and we have to allocate more temporary memory to store
     * g.
     */
    Func &compute_at(const Func &f, const Var &var);

    /** Schedule a function to be computed within the iteration over
     * some dimension of an update domain. Produces equivalent code
     * to the version of compute_at that takes a Var. */
    Func &compute_at(const Func &f, const RVar &var);

    /** Schedule a function to be computed within the iteration over
     * a given LoopLevel. */
    Func &compute_at(LoopLevel loop_level);

    /** Schedule the iteration over the initial definition of this function
     *  to be fused with another stage 's' from outermost loop to a
     * given LoopLevel. */
    // @{
    Func &compute_with(const Stage &s, const VarOrRVar &var, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
    Func &compute_with(const Stage &s, const VarOrRVar &var, LoopAlignStrategy align = LoopAlignStrategy::Auto);
    Func &compute_with(LoopLevel loop_level, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
    Func &compute_with(LoopLevel loop_level, LoopAlignStrategy align = LoopAlignStrategy::Auto);

    /** Compute all of this function once ahead of time. Reusing
     * the example in \ref Func::compute_at :
     *
     \code
     Func f, g;
     Var x, y;
     g(x, y) = x*y;
     f(x, y) = g(x, y) + g(x, y+1) + g(x+1, y) + g(x+1, y+1);

     g.compute_root();
     \endcode
     *
     * is equivalent to
     *
     \code
     int f[height][width];
     int g[height+1][width+1];
     for (int y = 0; y < height+1; y++) {
         for (int x = 0; x < width+1; x++) {
             g[y][x] = x*y;
         }
     }
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x++) {
             f[y][x] = g[y][x] + g[y+1][x] + g[y][x+1] + g[y+1][x+1];
         }
     }
     \endcode
     *
     * g is computed once ahead of time, and enough is computed to
     * satisfy all uses of it. This does no redundant work (each point
     * in g is evaluated once), but has poor locality (values of g are
     * probably not still in cache when they are used by f), and
     * allocates lots of temporary memory to store g.
     */
    Func &compute_root();

    /** Use the halide_memoization_cache_... interface to store a
     *  computed version of this function across invocations of the
     *  Func.
     *
     * If an eviction_key is provided, it must be constructed with
     * Expr of integer or handle type. The key Expr will be promoted
     * to a uint64_t and can be used with halide_memoization_cache_evict
     * to remove memoized entries using this eviction key from the
     * cache. Memoized computations that do not provide an eviction
     * key will never be evicted by this mechanism.
     *
     * It is invalid to memoize the output of a Pipeline; attempting
     * to do so will issue an error. To cache an entire pipeline,
     * either implement a caching mechanism outside of Halide or
     * explicitly copy out of the cache with another output Func.
     */
    Func &memoize(const EvictionKey &eviction_key = EvictionKey());

    /** Produce this Func asynchronously in a separate
     * thread. Consumers will be run by the task system when the
     * production is complete. If this Func's store level is different
     * to its compute level, consumers will be run concurrently,
     * blocking as necessary to prevent reading ahead of what the
     * producer has computed. If storage is folded, then the producer
     * will additionally not be permitted to run too far ahead of the
     * consumer, to avoid clobbering data that has not yet been
     * used.
     *
     * Take special care when combining this with custom thread pool
     * implementations, as avoiding deadlock with producer-consumer
     * parallelism requires a much more sophisticated parallel runtime
     * than with data parallelism alone. It is strongly recommended
     * you just use Halide's default thread pool, which guarantees no
     * deadlock and a bound on the number of threads launched.
     */
    Func &async();

    /** Expands the storage of the function by an extra dimension
     * to enable ring buffering. For this to be useful the storage
     * of the function has to be hoisted to an upper loop level using
     * \ref Func::hoist_storage. The index for the new ring buffer dimension
     * is calculated implicitly based on a linear combination of the all of
     * the loop variables between hoist_storage and compute_at/store_at
     * loop levels. Scheduling a function with ring_buffer increases the
     * amount of memory required for this function by an *extent* times.
     * ring_buffer is especially useful in combination with \ref Func::async,
     * but can be used without it.
     *
     * The extent is expected to be a positive integer.
     */
    Func &ring_buffer(Expr extent);

    /** Bound the extent of a Func's storage, but not extent of its
     * compute. This can be useful for forcing a function's allocation
     * to be a fixed size, which often means it can go on the stack.
     * If bounds inference decides that it requires more storage for
     * this function than the allocation size you have stated, a runtime
     * error will occur when you try to run the pipeline. */
    Func &bound_storage(const Var &dim, const Expr &bound);

    /** Allocate storage for this function within f's loop over
     * var. Scheduling storage is optional, and can be used to
     * separate the loop level at which storage occurs from the loop
     * level at which computation occurs to trade off between locality
     * and redundant work. This can open the door for two types of
     * optimization.
     *
     * Consider again the pipeline from \ref Func::compute_at :
     \code
     Func f, g;
     Var x, y;
     g(x, y) = x*y;
     f(x, y) = g(x, y) + g(x+1, y) + g(x, y+1) + g(x+1, y+1);
     \endcode
     *
     * If we schedule it like so:
     *
     \code
     g.compute_at(f, x).store_at(f, y);
     \endcode
     *
     * Then the computation of g takes place within the loop over x,
     * but the storage takes place within the loop over y:
     *
     \code
     int f[height][width];
     for (int y = 0; y < height; y++) {
         int g[2][width+1];
         for (int x = 0; x < width; x++) {
             g[0][x] = x*y;
             g[0][x+1] = (x+1)*y;
             g[1][x] = x*(y+1);
             g[1][x+1] = (x+1)*(y+1);
             f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
         }
     }
     \endcode
     *
     * Provided the for loop over x is serial, halide then
     * automatically performs the following sliding window
     * optimization:
     *
     \code
     int f[height][width];
     for (int y = 0; y < height; y++) {
         int g[2][width+1];
         for (int x = 0; x < width; x++) {
             if (x == 0) {
                 g[0][x] = x*y;
                 g[1][x] = x*(y+1);
             }
             g[0][x+1] = (x+1)*y;
             g[1][x+1] = (x+1)*(y+1);
             f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
         }
     }
     \endcode
     *
     * Two of the assignments to g only need to be done when x is
     * zero. The rest of the time, those sites have already been
     * filled in by a previous iteration. This version has the
     * locality of compute_at(f, x), but allocates more memory and
     * does much less redundant work.
     *
     * Halide then further optimizes this pipeline like so:
     *
     \code
     int f[height][width];
     for (int y = 0; y < height; y++) {
         int g[2][2];
         for (int x = 0; x < width; x++) {
             if (x == 0) {
                 g[0][0] = x*y;
                 g[1][0] = x*(y+1);
             }
             g[0][(x+1)%2] = (x+1)*y;
             g[1][(x+1)%2] = (x+1)*(y+1);
             f[y][x] = g[0][x%2] + g[1][x%2] + g[0][(x+1)%2] + g[1][(x+1)%2];
         }
     }
     \endcode
     *
     * Halide has detected that it's possible to use a circular buffer
     * to represent g, and has reduced all accesses to g modulo 2 in
     * the x dimension. This optimization only triggers if the for
     * loop over x is serial, and if halide can statically determine
     * some power of two large enough to cover the range needed. For
     * powers of two, the modulo operator compiles to more efficient
     * bit-masking. This optimization reduces memory usage, and also
     * improves locality by reusing recently-accessed memory instead
     * of pulling new memory into cache.
     *
     */
    Func &store_at(const Func &f, const Var &var);

    /** Equivalent to the version of store_at that takes a Var, but
     * schedules storage within the loop over a dimension of a
     * reduction domain */
    Func &store_at(const Func &f, const RVar &var);

    /** Equivalent to the version of store_at that takes a Var, but
     * schedules storage at a given LoopLevel. */
    Func &store_at(LoopLevel loop_level);

    /** Equivalent to \ref Func::store_at, but schedules storage
     * outside the outermost loop. */
    Func &store_root();

    /** Hoist storage for this function within f's loop over
     * var. This is different from \ref Func::store_at, because hoist_storage
     * simply moves an actual allocation to a given loop level and
     * doesn't trigger any of the optimizations such as sliding window.
     * Hoisting storage is optional and can be used as an optimization
     * to avoid unnecessary allocations by moving it out from an inner
     * loop.
     *
     * Consider again the pipeline from \ref Func::compute_at :
     \code
     Func f, g;
     Var x, y;
     g(x, y) = x*y;
     f(x, y) = g(x, y) + g(x, y+1) + g(x+1, y) + g(x+1, y+1);
     \endcode
     *
     * If we schedule f like so:
     *
     \code
     g.compute_at(f, x);
     \endcode
     *
     * Then the C code equivalent to this pipeline will look like this
     *
     \code

     int f[height][width];
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x++) {
             int g[2][2];
             g[0][0] = x*y;
             g[0][1] = (x+1)*y;
             g[1][0] = x*(y+1);
             g[1][1] = (x+1)*(y+1);
             f[y][x] = g[0][0] + g[1][0] + g[0][1] + g[1][1];
         }
     }

     \endcode
     *
     * Note the allocation for g inside of the loop over variable x which
     * can happen for each iteration of the inner loop (in total height * width times).
     * In some cases allocation can be expensive, so it might be better to do it once
     * and reuse allocated memory across all iterations of the loop.
     *
     * This can be done by scheduling g like so:
     *
     \code
     g.compute_at(f, x).hoist_storage(f, Var::outermost());
     \endcode
     *
     * Then the C code equivalent to this pipeline will look like this
     *
     \code

     int f[height][width];
     int g[2][2];
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x++) {
             g[0][0] = x*y;
             g[0][1] = (x+1)*y;
             g[1][0] = x*(y+1);
             g[1][1] = (x+1)*(y+1);
             f[y][x] = g[0][0] + g[1][0] + g[0][1] + g[1][1];
         }
     }

     \endcode
     *
     * hoist_storage can be used together with \ref Func::store_at and
     * \ref Func::fold_storage (for example, to hoist the storage allocated
     * after sliding window optimization).
     *
     */
    Func &hoist_storage(const Func &f, const Var &var);

    /** Equivalent to the version of hoist_storage that takes a Var, but
     * schedules storage within the loop over a dimension of a
     * reduction domain */
    Func &hoist_storage(const Func &f, const RVar &var);

    /** Equivalent to the version of hoist_storage that takes a Var, but
     * schedules storage at a given LoopLevel. */
    Func &hoist_storage(LoopLevel loop_level);

    /** Equivalent to \ref Func::hoist_storage_root, but schedules storage
     * outside the outermost loop. */
    Func &hoist_storage_root();

    /** Aggressively inline all uses of this function. This is the
     * default schedule, so you're unlikely to need to call this. For
     * a Func with an update definition, that means it gets computed
     * as close to the innermost loop as possible.
     *
     * Consider once more the pipeline from \ref Func::compute_at :
     *
     \code
     Func f, g;
     Var x, y;
     g(x, y) = x*y;
     f(x, y) = g(x, y) + g(x+1, y) + g(x, y+1) + g(x+1, y+1);
     \endcode
     *
     * Leaving g as inline, this compiles to code equivalent to the following C:
     *
     \code
     int f[height][width];
     for (int y = 0; y < height; y++) {
         for (int x = 0; x < width; x++) {
             f[y][x] = x*y + x*(y+1) + (x+1)*y + (x+1)*(y+1);
         }
     }
     \endcode
     */
    Func &compute_inline();

    /** Get a handle on an update step for the purposes of scheduling
     * it. */
    Stage update(int idx = 0);

    /** Set the type of memory this Func should be stored in. Controls
     * whether allocations go on the stack or the heap on the CPU, and
     * in global vs shared vs local on the GPU. See the documentation
     * on MemoryType for more detail. */
    Func &store_in(MemoryType memory_type);

    /** Trace all loads from this Func by emitting calls to
     * halide_trace. If the Func is inlined, this has no
     * effect. */
    Func &trace_loads();

    /** Trace all stores to the buffer backing this Func by emitting
     * calls to halide_trace. If the Func is inlined, this call
     * has no effect. */
    Func &trace_stores();

    /** Trace all realizations of this Func by emitting calls to
     * halide_trace. */
    Func &trace_realizations();

    /** Add a string of arbitrary text that will be passed thru to trace
     * inspection code if the Func is realized in trace mode. (Funcs that are
     * inlined won't have their tags emitted.) Ignored entirely if
     * tracing is not enabled for the Func (or globally).
     */
    Func &add_trace_tag(const std::string &trace_tag);

    /** Marks this function as a function that should not be profiled
     * when using the target feature Profile or ProfileByTimer.
     * This is useful when this function is does too little work at once
     * such that the overhead of setting the profiling token might
     * become significant, or that the measured time is not representative
     * due to modern processors (instruction level parallelism, out-of-order
     * execution). */
    Func &no_profiling();

    /** Get a handle on the internal halide function that this Func
     * represents. Useful if you want to do introspection on Halide
     * functions */
    Internal::Function function() const {
        return func;
    }

    /** You can cast a Func to its pure stage for the purposes of
     * scheduling it. */
    operator Stage() const;

    /** Get a handle on the output buffer for this Func. Only relevant
     * if this is the output Func in a pipeline. Useful for making
     * static promises about strides, mins, and extents. */
    // @{
    OutputImageParam output_buffer() const;
    std::vector<OutputImageParam> output_buffers() const;
    // @}

    /** Use a Func as an argument to an external stage. */
    operator ExternFuncArgument() const;

    /** Infer the arguments to the Func, sorted into a canonical order:
     * all buffers (sorted alphabetically by name), followed by all non-buffers
     * (sorted alphabetically by name).
     This lets you write things like:
     \code
     func.compile_to_assembly("/dev/stdout", func.infer_arguments());
     \endcode
     */
    std::vector<Argument> infer_arguments() const;

    /** Return the current StageSchedule associated with this initial
     * Stage of this Func. For introspection only: to modify schedule,
     * use the Func interface. */
    const Internal::StageSchedule &get_schedule() const {
        return Stage(*this).get_schedule();
    }
};

namespace Internal {

template<typename Last>
inline void check_types(const Tuple &t, int idx) {
    using T = std::remove_pointer_t<std::remove_reference_t<Last>>;
    user_assert(t[idx].type() == type_of<T>())
        << "Can't evaluate expression "
        << t[idx] << " of type " << t[idx].type()
        << " as a scalar of type " << type_of<T>() << "\n";
}

template<typename First, typename Second, typename... Rest>
inline void check_types(const Tuple &t, int idx) {
    check_types<First>(t, idx);
    check_types<Second, Rest...>(t, idx + 1);
}

template<typename Last>
inline void assign_results(Realization &r, int idx, Last last) {
    using T = std::remove_pointer_t<std::remove_reference_t<Last>>;
    *last = Buffer<T>(r[idx])();
}

template<typename First, typename Second, typename... Rest>
inline void assign_results(Realization &r, int idx, First first, Second second, Rest &&...rest) {
    assign_results<First>(r, idx, first);
    assign_results<Second, Rest...>(r, idx + 1, second, rest...);
}

}  // namespace Internal

/** JIT-Compile and run enough code to evaluate a Halide
 * expression. This can be thought of as a scalar version of
 * \ref Func::realize */
template<typename T>
HALIDE_NO_USER_CODE_INLINE T evaluate(JITUserContext *ctx, const Expr &e) {
    user_assert(e.type() == type_of<T>())
        << "Can't evaluate expression "
        << e << " of type " << e.type()
        << " as a scalar of type " << type_of<T>() << "\n";
    Func f;
    f() = e;
    Buffer<T, 0> im = f.realize(ctx);
    return im();
}

/** evaluate with a default user context */
template<typename T>
HALIDE_NO_USER_CODE_INLINE T evaluate(const Expr &e) {
    return evaluate<T>(nullptr, e);
}

/** JIT-compile and run enough code to evaluate a Halide Tuple. */
template<typename First, typename... Rest>
HALIDE_NO_USER_CODE_INLINE void evaluate(JITUserContext *ctx, Tuple t, First first, Rest &&...rest) {
    Internal::check_types<First, Rest...>(t, 0);

    Func f;
    f() = t;
    Realization r = f.realize(ctx);
    Internal::assign_results(r, 0, first, rest...);
}

/** JIT-compile and run enough code to evaluate a Halide Tuple. */
template<typename First, typename... Rest>
HALIDE_NO_USER_CODE_INLINE void evaluate(Tuple t, First first, Rest &&...rest) {
    evaluate<First, Rest...>(nullptr, std::move(t), std::forward<First>(first), std::forward<Rest...>(rest...));
}

namespace Internal {

inline void schedule_scalar(Func f) {
    Target t = get_jit_target_from_environment();
    if (t.has_gpu_feature()) {
        f.gpu_single_thread();
    }
    if (t.has_feature(Target::HVX)) {
        f.hexagon();
    }
}

}  // namespace Internal

/** JIT-Compile and run enough code to evaluate a Halide
 * expression. This can be thought of as a scalar version of
 * \ref Func::realize. Can use GPU if jit target from environment
 * specifies one.
 */
template<typename T>
HALIDE_NO_USER_CODE_INLINE T evaluate_may_gpu(const Expr &e) {
    user_assert(e.type() == type_of<T>())
        << "Can't evaluate expression "
        << e << " of type " << e.type()
        << " as a scalar of type " << type_of<T>() << "\n";
    Func f;
    f() = e;
    Internal::schedule_scalar(f);
    Buffer<T, 0> im = f.realize();
    return im();
}

/** JIT-compile and run enough code to evaluate a Halide Tuple. Can
 *  use GPU if jit target from environment specifies one. */
// @{
template<typename First, typename... Rest>
HALIDE_NO_USER_CODE_INLINE void evaluate_may_gpu(Tuple t, First first, Rest &&...rest) {
    Internal::check_types<First, Rest...>(t, 0);

    Func f;
    f() = t;
    Internal::schedule_scalar(f);
    Realization r = f.realize();
    Internal::assign_results(r, 0, first, rest...);
}
// @}

}  // namespace Halide

#endif

namespace Halide {

class GeneratorContext;
using GeneratorParamsMap = std::map<std::string, std::string>;

namespace Internal {

enum class ArgInfoKind { Scalar,
                         Function,
                         Buffer };

enum class ArgInfoDirection { Input,
                              Output };

/**
 * AbstractGenerator is an ABC that defines the API a Generator must provide
 * to work with the existing Generator infrastructure (GenGen, RunGen, execute_generator(),
 * Generator Stubs). The existing Generator<>-based instances all implement
 * this API, but any other code that implements this (and uses RegisterGenerator
 * to register itself) should be indistinguishable from a user perspective.
 *
 * An AbstractGenerator is meant to be "single-use"; typically lifetimes will be
 * something like:
 * - create an instance (with a specific Target)
 * - optionally set GeneratorParam values
 * - optionally re-bind inputs (if using in JIT or Stub modes)
 * - call build_pipeline()
 * - optionally call output_func() to get the output(s) (if using in JIT or Stub modes)
 * - discard the instance
 *
 * AbstractGenerators should be fairly cheap to instantiate! Don't try to re-use
 * one by re-setting inputs and calling build_pipeline() multiple times.
 *
 * Note that an AbstractGenerator instance is (generally) stateful in terms of the order
 * that methods should be called; calling the methods out of order may cause
 * assert-fails or other undesirable behavior. Read the method notes carefully!
 */
class AbstractGenerator {
public:
    virtual ~AbstractGenerator() = default;

    /** ArgInfo is a struct to contain name-and-type information for the inputs and outputs to
     * the Pipeline that build_pipeline() will return.
     *
     * Note that this looks rather similar to Halide::Argument, but unfortunately
     * that is not a good fit here, as it cannot represent Func inputs (only
     * Buffer and Scalar), nor can it really handle Outputs.
     */
    struct ArgInfo {
        std::string name;
        ArgInfoDirection dir = ArgInfoDirection::Input;
        ArgInfoKind kind = ArgInfoKind::Scalar;
        // Note that this can have multiple entries for Tuple-valued Inputs or Outputs
        std::vector<Type> types;
        int dimensions = 0;
    };

    /** Return the name of this Generator. (This should always be the name
     * used to register it.) */
    virtual std::string name() = 0;

    /** Return the Target and autoscheduler info that this Generator
     * was created with. Always legal to call on any AbstractGenerator instance,
     * regardless of what other methods have been called. (All AbstractGenerator instances
     * are expected to be created with immutable values for these, which can't be
     * changed for a given instance after creation. Note that Generator<> based subclasses
     * can customize Target somewhat via init_from_context(); see Generator.h for more info.)
     *
     * CALL-AFTER: any
     * CALL-BEFORE: any
     */
    virtual GeneratorContext context() const = 0;

    /** Return a list of all the ArgInfos for this generator. The list will be in the order
     * that the input and outputs are declared (possibly interleaved).
     * Any inputs or outputs added by a configure() method will be in the list,
     * at the end, in the order added.
     * All input and output names will be unique within a given Generator instance.
     *
     * CALL-AFTER: configure()
     * CALL-BEFORE: any
     */
    virtual std::vector<ArgInfo> arginfos() = 0;

    /** Set the value for a specific GeneratorParam for an AbstractGenerator instance.
     *
     * Names that aren't known generator names should assert-fail.
     *
     * Values that can't be parsed for the specific GeneratorParam (e.g. passing "foo" where
     * an integer is expected) should assert-fail at some point (either immediately, or when
     * build_pipeline() is called)
     *
     * This can be called multiple times, but only prior to build_pipeline().
     *
     * CALL-AFTER: none
     * CALL-BEFORE: build_pipeline
     */
    // @{
    virtual void set_generatorparam_value(const std::string &name, const std::string &value) = 0;
    virtual void set_generatorparam_value(const std::string &name, const LoopLevel &loop_level) = 0;
    // @}

    /** Build and return the Pipeline for this AbstractGenerator. This method should be called
     * only once per instance.
     *
     * CALL-AFTER: set_generatorparam_value, bind_input
     * CALL-BEFORE: input_parameter, output_func, external_code_map
     */
    virtual Pipeline build_pipeline() = 0;

    /** Given the name of an input, return the Parameter(s) for that input.
     * (Most inputs will have exactly one, but inputs that are declared as arrays
     * will have multiple.)
     *
     * CALL-AFTER: build_pipeline
     * CALL-BEFORE: none
     */
    virtual std::vector<Parameter> input_parameter(const std::string &name) = 0;

    /** Given the name of an output, return the Func(s) for that output.
     *
     * Most outputs will have exactly one, but outputs that are declared as arrays will have multiple.
     *
     * Note that outputs with Tuple values are still just a single Func, though they do get realized
     * as multiple Buffers.
     *
     * Must be called after build_pipeline(), since the output Funcs will be undefined prior to that.
     *
     * CALL-AFTER: build_pipeline()
     * CALL-BEFORE: none
     */
    virtual std::vector<Func> output_func(const std::string &name) = 0;

    /** Rebind a specified Input to refer to the given piece of IR, replacing the
     * default ImageParam / Param in place for that Input. Basic type-checking is
     * done to ensure that inputs are still sane (e.g. types, dimensions, etc must match expectations).
     *
     * CALL-AFTER: set_generatorparam_value
     * CALL-BEFORE: build_pipeline
     */
    // @{
    virtual void bind_input(const std::string &name, const std::vector<Parameter> &v) = 0;
    virtual void bind_input(const std::string &name, const std::vector<Func> &v) = 0;
    virtual void bind_input(const std::string &name, const std::vector<Expr> &v) = 0;
    // @}

    /** Emit a Generator Stub (.stub.h) file to the given path. Not all Generators support this.
     *
     * If you call this method, you should not call any other AbstractGenerator methods
     * on this instance, before or after this call.
     *
     * If the Generator is capable of emitting a Stub, do so and return true. (Errors
     * during stub emission should assert-fail rather than returning false.)
     *
     * If the Generator is not capable of emitting a Stub, do nothing and return false.
     *
     * CALL-AFTER: none
     * CALL-BEFORE: none
     */
    virtual bool emit_cpp_stub(const std::string &stub_file_path) = 0;

    /** Emit a Serialized Halide Pipeline (.hlpipe) file to the given path. Not all Generators support this.
     *
     * If you call this method, you should not call any other AbstractGenerator methods
     * on this instance, before or after this call.
     *
     * If the Generator is capable of emitting an hlpipe, do so and return true. (Errors
     * during hlpipe emission should assert-fail rather than returning false.)
     *
     * If the Generator is not capable of emitting an hlpipe, do nothing and return false.
     *
     * CALL-AFTER: none
     * CALL-BEFORE: none
     */
    virtual bool emit_hlpipe(const std::string &hlpipe_file_path) = 0;

    /** By default, a Generator must declare all Inputs before all Outputs.
     *  In some unusual situations (e.g. metaprogramming situations), it's
     * desirable to allow them to be declared out-of-order and put the onus
     * of a non-obvious call order on the coder; a Generator may override this
     * to return 'true' to allow this behavior.
     */
    virtual bool allow_out_of_order_inputs_and_outputs() const = 0;

    // Below are some concrete methods that build on top of the rest of the AbstractGenerator API.
    // Note that they are nonvirtual. TODO: would these be better as freestanding methods that
    // just take AbstractGeneratorPtr as arguments?

    /** Call generate() and produce a Module for the result.
     *If function_name is empty, generator_name() will be used for the function. */
    Module build_module(const std::string &function_name = "");

    /**
     * Build a module that is suitable for using for gradient descent calculation in TensorFlow or PyTorch.
     *
     * Essentially:
     *   - A new Pipeline is synthesized from the current Generator (according to the rules below)
     *   - The new Pipeline is autoscheduled (if autoscheduling is requested, but it would be odd not to do so)
     *   - The Pipeline is compiled to a Module and returned
     *
     * The new Pipeline is adjoint to the original; it has:
     *   - All the same inputs as the original, in the same order
     *   - Followed by one grad-input for each original output
     *   - Followed by one output for each unique pairing of original-output + original-input.
     *     (For the common case of just one original-output, this amounts to being one output for each original-input.)
     */
    Module build_gradient_module(const std::string &function_name);

    /**
     * JIT the AbstractGenerator into a Callable (using the currently-set
     * Target) and return it.
     *
     * If jit_handlers is not null, set the jitted func's jit_handlers to use a copy of it.
     *
     * If jit_externs is not null, use it to set the jitted func's external dependencies.
     */
    Callable compile_to_callable(const JITHandlers *jit_handlers = nullptr,
                                 const std::map<std::string, JITExtern> *jit_externs = nullptr);

    /*
     * Set all the GeneratorParams in the map. This is equivalent to simply calling the
     * `set_generatorparam_value()` method in a loop over the map, but is quite convenient. */
    void set_generatorparam_values(const GeneratorParamsMap &m);
};

using AbstractGeneratorPtr = std::unique_ptr<AbstractGenerator>;

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_ADD_ATOMIC_MUTEX_H
#define HALIDE_ADD_ATOMIC_MUTEX_H

#include <map>

/** \file
 * Defines the lowering pass that insert mutex allocation code & locks
 * for the atomic nodes that require mutex locks. It also checks whether
 * the atomic operation is valid. It rejects algorithms that have indexing
 * on left-hand-side which references the buffer itself, e.g.
 * f(clamp(f(r), 0, 100)) = f(r) + 1
 * If the SplitTuple pass does not lift out the Provide value as a let
 * expression. This is confirmed by checking whether the Provide nodes
 * inside an Atomic node  have let binding values accessing the buffers
 * inside the atomic node.
 * Finally, it lifts the store indexing expressions inside the atomic node
 * outside of the atomic to avoid side-effects inside those expressions
 * being evaluated twice. */

namespace Halide {
namespace Internal {

class Function;

Stmt add_atomic_mutex(Stmt s, const std::vector<Function> &outputs);

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_INTERNAL_ADD_IMAGE_CHECKS_H
#define HALIDE_INTERNAL_ADD_IMAGE_CHECKS_H

/** \file
 *
 * Defines the lowering pass that adds the assertions that validate
 * input and output buffers.
 */
#include <map>
#include <string>
#include <vector>

#ifndef HALIDE_BOUNDS_H
#define HALIDE_BOUNDS_H

/** \file
 * Methods for computing the upper and lower bounds of an expression,
 * and the regions of a function read or written by a statement.
 */

#ifndef HALIDE_INTERVAL_H
#define HALIDE_INTERVAL_H

/** \file
 * Defines the Interval class
 */


namespace Halide {
namespace Internal {

/** A class to represent ranges of Exprs. Can be unbounded above or below. */
struct Interval {

    /** Exprs to represent positive and negative infinity */
#ifdef COMPILING_HALIDE
    static HALIDE_ALWAYS_INLINE Expr pos_inf() {
        return pos_inf_expr;
    }
    static HALIDE_ALWAYS_INLINE Expr neg_inf() {
        return neg_inf_expr;
    }
#else
    static Expr pos_inf() {
        return pos_inf_noinline();
    }
    static Expr neg_inf() {
        return neg_inf_noinline();
    }
#endif

    /** The lower and upper bound of the interval. They are included
     * in the interval. */
    Expr min, max;

    /** A default-constructed Interval is everything */
    Interval()
        : min(neg_inf()), max(pos_inf()) {
    }

    /** Construct an interval from a lower and upper bound. */
    Interval(const Expr &min, const Expr &max)
        : min(min), max(max) {
        internal_assert(min.defined() && max.defined());
    }

    /** The interval representing everything. */
    static Interval everything();

    /** The interval representing nothing. */
    static Interval nothing();

    /** Construct an interval representing a single point */
    static Interval single_point(const Expr &e);

    /** Is the interval the empty set */
    bool is_empty() const;

    /** Is the interval the entire range */
    bool is_everything() const;

    /** Is the interval just a single value (min == max) */
    bool is_single_point() const;

    /** Is the interval a particular single value */
    bool is_single_point(const Expr &e) const;

    /** Does the interval have a finite least upper bound */
    bool has_upper_bound() const;

    /** Does the interval have a finite greatest lower bound */
    bool has_lower_bound() const;

    /** Does the interval have a finite upper and lower bound */
    bool is_bounded() const;

    /** Is the interval the same as another interval */
    bool same_as(const Interval &other) const;

    /** Expand the interval to include another Interval */
    void include(const Interval &i);

    /** Expand the interval to include an Expr */
    void include(const Expr &e);

    /** Construct the smallest interval containing two intervals. */
    static Interval make_union(const Interval &a, const Interval &b);

    /** Construct the largest interval contained within two other intervals. */
    static Interval make_intersection(const Interval &a, const Interval &b);

    /** An eagerly-simplifying max of two Exprs that respects infinities. */
    static Expr make_max(const Expr &a, const Expr &b);

    /** An eagerly-simplifying min of two Exprs that respects infinities. */
    static Expr make_min(const Expr &a, const Expr &b);

    /** Equivalent to same_as. Exists so that the autoscheduler can
     * compare two map<string, Interval> for equality in order to
     * cache computations. */
    bool operator==(const Interval &other) const;

private:
    static Expr neg_inf_expr, pos_inf_expr;

    // Never used inside libHalide; provided for Halide tests, to avoid needing to export
    // data fields in some build environments.
    static Expr pos_inf_noinline();
    static Expr neg_inf_noinline();
};

}  // namespace Internal
}  // namespace Halide

#endif

namespace Halide {
namespace Internal {

class Function;

typedef std::map<std::pair<std::string, int>, Interval> FuncValueBounds;

const FuncValueBounds &empty_func_value_bounds();

/** Given an expression in some variables, and a map from those
 * variables to their bounds (in the form of (minimum possible value,
 * maximum possible value)), compute two expressions that give the
 * minimum possible value and the maximum possible value of this
 * expression. Max or min may be undefined expressions if the value is
 * not bounded above or below. If the expression is a vector, also
 * takes the bounds across the vector lanes and returns a scalar
 * result.
 *
 * This is for tasks such as deducing the region of a buffer
 * loaded by a chunk of code.
 */
Interval bounds_of_expr_in_scope(const Expr &expr,
                                 const Scope<Interval> &scope,
                                 const FuncValueBounds &func_bounds = empty_func_value_bounds(),
                                 bool const_bound = false);

/** Given a varying expression, try to find a constant that is either:
 * An upper bound (always greater than or equal to the expression), or
 * A lower bound (always less than or equal to the expression)
 * If it fails, returns an undefined Expr. */
enum class Direction { Upper,
                       Lower };
Expr find_constant_bound(const Expr &e, Direction d,
                         const Scope<Interval> &scope = Scope<Interval>::empty_scope());

/** Find bounds for a varying expression that are either constants or
 * +/-inf. */
Interval find_constant_bounds(const Expr &e, const Scope<Interval> &scope);

/** Represents the bounds of a region of arbitrary dimension. Zero
 * dimensions corresponds to a scalar region. */
struct Box {
    /** The conditions under which this region may be touched. */
    Expr used;

    /** The bounds if it is touched. */
    std::vector<Interval> bounds;

    Box() = default;
    explicit Box(size_t sz)
        : bounds(sz) {
    }
    explicit Box(const std::vector<Interval> &b)
        : bounds(b) {
    }

    size_t size() const {
        return bounds.size();
    }
    bool empty() const {
        return bounds.empty();
    }
    Interval &operator[](size_t i) {
        return bounds[i];
    }
    const Interval &operator[](size_t i) const {
        return bounds[i];
    }
    void resize(size_t sz) {
        bounds.resize(sz);
    }
    void push_back(const Interval &i) {
        bounds.push_back(i);
    }

    /** Check if the used condition is defined and not trivially true. */
    bool maybe_unused() const;

    friend std::ostream &operator<<(std::ostream &stream, const Box &b);
};

/** Expand box a to encompass box b */
void merge_boxes(Box &a, const Box &b);

/** Test if box a could possibly overlap box b. */
bool boxes_overlap(const Box &a, const Box &b);

/** The union of two boxes */
Box box_union(const Box &a, const Box &b);

/** The intersection of two boxes */
Box box_intersection(const Box &a, const Box &b);

/** Test if box a provably contains box b */
bool box_contains(const Box &a, const Box &b);

/** Compute rectangular domains large enough to cover all the 'Call's to each
 * function that occurs within a given statement or expression. This is useful
 * for figuring out what regions of things to evaluate. Respects control flow
 * (e.g. encodes if statement conditions), but assumes all encountered asserts
 * pass. If it encounters an assert(false) in one if branch, assumes the
 * opposite if branch runs unconditionally. */
// @{
std::map<std::string, Box> boxes_required(const Expr &e,
                                          const Scope<Interval> &scope = Scope<Interval>::empty_scope(),
                                          const FuncValueBounds &func_bounds = empty_func_value_bounds());
std::map<std::string, Box> boxes_required(Stmt s,
                                          const Scope<Interval> &scope = Scope<Interval>::empty_scope(),
                                          const FuncValueBounds &func_bounds = empty_func_value_bounds());
// @}

/** Compute rectangular domains large enough to cover all the 'Provides's to
 * each function that occurs within a given statement or expression. Handles
 * asserts in the same way as boxes_required. */
// @{
std::map<std::string, Box> boxes_provided(const Expr &e,
                                          const Scope<Interval> &scope = Scope<Interval>::empty_scope(),
                                          const FuncValueBounds &func_bounds = empty_func_value_bounds());
std::map<std::string, Box> boxes_provided(Stmt s,
                                          const Scope<Interval> &scope = Scope<Interval>::empty_scope(),
                                          const FuncValueBounds &func_bounds = empty_func_value_bounds());
// @}

/** Compute rectangular domains large enough to cover all the 'Call's and
 * 'Provides's to each function that occurs within a given statement or
 * expression. Handles asserts in the same way as boxes_required. */
// @{
std::map<std::string, Box> boxes_touched(const Expr &e,
                                         const Scope<Interval> &scope = Scope<Interval>::empty_scope(),
                                         const FuncValueBounds &func_bounds = empty_func_value_bounds());
std::map<std::string, Box> boxes_touched(Stmt s,
                                         const Scope<Interval> &scope = Scope<Interval>::empty_scope(),
                                         const FuncValueBounds &func_bounds = empty_func_value_bounds());
// @}

/** Variants of the above that are only concerned with a single function. */
// @{
Box box_required(const Expr &e, const std::string &fn,
                 const Scope<Interval> &scope = Scope<Interval>::empty_scope(),
                 const FuncValueBounds &func_bounds = empty_func_value_bounds());
Box box_required(Stmt s, const std::string &fn,
                 const Scope<Interval> &scope = Scope<Interval>::empty_scope(),
                 const FuncValueBounds &func_bounds = empty_func_value_bounds());

Box box_provided(const Expr &e, const std::string &fn,
                 const Scope<Interval> &scope = Scope<Interval>::empty_scope(),
                 const FuncValueBounds &func_bounds = empty_func_value_bounds());
Box box_provided(Stmt s, const std::string &fn,
                 const Scope<Interval> &scope = Scope<Interval>::empty_scope(),
                 const FuncValueBounds &func_bounds = empty_func_value_bounds());

Box box_touched(const Expr &e, const std::string &fn,
                const Scope<Interval> &scope = Scope<Interval>::empty_scope(),
                const FuncValueBounds &func_bounds = empty_func_value_bounds());
Box box_touched(Stmt s, const std::string &fn,
                const Scope<Interval> &scope = Scope<Interval>::empty_scope(),
                const FuncValueBounds &func_bounds = empty_func_value_bounds());
// @}

/** Compute the maximum and minimum possible value for each function
 * in an environment. */
FuncValueBounds compute_function_value_bounds(const std::vector<std::string> &order,
                                              const std::map<std::string, Function> &env);

/* Find an upper bound of bounds.max - bounds.min. */
Expr span_of_bounds(const Interval &bounds);

void bounds_test();

}  // namespace Internal
}  // namespace Halide

#endif

namespace Halide {

struct Target;

namespace Internal {

class Function;

/** Insert checks to make sure a statement doesn't read out of bounds
 * on inputs or outputs, and that the inputs and outputs conform to
 * the format required (e.g. stride.0 must be 1).
 */
Stmt add_image_checks(const Stmt &s,
                      const std::vector<Function> &outputs,
                      const Target &t,
                      const std::vector<std::string> &order,
                      const std::map<std::string, Function> &env,
                      const FuncValueBounds &fb,
                      bool will_inject_host_copies);

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_INTERNAL_ADD_PARAMETER_CHECKS_H
#define HALIDE_INTERNAL_ADD_PARAMETER_CHECKS_H

/** \file
 *
 * Defines the lowering pass that adds the assertions that validate
 * scalar parameters.
 */
#include <vector>


namespace Halide {
namespace Internal {

/** Insert checks to make sure that all referenced parameters meet
 * their constraints. Also injects any custom requirements provided
 * by the user. */
Stmt add_parameter_checks(const std::vector<Stmt> &requirements, Stmt s, const Target &t);

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_INTERNAL_ADD_SPLIT_FACTOR_CHECKS_H
#define HALIDE_INTERNAL_ADD_SPLIT_FACTOR_CHECKS_H

/** \file
 *
 * Defines the lowering pass that adds the assertions that all split factors are
 * strictly positive.
 */
#include <map>


namespace Halide {
namespace Internal {

class Function;

/** Insert checks that all split factors that depend on scalar parameters are
 * strictly positive. */
Stmt add_split_factor_checks(const Stmt &s, const std::map<std::string, Function> &env);

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_ALIGN_LOADS_H
#define HALIDE_ALIGN_LOADS_H

/** \file
 * Defines a lowering pass that rewrites unaligned loads into
 * sequences of aligned loads.
 */

namespace Halide {
namespace Internal {

/** Attempt to rewrite unaligned loads from buffers which are known to
 * be aligned to instead load aligned vectors that cover the original
 * load, and then slice the original load out of the aligned
 * vectors.
 *
 * Types that are less than min_bytes_to_align in size are not
 * rewritten. This is intended to make a distinction between data that
 * will be accessed as a scalar and that which will be accessed as a
 * vector.
 */
Stmt align_loads(const Stmt &s, int alignment, int min_bytes_to_align);

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_ALLOCATION_BOUNDS_INFERENCE_H
#define HALIDE_ALLOCATION_BOUNDS_INFERENCE_H

/** \file
 * Defines the lowering pass that determines how large internal allocations should be.
 */
#include <map>
#include <string>
#include <utility>


namespace Halide {
namespace Internal {

class Function;

/** Take a partially statement with Realize nodes in terms of
 * variables, and define values for those variables. */
Stmt allocation_bounds_inference(Stmt s,
                                 const std::map<std::string, Function> &env,
                                 const std::map<std::pair<std::string, int>, Interval> &func_bounds);
}  // namespace Internal
}  // namespace Halide

#endif
#ifndef APPLY_SPLIT_H
#define APPLY_SPLIT_H

/** \file
 *
 * Defines method that returns a list of let stmts, substitutions, and
 * predicates to be added given a split schedule.
 */

#include <map>
#include <string>
#include <utility>
#include <vector>


namespace Halide {
namespace Internal {

struct ApplySplitResult {
    // If type is "Substitution", then this represents a substitution of
    // variable "name" to value. ProvideValueSubstitution and
    // ProvideArgSubstitution are similar, but only apply to instances
    // found on the LHS or RHS of a call or provide. respectively. If
    // type is "LetStmt", we should insert a new let stmt defining
    // "name" with value "value". If type is "Predicate", we should
    // ignore "name" and the predicate is "value".

    std::string name;
    Expr value;

    enum Type { Substitution = 0,
                SubstitutionInCalls,
                SubstitutionInProvides,
                LetStmt,
                PredicateCalls,
                PredicateProvides,
                Predicate,
                BlendProvides };
    Type type;

    ApplySplitResult(const std::string &n, Expr val, Type t)
        : name(n), value(std::move(val)), type(t) {
    }
    ApplySplitResult(Expr val, Type t = Predicate)
        : name(""), value(std::move(val)), type(t) {
    }
};

/** Given a Split schedule on a definition (init or update), return a list of
 * of predicates on the definition, substitutions that needs to be applied to
 * the definition (in ascending order of application), and let stmts which
 * defined the values of variables referred by the predicates and substitutions
 * (ordered from innermost to outermost let). */
std::vector<ApplySplitResult> apply_split(
    const Split &split, const std::string &prefix,
    std::map<std::string, Expr> &dim_extent_alignment);

/** Compute the loop bounds of the new dimensions resulting from applying the
 * split schedules using the loop bounds of the old dimensions. */
std::vector<std::pair<std::string, Expr>> compute_loop_bounds_after_split(
    const Split &split, const std::string &prefix);

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_ASSOCIATIVE_OPS_TABLE_H
#define HALIDE_ASSOCIATIVE_OPS_TABLE_H

/** \file
 * Tables listing associative operators and their identities.
 */

#ifndef HALIDE_IR_EQUALITY_H
#define HALIDE_IR_EQUALITY_H

/** \file
 * Methods to test Exprs and Stmts for equality of value.
 *
 * These methods traverse the entire IR tree. For equality of reference, use
 * Expr::same_as. If you're comparing non-CSE'd Exprs, use graph_equal or
 * graph_less_than, which is safe for nasty graphs of IR nodes.
 */


namespace Halide {
namespace Internal {

// We want to inline a few quick checks into the caller. These are the actual
// implementations that get called after those quick checks.
bool equal_impl(const IRNode &a, const IRNode &b);
bool graph_equal_impl(const IRNode &a, const IRNode &b);
bool less_than_impl(const IRNode &a, const IRNode &b);
bool graph_less_than_impl(const IRNode &a, const IRNode &b);

/** Compare an Expr to an int literal. This is a somewhat common use of equal in
 * tests. Making this separate avoids constructing an Expr out of the int
 * literal just to check if it's equal to a. */
HALIDE_ALWAYS_INLINE
bool equal(const Expr &a, int b) {
    if (const IntImm *i = a.as<IntImm>()) {
        return (a.type() == Int(32) && i->value == b);
    } else {
        return false;
    }
}

/** Check if two defined Stmts or Exprs are equal. */
HALIDE_ALWAYS_INLINE
bool equal(const IRNode &a, const IRNode &b) {
    if (&a == &b) {
        return true;
    } else if (a.node_type != b.node_type) {
        return false;
    } else {
        return equal_impl(a, b);
    }
}

/** Check if two possible-undefined Stmts or Exprs are equal. */
HALIDE_ALWAYS_INLINE
bool equal(const IRHandle &a, const IRHandle &b) {
    if (!a.defined()) {
        return !b.defined();
    } else if (!b.defined()) {
        return false;
    } else {
        return equal(*(a.get()), *(b.get()));
    }
}

/** Check if two defined Stmts or Exprs are equal. Safe to call on Exprs that
 * haven't been passed to common_subexpression_elimination. */
HALIDE_ALWAYS_INLINE
bool graph_equal(const IRNode &a, const IRNode &b) {
    if (&a == &b) {
        return true;
    } else if (a.node_type != b.node_type) {
        return false;
    } else {
        return graph_equal_impl(a, b);
    }
}

/** Check if two possibly-undefined Stmts or Exprs are equal. Safe to call on
 * Exprs that haven't been passed to common_subexpression_elimination. */
HALIDE_ALWAYS_INLINE
bool graph_equal(const IRHandle &a, const IRHandle &b) {
    if (!a.defined()) {
        return !b.defined();
    } else if (!b.defined()) {
        return false;
    } else {
        return graph_equal(*(a.get()), *(b.get()));
    }
}

/** Check if two defined Stmts or Exprs are in a lexicographic order. For use in
 * map keys. */
HALIDE_ALWAYS_INLINE
bool less_than(const IRNode &a, const IRNode &b) {
    if (&a == &b) {
        return false;
    } else if (a.node_type < b.node_type) {
        return true;
    } else {
        return less_than_impl(a, b);
    }
}

/** Check if two possibly-undefined Stmts or Exprs are in a lexicographic
 * order. For use in map keys. */
HALIDE_ALWAYS_INLINE
bool less_than(const IRHandle &a, const IRHandle &b) {
    if (a.get() == b.get()) {
        return false;
    } else if (!a.defined()) {
        return true;
    } else if (!b.defined()) {
        return false;
    } else {
        return less_than(*(a.get()), *(b.get()));
    }
}

/** Check if two defined Stmts or Exprs are in a lexicographic order. For use in
 * map keys. Safe to use on Exprs that haven't been passed to
 * common_subexpression_elimination. */
HALIDE_ALWAYS_INLINE
bool graph_less_than(const IRNode &a, const IRNode &b) {
    if (&a == &b) {
        return false;
    } else if (a.node_type < b.node_type) {
        return true;
    } else {
        return graph_less_than_impl(a, b);
    }
}

/** Check if two possibly-undefined Stmts or Exprs are in a lexicographic
 * order. For use in map keys. Safe to use on Exprs that haven't been passed to
 * common_subexpression_elimination. */
HALIDE_ALWAYS_INLINE
bool graph_less_than(const IRHandle &a, const IRHandle &b) {
    if (a.get() == b.get()) {
        return false;
    } else if (!a.defined()) {
        return true;
    } else if (!b.defined()) {
        return false;
    } else {
        return graph_less_than(*(a.get()), *(b.get()));
    }
}

/** A compare struct built around less_than, for use as the comparison
 * object in a std::map or std::set. */
struct IRDeepCompare {
    bool operator()(const IRHandle &a, const IRHandle &b) const {
        return less_than(a, b);
    }
};

/** A compare struct built around graph_less_than, for use as the comparison
 * object in a std::map or std::set. */
struct IRGraphDeepCompare {
    bool operator()(const IRHandle &a, const IRHandle &b) const {
        return graph_less_than(a, b);
    }
};

void ir_equality_test();

}  // namespace Internal
}  // namespace Halide

#endif

#include <utility>
#include <vector>

namespace Halide {
namespace Internal {

/**
 * Represent an associative op with its identity. The op may be multi-dimensional,
 * e.g. complex multiplication. 'is_commutative' is set to true if the op is also
 * commutative in addition to being associative.
 *
 * For example, complex multiplication is represented as:
 \code
 AssociativePattern pattern(
    {x0 * y0 - x1 * y1, x1 * y0 + x0 * y1},
    {one, zero},
    true
 );
 \endcode
 */
struct AssociativePattern {
    /** Contain the binary operators for each dimension of the associative op. */
    std::vector<Expr> ops;
    /** Contain the identities for each dimension of the associative op. */
    std::vector<Expr> identities;
    /** Indicate if the associative op is also commutative. */
    bool is_commutative = false;

    AssociativePattern() = default;
    AssociativePattern(size_t size)
        : ops(size), identities(size) {
    }
    AssociativePattern(const std::vector<Expr> &ops, const std::vector<Expr> &ids, bool is_commutative)
        : ops(ops), identities(ids), is_commutative(is_commutative) {
    }
    AssociativePattern(Expr op, Expr id, bool is_commutative)
        : ops({std::move(op)}), identities({std::move(id)}), is_commutative(is_commutative) {
    }

    bool operator==(const AssociativePattern &other) const {
        if ((is_commutative != other.is_commutative) || (ops.size() != other.ops.size())) {
            return false;
        }
        for (size_t i = 0; i < size(); ++i) {
            if (!equal(ops[i], other.ops[i]) || !equal(identities[i], other.identities[i])) {
                return false;
            }
        }
        return true;
    }
    bool operator!=(const AssociativePattern &other) const {
        return !(*this == other);
    }
    size_t size() const {
        return ops.size();
    }
    bool commutative() const {
        return is_commutative;
    }
};

const std::vector<AssociativePattern> &get_ops_table(const std::vector<Expr> &exprs);

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_ASSOCIATIVITY_H
#define HALIDE_ASSOCIATIVITY_H

/** \file
 *
 * Methods for extracting an associative operator from a Func's update definition
 * if there is any and computing the identity of the associative operator.
 */
#include <string>
#include <vector>


namespace Halide {
namespace Internal {

/**
 * Represent the equivalent associative op of an update definition.
 * For example, the following associative Expr, min(f(x), g(r.x) + 2),
 * where f(x) is the self-recurrence term, is represented as:
 \code
 AssociativeOp assoc(
    AssociativePattern(min(x, y), +inf, true),
    {Replacement("x", f(x))},
    {Replacement("y", g(r.x) + 2)},
    true
 );
 \endcode
 *
 * 'pattern' contains the list of equivalent binary/unary operators (+ identities)
 * for each Tuple element in the update definition. 'pattern' also contains
 * a boolean that indicates if the op is also commutative. 'xs' and 'ys'
 * contain the corresponding definition of each variable in the list of
 * binary operators.
 *
 * For unary operator, 'xs' is not set, i.e. it will be a pair of empty string
 * and undefined Expr: {"", Expr()}. 'pattern' will only contain the 'y' term in
 * this case. For example, min(g(r.x), 4), will be represented as:
 \code
 AssociativeOp assoc(
    AssociativePattern(y, 0, false),
    {Replacement("", Expr())},
    {Replacement("y", min(g(r.x), 4))},
    true
 );
 \endcode
 *
 * Self-assignment, f(x) = f(x), will be represented as:
 \code
 AssociativeOp assoc(
    AssociativePattern(x, 0, true),
    {Replacement("x", f(x))},
    {Replacement("", Expr())},
    true
 );
 \endcode
 * For both unary operator and self-assignment cases, the identity does not
 * matter. It can be anything.
 */
struct AssociativeOp {
    struct Replacement {
        /** Variable name that is used to replace the expr in 'op'. */
        std::string var;
        Expr expr;

        Replacement() = default;
        Replacement(const std::string &var, Expr expr)
            : var(var), expr(std::move(expr)) {
        }

        bool operator==(const Replacement &other) const {
            return (var == other.var) && equal(expr, other.expr);
        }
        bool operator!=(const Replacement &other) const {
            return !(*this == other);
        }
    };

    /** List of pairs of binary associative op and its identity. */
    AssociativePattern pattern;
    std::vector<Replacement> xs;
    std::vector<Replacement> ys;
    bool is_associative = false;

    AssociativeOp() = default;
    AssociativeOp(size_t size)
        : pattern(size), xs(size), ys(size) {
    }
    AssociativeOp(const AssociativePattern &p, const std::vector<Replacement> &xs,
                  const std::vector<Replacement> &ys, bool is_associative)
        : pattern(p), xs(xs), ys(ys), is_associative(is_associative) {
    }

    bool associative() const {
        return is_associative;
    }
    bool commutative() const {
        return pattern.is_commutative;
    }
    size_t size() const {
        return pattern.size();
    }
};

/**
 * Given an update definition of a Func 'f', determine its equivalent
 * associative binary/unary operator if there is any. 'is_associative'
 * indicates if the operation was successfuly proven as associative.
 */
AssociativeOp prove_associativity(
    const std::string &f, std::vector<Expr> args, std::vector<Expr> exprs);

void associativity_test();

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_ASYNC_PRODUCERS_H
#define HALIDE_ASYNC_PRODUCERS_H

/** \file
 * Defines the lowering pass that injects task parallelism for producers that are scheduled as async.
 */
#include <map>
#include <string>


namespace Halide {
namespace Internal {

class Function;

Stmt fork_async_producers(Stmt s, const std::map<std::string, Function> &env);

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_INTERNAL_AUTO_SCHEDULE_UTILS_H
#define HALIDE_INTERNAL_AUTO_SCHEDULE_UTILS_H

/** \file
 *
 * Defines util functions that used by auto scheduler.
 */

#include <limits>
#include <set>

#ifndef HALIDE_IR_VISITOR_H
#define HALIDE_IR_VISITOR_H

#include <set>


/** \file
 * Defines the base class for things that recursively walk over the IR
 */

namespace Halide {
namespace Internal {

/** A base class for algorithms that need to recursively walk over the
 * IR. The default implementations just recursively walk over the
 * children. Override the ones you care about.
 */
class IRVisitor {
public:
    IRVisitor() = default;
    virtual ~IRVisitor() = default;

protected:
    // ExprNode<> and StmtNode<> are allowed to call visit (to implement accept())
    template<typename T>
    friend struct ExprNode;

    template<typename T>
    friend struct StmtNode;

    virtual void visit(const IntImm *);
    virtual void visit(const UIntImm *);
    virtual void visit(const FloatImm *);
    virtual void visit(const StringImm *);
    virtual void visit(const Cast *);
    virtual void visit(const Reinterpret *);
    virtual void visit(const Add *);
    virtual void visit(const Sub *);
    virtual void visit(const Mul *);
    virtual void visit(const Div *);
    virtual void visit(const Mod *);
    virtual void visit(const Min *);
    virtual void visit(const Max *);
    virtual void visit(const EQ *);
    virtual void visit(const NE *);
    virtual void visit(const LT *);
    virtual void visit(const LE *);
    virtual void visit(const GT *);
    virtual void visit(const GE *);
    virtual void visit(const And *);
    virtual void visit(const Or *);
    virtual void visit(const Not *);
    virtual void visit(const Select *);
    virtual void visit(const Load *);
    virtual void visit(const Ramp *);
    virtual void visit(const Broadcast *);
    virtual void visit(const Let *);
    virtual void visit(const LetStmt *);
    virtual void visit(const AssertStmt *);
    virtual void visit(const ProducerConsumer *);
    virtual void visit(const Store *);
    virtual void visit(const Provide *);
    virtual void visit(const Allocate *);
    virtual void visit(const Free *);
    virtual void visit(const Realize *);
    virtual void visit(const Block *);
    virtual void visit(const Fork *);
    virtual void visit(const IfThenElse *);
    virtual void visit(const Evaluate *);
    virtual void visit(const Call *);
    virtual void visit(const Variable *);
    virtual void visit(const For *);
    virtual void visit(const Acquire *);
    virtual void visit(const Shuffle *);
    virtual void visit(const Prefetch *);
    virtual void visit(const HoistedStorage *);
    virtual void visit(const Atomic *);
    virtual void visit(const VectorReduce *);
};

/** A lambda-based IR visitor that accepts multiple lambdas for different
 * node types. */
template<typename... Lambdas>
struct LambdaVisitor final : IRVisitor {
    explicit LambdaVisitor(Lambdas... lambdas)
        : handlers(std::move(lambdas)...) {
    }

    /** Public helper to call the base visitor from lambdas. */
    template<typename T>
    void visit_base(const T *op) {
        IRVisitor::visit(op);
    }

private:
    LambdaOverloads<Lambdas...> handlers;

    template<typename T>
    auto visit_impl(const T *op) {
        if constexpr (std::is_invocable_v<decltype(handlers), LambdaVisitor *, const T *>) {
            return handlers(this, op);
        } else {
            return this->visit_base(op);
        }
    }

protected:
    void visit(const IntImm *op) override {
        this->visit_impl(op);
    }
    void visit(const UIntImm *op) override {
        this->visit_impl(op);
    }
    void visit(const FloatImm *op) override {
        this->visit_impl(op);
    }
    void visit(const StringImm *op) override {
        this->visit_impl(op);
    }
    void visit(const Cast *op) override {
        this->visit_impl(op);
    }
    void visit(const Reinterpret *op) override {
        this->visit_impl(op);
    }
    void visit(const Add *op) override {
        this->visit_impl(op);
    }
    void visit(const Sub *op) override {
        this->visit_impl(op);
    }
    void visit(const Mul *op) override {
        this->visit_impl(op);
    }
    void visit(const Div *op) override {
        this->visit_impl(op);
    }
    void visit(const Mod *op) override {
        this->visit_impl(op);
    }
    void visit(const Min *op) override {
        this->visit_impl(op);
    }
    void visit(const Max *op) override {
        this->visit_impl(op);
    }
    void visit(const EQ *op) override {
        this->visit_impl(op);
    }
    void visit(const NE *op) override {
        this->visit_impl(op);
    }
    void visit(const LT *op) override {
        this->visit_impl(op);
    }
    void visit(const LE *op) override {
        this->visit_impl(op);
    }
    void visit(const GT *op) override {
        this->visit_impl(op);
    }
    void visit(const GE *op) override {
        this->visit_impl(op);
    }
    void visit(const And *op) override {
        this->visit_impl(op);
    }
    void visit(const Or *op) override {
        this->visit_impl(op);
    }
    void visit(const Not *op) override {
        this->visit_impl(op);
    }
    void visit(const Select *op) override {
        this->visit_impl(op);
    }
    void visit(const Load *op) override {
        this->visit_impl(op);
    }
    void visit(const Ramp *op) override {
        this->visit_impl(op);
    }
    void visit(const Broadcast *op) override {
        this->visit_impl(op);
    }
    void visit(const Let *op) override {
        this->visit_impl(op);
    }
    void visit(const LetStmt *op) override {
        this->visit_impl(op);
    }
    void visit(const AssertStmt *op) override {
        this->visit_impl(op);
    }
    void visit(const ProducerConsumer *op) override {
        this->visit_impl(op);
    }
    void visit(const Store *op) override {
        this->visit_impl(op);
    }
    void visit(const Provide *op) override {
        this->visit_impl(op);
    }
    void visit(const Allocate *op) override {
        this->visit_impl(op);
    }
    void visit(const Free *op) override {
        this->visit_impl(op);
    }
    void visit(const Realize *op) override {
        this->visit_impl(op);
    }
    void visit(const Block *op) override {
        this->visit_impl(op);
    }
    void visit(const Fork *op) override {
        this->visit_impl(op);
    }
    void visit(const IfThenElse *op) override {
        this->visit_impl(op);
    }
    void visit(const Evaluate *op) override {
        this->visit_impl(op);
    }
    void visit(const Call *op) override {
        this->visit_impl(op);
    }
    void visit(const Variable *op) override {
        this->visit_impl(op);
    }
    void visit(const For *op) override {
        this->visit_impl(op);
    }
    void visit(const Acquire *op) override {
        this->visit_impl(op);
    }
    void visit(const Shuffle *op) override {
        this->visit_impl(op);
    }
    void visit(const Prefetch *op) override {
        this->visit_impl(op);
    }
    void visit(const HoistedStorage *op) override {
        this->visit_impl(op);
    }
    void visit(const Atomic *op) override {
        this->visit_impl(op);
    }
    void visit(const VectorReduce *op) override {
        this->visit_impl(op);
    }
};

template<typename T, typename... Lambdas>
void visit_with(const T &ir, Lambdas &&...lambdas) {
    LambdaVisitor visitor{std::forward<Lambdas>(lambdas)...};
    ir.accept(&visitor);
}

/** A base class for algorithms that walk recursively over the IR
 * without visiting the same node twice. This is for passes that are
 * capable of interpreting the IR as a DAG instead of a tree. */
class IRGraphVisitor : public IRVisitor {
protected:
    /** By default these methods add the node to the visited set, and
     * return whether or not it was already there. If it wasn't there,
     * it delegates to the appropriate visit method. You can override
     * them if you like. */
    // @{
    virtual void include(const Expr &);
    virtual void include(const Stmt &);
    // @}

private:
    /** The nodes visited so far. Only includes nodes with a ref count greater
     * than one, because we know that nodes with a ref count of 1 will only be
     * visited once if their parents are only visited once. */
    std::set<const IRNode *> visited;

protected:
    /** These methods should call 'include' on the children to only
     * visit them if they haven't been visited already. */
    // @{
    void visit(const IntImm *) override;
    void visit(const UIntImm *) override;
    void visit(const FloatImm *) override;
    void visit(const StringImm *) override;
    void visit(const Cast *) override;
    void visit(const Reinterpret *) override;
    void visit(const Add *) override;
    void visit(const Sub *) override;
    void visit(const Mul *) override;
    void visit(const Div *) override;
    void visit(const Mod *) override;
    void visit(const Min *) override;
    void visit(const Max *) override;
    void visit(const EQ *) override;
    void visit(const NE *) override;
    void visit(const LT *) override;
    void visit(const LE *) override;
    void visit(const GT *) override;
    void visit(const GE *) override;
    void visit(const And *) override;
    void visit(const Or *) override;
    void visit(const Not *) override;
    void visit(const Select *) override;
    void visit(const Load *) override;
    void visit(const Ramp *) override;
    void visit(const Broadcast *) override;
    void visit(const Let *) override;
    void visit(const LetStmt *) override;
    void visit(const AssertStmt *) override;
    void visit(const ProducerConsumer *) override;
    void visit(const Store *) override;
    void visit(const Provide *) override;
    void visit(const Allocate *) override;
    void visit(const Free *) override;
    void visit(const Realize *) override;
    void visit(const Block *) override;
    void visit(const Fork *) override;
    void visit(const IfThenElse *) override;
    void visit(const Evaluate *) override;
    void visit(const Call *) override;
    void visit(const Variable *) override;
    void visit(const For *) override;
    void visit(const Acquire *) override;
    void visit(const Shuffle *) override;
    void visit(const Prefetch *) override;
    void visit(const HoistedStorage *) override;
    void visit(const Atomic *) override;
    void visit(const VectorReduce *) override;
    // @}
};

/** A visitor/mutator capable of passing arbitrary arguments to the
 * visit methods using CRTP and returning any types from them. All
 * Expr visitors must have the same signature, and all Stmt visitors
 * must have the same signature. Does not have default implementations
 * of the visit methods. */
template<typename T, typename ExprRet, typename StmtRet>
class VariadicVisitor {
private:
    template<typename... Args>
    ExprRet dispatch_expr(const BaseExprNode *node, Args &&...args) {
        if (node == nullptr) {
            return ExprRet{};
        }
        switch (node->node_type) {
        case IRNodeType::IntImm:
            return ((T *)this)->visit((const IntImm *)node, std::forward<Args>(args)...);
        case IRNodeType::UIntImm:
            return ((T *)this)->visit((const UIntImm *)node, std::forward<Args>(args)...);
        case IRNodeType::FloatImm:
            return ((T *)this)->visit((const FloatImm *)node, std::forward<Args>(args)...);
        case IRNodeType::StringImm:
            return ((T *)this)->visit((const StringImm *)node, std::forward<Args>(args)...);
        case IRNodeType::Broadcast:
            return ((T *)this)->visit((const Broadcast *)node, std::forward<Args>(args)...);
        case IRNodeType::Cast:
            return ((T *)this)->visit((const Cast *)node, std::forward<Args>(args)...);
        case IRNodeType::Reinterpret:
            return ((T *)this)->visit((const Reinterpret *)node, std::forward<Args>(args)...);
        case IRNodeType::Variable:
            return ((T *)this)->visit((const Variable *)node, std::forward<Args>(args)...);
        case IRNodeType::Add:
            return ((T *)this)->visit((const Add *)node, std::forward<Args>(args)...);
        case IRNodeType::Sub:
            return ((T *)this)->visit((const Sub *)node, std::forward<Args>(args)...);
        case IRNodeType::Mod:
            return ((T *)this)->visit((const Mod *)node, std::forward<Args>(args)...);
        case IRNodeType::Mul:
            return ((T *)this)->visit((const Mul *)node, std::forward<Args>(args)...);
        case IRNodeType::Div:
            return ((T *)this)->visit((const Div *)node, std::forward<Args>(args)...);
        case IRNodeType::Min:
            return ((T *)this)->visit((const Min *)node, std::forward<Args>(args)...);
        case IRNodeType::Max:
            return ((T *)this)->visit((const Max *)node, std::forward<Args>(args)...);
        case IRNodeType::EQ:
            return ((T *)this)->visit((const EQ *)node, std::forward<Args>(args)...);
        case IRNodeType::NE:
            return ((T *)this)->visit((const NE *)node, std::forward<Args>(args)...);
        case IRNodeType::LT:
            return ((T *)this)->visit((const LT *)node, std::forward<Args>(args)...);
        case IRNodeType::LE:
            return ((T *)this)->visit((const LE *)node, std::forward<Args>(args)...);
        case IRNodeType::GT:
            return ((T *)this)->visit((const GT *)node, std::forward<Args>(args)...);
        case IRNodeType::GE:
            return ((T *)this)->visit((const GE *)node, std::forward<Args>(args)...);
        case IRNodeType::And:
            return ((T *)this)->visit((const And *)node, std::forward<Args>(args)...);
        case IRNodeType::Or:
            return ((T *)this)->visit((const Or *)node, std::forward<Args>(args)...);
        case IRNodeType::Not:
            return ((T *)this)->visit((const Not *)node, std::forward<Args>(args)...);
        case IRNodeType::Select:
            return ((T *)this)->visit((const Select *)node, std::forward<Args>(args)...);
        case IRNodeType::Load:
            return ((T *)this)->visit((const Load *)node, std::forward<Args>(args)...);
        case IRNodeType::Ramp:
            return ((T *)this)->visit((const Ramp *)node, std::forward<Args>(args)...);
        case IRNodeType::Call:
            return ((T *)this)->visit((const Call *)node, std::forward<Args>(args)...);
        case IRNodeType::Let:
            return ((T *)this)->visit((const Let *)node, std::forward<Args>(args)...);
        case IRNodeType::Shuffle:
            return ((T *)this)->visit((const Shuffle *)node, std::forward<Args>(args)...);
        case IRNodeType::VectorReduce:
            return ((T *)this)->visit((const VectorReduce *)node, std::forward<Args>(args)...);
            // Explicitly list the Stmt types rather than using a
            // default case so that when new IR nodes are added we
            // don't miss them here.
        case IRNodeType::LetStmt:
        case IRNodeType::AssertStmt:
        case IRNodeType::ProducerConsumer:
        case IRNodeType::For:
        case IRNodeType::Acquire:
        case IRNodeType::Store:
        case IRNodeType::Provide:
        case IRNodeType::Allocate:
        case IRNodeType::Free:
        case IRNodeType::Realize:
        case IRNodeType::Block:
        case IRNodeType::Fork:
        case IRNodeType::IfThenElse:
        case IRNodeType::Evaluate:
        case IRNodeType::Prefetch:
        case IRNodeType::Atomic:
        case IRNodeType::HoistedStorage:
            internal_error << "Unreachable";
        }
        return ExprRet{};
    }

    template<typename... Args>
    StmtRet dispatch_stmt(const BaseStmtNode *node, Args &&...args) {
        if (node == nullptr) {
            return StmtRet{};
        }
        switch (node->node_type) {
        case IRNodeType::IntImm:
        case IRNodeType::UIntImm:
        case IRNodeType::FloatImm:
        case IRNodeType::StringImm:
        case IRNodeType::Broadcast:
        case IRNodeType::Cast:
        case IRNodeType::Reinterpret:
        case IRNodeType::Variable:
        case IRNodeType::Add:
        case IRNodeType::Sub:
        case IRNodeType::Mod:
        case IRNodeType::Mul:
        case IRNodeType::Div:
        case IRNodeType::Min:
        case IRNodeType::Max:
        case IRNodeType::EQ:
        case IRNodeType::NE:
        case IRNodeType::LT:
        case IRNodeType::LE:
        case IRNodeType::GT:
        case IRNodeType::GE:
        case IRNodeType::And:
        case IRNodeType::Or:
        case IRNodeType::Not:
        case IRNodeType::Select:
        case IRNodeType::Load:
        case IRNodeType::Ramp:
        case IRNodeType::Call:
        case IRNodeType::Let:
        case IRNodeType::Shuffle:
        case IRNodeType::VectorReduce:
            internal_error << "Unreachable";
            break;
        case IRNodeType::LetStmt:
            return ((T *)this)->visit((const LetStmt *)node, std::forward<Args>(args)...);
        case IRNodeType::AssertStmt:
            return ((T *)this)->visit((const AssertStmt *)node, std::forward<Args>(args)...);
        case IRNodeType::ProducerConsumer:
            return ((T *)this)->visit((const ProducerConsumer *)node, std::forward<Args>(args)...);
        case IRNodeType::For:
            return ((T *)this)->visit((const For *)node, std::forward<Args>(args)...);
        case IRNodeType::Acquire:
            return ((T *)this)->visit((const Acquire *)node, std::forward<Args>(args)...);
        case IRNodeType::Store:
            return ((T *)this)->visit((const Store *)node, std::forward<Args>(args)...);
        case IRNodeType::Provide:
            return ((T *)this)->visit((const Provide *)node, std::forward<Args>(args)...);
        case IRNodeType::Allocate:
            return ((T *)this)->visit((const Allocate *)node, std::forward<Args>(args)...);
        case IRNodeType::Free:
            return ((T *)this)->visit((const Free *)node, std::forward<Args>(args)...);
        case IRNodeType::Realize:
            return ((T *)this)->visit((const Realize *)node, std::forward<Args>(args)...);
        case IRNodeType::Block:
            return ((T *)this)->visit((const Block *)node, std::forward<Args>(args)...);
        case IRNodeType::Fork:
            return ((T *)this)->visit((const Fork *)node, std::forward<Args>(args)...);
        case IRNodeType::IfThenElse:
            return ((T *)this)->visit((const IfThenElse *)node, std::forward<Args>(args)...);
        case IRNodeType::Evaluate:
            return ((T *)this)->visit((const Evaluate *)node, std::forward<Args>(args)...);
        case IRNodeType::Prefetch:
            return ((T *)this)->visit((const Prefetch *)node, std::forward<Args>(args)...);
        case IRNodeType::Atomic:
            return ((T *)this)->visit((const Atomic *)node, std::forward<Args>(args)...);
        case IRNodeType::HoistedStorage:
            return ((T *)this)->visit((const HoistedStorage *)node, std::forward<Args>(args)...);
        }
        return StmtRet{};
    }

public:
    template<typename... Args>
    HALIDE_ALWAYS_INLINE StmtRet dispatch(const Stmt &s, Args &&...args) {
        return dispatch_stmt(s.get(), std::forward<Args>(args)...);
    }

    template<typename... Args>
    HALIDE_ALWAYS_INLINE StmtRet dispatch(Stmt &&s, Args &&...args) {
        return dispatch_stmt(s.get(), std::forward<Args>(args)...);
    }

    template<typename... Args>
    HALIDE_ALWAYS_INLINE ExprRet dispatch(const Expr &e, Args &&...args) {
        return dispatch_expr(e.get(), std::forward<Args>(args)...);
    }

    template<typename... Args>
    HALIDE_ALWAYS_INLINE ExprRet dispatch(Expr &&e, Args &&...args) {
        return dispatch_expr(e.get(), std::forward<Args>(args)...);
    }
};

}  // namespace Internal
}  // namespace Halide

#endif

namespace Halide {
namespace Internal {

typedef std::map<std::string, Interval> DimBounds;

const int64_t unknown = std::numeric_limits<int64_t>::min();

/** Visitor for keeping track of functions that are directly called and the
 * arguments with which they are called. */
class FindAllCalls : public IRVisitor {
    using IRVisitor::visit;

    void visit(const Call *call) override {
        if (call->call_type == Call::Halide || call->call_type == Call::Image) {
            funcs_called.insert(call->name);
            call_args.emplace_back(call->name, call->args);
        }
        for (const auto &arg : call->args) {
            arg.accept(this);
        }
    }

public:
    std::set<std::string> funcs_called;
    std::vector<std::pair<std::string, std::vector<Expr>>> call_args;
};

/** Return an int representation of 's'. Throw an error on failure. */
int string_to_int(const std::string &s);

/** Substitute every variable in an Expr or a Stmt with its estimate
 * if specified. */
//@{
Expr substitute_var_estimates(Expr e);
Stmt substitute_var_estimates(Stmt s);
//@}

/** Return the size of an interval. Return an undefined expr if the interval
 * is unbounded. */
Expr get_extent(const Interval &i);

/** Return the size of an n-d box. */
Expr box_size(const Box &b);

/** Helper function to print the bounds of a region. */
void disp_regions(const std::map<std::string, Box> &regions);

/** Return the corresponding definition of a function given the stage. This
 * will throw an assertion if the function is an extern function (Extern Func
 * does not have definition). */
Definition get_stage_definition(const Function &f, int stage_num);

/** Return the corresponding loop dimensions of a function given the stage.
 * For extern Func, this will return a list of size 1 containing the
 * dummy __outermost loop dimension. */
std::vector<Dim> &get_stage_dims(const Function &f, int stage_num);

/** Add partial load costs to the corresponding function in the result costs. */
void combine_load_costs(std::map<std::string, Expr> &result,
                        const std::map<std::string, Expr> &partial);

/** Return the required bounds of an intermediate stage (f, stage_num) of
 * function 'f' given the bounds of the pure dimensions. */
DimBounds get_stage_bounds(const Function &f, int stage_num, const DimBounds &pure_bounds);

/** Return the required bounds for all the stages of the function 'f'. Each entry
 * in the returned vector corresponds to a stage. */
std::vector<DimBounds> get_stage_bounds(const Function &f, const DimBounds &pure_bounds);

/** Recursively inline all the functions in the set 'inlines' into the
 * expression 'e' and return the resulting expression. If 'order' is
 * passed, inlining will be done in the reverse order of function realization
 * to avoid extra inlining works. */
Expr perform_inline(Expr e, const std::map<std::string, Function> &env,
                    const std::set<std::string> &inlines = std::set<std::string>(),
                    const std::vector<std::string> &order = std::vector<std::string>());

/** Return all functions that are directly called by a function stage (f, stage). */
std::set<std::string> get_parents(Function f, int stage);

/** Return value of element within a map. This will assert if the element is not
 * in the map. */
// @{
template<typename K, typename V>
V get_element(const std::map<K, V> &m, const K &key) {
    const auto &iter = m.find(key);
    internal_assert(iter != m.end());
    return iter->second;
}

template<typename K, typename V>
V &get_element(std::map<K, V> &m, const K &key) {
    const auto &iter = m.find(key);
    internal_assert(iter != m.end());
    return iter->second;
}
// @}

/** If the cost of computing a Func is about the same as calling the Func,
 * inline the Func. Return true of any of the Funcs is inlined. */
bool inline_all_trivial_functions(const std::vector<Function> &outputs,
                                  const std::vector<std::string> &order,
                                  const std::map<std::string, Function> &env);

/** Determine if a Func (order[index]) is only consumed by another single Func
 * in element-wise manner. If it is, return the name of the consumer Func;
 * otherwise, return an empty string. */
std::string is_func_called_element_wise(const std::vector<std::string> &order, size_t index,
                                        const std::map<std::string, Function> &env);

/** Inline a Func if its values are only consumed by another single Func in
 * element-wise manner. */
bool inline_all_element_wise_functions(const std::vector<Function> &outputs,
                                       const std::vector<std::string> &order,
                                       const std::map<std::string, Function> &env);

void propagate_estimate_test();

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_BOUND_CONSTANT_EXTENT_LOOPS_H
#define HALIDE_BOUND_CONSTANT_EXTENT_LOOPS_H

/** \file
 * Defines the lowering pass that enforces a constant extent on all
 * vectorized or unrolled loops.
 */


namespace Halide {
namespace Internal {

/** Replace all loop extents of unrolled or vectorized loops with constants, by
 * substituting and simplifying as needed. If we can't determine a constant
 * extent, but can determine a constant upper bound, inject an if statement into
 * the body. If we can't even determine a constant upper bound, throw a user
 * error. */
Stmt bound_constant_extent_loops(const Stmt &s);

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_BOUND_SMALL_ALLOCATIONS
#define HALIDE_BOUND_SMALL_ALLOCATIONS


/** \file
 * Defines the lowering pass that attempts to rewrite small
 * allocations to have constant size.
 */

namespace Halide {
namespace Internal {

/** \file
 *
 * Use bounds analysis to attempt to bound the sizes of small
 * allocations. Inside GPU kernels this is necessary in order to
 * compile. On the CPU this is also useful, because it prevents malloc
 * calls for (provably) tiny allocations. */
Stmt bound_small_allocations(const Stmt &s);

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_BOUNDARY_CONDITIONS_H
#define HALIDE_BOUNDARY_CONDITIONS_H

/** \file
 * Support for imposing boundary conditions on Halide::Funcs.
 */

#include <vector>

#ifndef HALIDE_LAMBDA_H
#define HALIDE_LAMBDA_H


/** \file
 * Convenience functions for creating small anonymous Halide
 * functions. See test/lambda.cpp for example usage. */

namespace Halide {

/** Create a zero-dimensional halide function that returns the given
 * expression. The function may have more dimensions if the expression
 * contains implicit arguments. */
Func lambda(const Expr &e);

/** Create a 1-D halide function in the first argument that returns
 * the second argument. The function may have more dimensions if the
 * expression contains implicit arguments and the list of Var
 * arguments contains a placeholder ("_"). */
Func lambda(const Var &x, const Expr &e);

/** Create a 2-D halide function in the first two arguments that
 * returns the last argument. The function may have more dimensions if
 * the expression contains implicit arguments and the list of Var
 * arguments contains a placeholder ("_"). */
Func lambda(const Var &x, const Var &y, const Expr &e);

/** Create a 3-D halide function in the first three arguments that
 * returns the last argument.  The function may have more dimensions
 * if the expression contains implicit arguments and the list of Var
 * arguments contains a placeholder ("_"). */
Func lambda(const Var &x, const Var &y, const Var &z, const Expr &e);

/** Create a 4-D halide function in the first four arguments that
 * returns the last argument. The function may have more dimensions if
 * the expression contains implicit arguments and the list of Var
 * arguments contains a placeholder ("_"). */
Func lambda(const Var &x, const Var &y, const Var &z, const Var &w, const Expr &e);

/** Create a 5-D halide function in the first five arguments that
 * returns the last argument. The function may have more dimensions if
 * the expression contains implicit arguments and the list of Var
 * arguments contains a placeholder ("_"). */
Func lambda(const Var &x, const Var &y, const Var &z, const Var &w, const Var &v, const Expr &e);

}  // namespace Halide

#endif  // HALIDE_LAMBDA_H

namespace Halide {

/** namespace to hold functions for imposing boundary conditions on
 *  Halide Funcs.
 *
 *  All functions in this namespace transform a source Func to a
 *  result Func where the result produces the values of the source
 *  within a given region and a different set of values outside the
 *  given region. A region is an N dimensional box specified by
 *  mins and extents.
 *
 *  Three areas are defined:
 *      The image is the entire set of values in the region.
 *      The edge is the set of pixels in the image but adjacent
 *          to coordinates that are not
 *      The interior is the image minus the edge (and is undefined
 *          if the extent of any region is 1 or less).
 *
 *  If the source Func has more dimensions than are specified, the extra ones
 *  are unmodified. Additionally, passing an undefined (default constructed)
 *  'Expr' for the min and extent of a dimension will keep that dimension
 *  unmodified.
 *
 *  Numerous options for specifing the outside area are provided,
 *  including replacement with an expression, repeating the edge
 *  samples, mirroring over the edge, and repeating or mirroring the
 *  entire image.
 *
 *  Using these functions to express your boundary conditions is highly
 *  recommended for correctness and performance. Some of these are hard
 *  to get right. The versions here are both understood by bounds
 *  inference, and also judiciously use the 'likely' intrinsic to minimize
 *  runtime overhead.
 *
 */
namespace BoundaryConditions {

namespace Internal {

inline HALIDE_NO_USER_CODE_INLINE void collect_region(Region &collected_args,
                                                      const Expr &a1, const Expr &a2) {
    collected_args.emplace_back(a1, a2);
}

template<typename... Args>
inline HALIDE_NO_USER_CODE_INLINE void collect_region(Region &collected_args,
                                                      const Expr &a1, const Expr &a2, Args &&...args) {
    collected_args.emplace_back(a1, a2);
    collect_region(collected_args, std::forward<Args>(args)...);
}

template<typename T>
Func func_like_to_func(T &&func_like) {
    if constexpr (std::is_same_v<std::decay_t<T>, Func>) {
        return std::forward<T>(func_like);
    } else {
        return lambda(_, func_like(_));
    }
}

}  // namespace Internal

/** Impose a boundary condition such that a given expression is returned
 *  everywhere outside the boundary. Generally the expression will be a
 *  constant, though the code currently allows accessing the arguments
 *  of source.
 *
 *  An ImageParam, Buffer<T>, or similar can be passed instead of a
 *  Func. If this is done and no bounds are given, the boundaries will
 *  be taken from the min and extent methods of the passed
 *  object. Note that objects are taken by mutable ref. Pipelines
 *  capture Buffers via mutable refs, because running a pipeline might
 *  alter the Buffer metadata (e.g. device allocation state).
 *
 *  (This is similar to setting GL_TEXTURE_WRAP_* to GL_CLAMP_TO_BORDER
 *   and putting value in the border of the texture.)
 *
 *  You may pass undefined Exprs for dimensions that you do not wish
 *  to bound.
 */
// @{
Func constant_exterior(const Func &source, const Tuple &value,
                       const Region &bounds);
Func constant_exterior(const Func &source, const Expr &value,
                       const Region &bounds);

template<typename T>
HALIDE_NO_USER_CODE_INLINE Func constant_exterior(const T &func_like, const Tuple &value, const Region &bounds) {
    return constant_exterior(Internal::func_like_to_func(func_like), value, bounds);
}

template<typename T>
HALIDE_NO_USER_CODE_INLINE Func constant_exterior(const T &func_like, const Expr &value, const Region &bounds) {
    return constant_exterior(Internal::func_like_to_func(func_like), value, bounds);
}

template<typename T>
HALIDE_NO_USER_CODE_INLINE Func constant_exterior(const T &func_like, const Tuple &value) {
    Region object_bounds;
    for (int i = 0; i < func_like.dimensions(); i++) {
        object_bounds.emplace_back(Expr(func_like.dim(i).min()), Expr(func_like.dim(i).extent()));
    }

    return constant_exterior(Internal::func_like_to_func(func_like), value, object_bounds);
}
template<typename T>
HALIDE_NO_USER_CODE_INLINE Func constant_exterior(const T &func_like, const Expr &value) {
    return constant_exterior(func_like, Tuple(value));
}

template<typename T, typename... Bounds,
         std::enable_if_t<Halide::Internal::all_are_convertible<Expr, Bounds...>::value> * = nullptr>
HALIDE_NO_USER_CODE_INLINE Func constant_exterior(const T &func_like, const Tuple &value,
                                                  Bounds &&...bounds) {
    Region collected_bounds;
    Internal::collect_region(collected_bounds, std::forward<Bounds>(bounds)...);
    return constant_exterior(Internal::func_like_to_func(func_like), value, collected_bounds);
}
template<typename T, typename... Bounds,
         std::enable_if_t<Halide::Internal::all_are_convertible<Expr, Bounds...>::value> * = nullptr>
HALIDE_NO_USER_CODE_INLINE Func constant_exterior(const T &func_like, const Expr &value,
                                                  Bounds &&...bounds) {
    return constant_exterior(func_like, Tuple(value), std::forward<Bounds>(bounds)...);
}
// @}

/** Impose a boundary condition such that the nearest edge sample is returned
 *  everywhere outside the given region.
 *
 *  An ImageParam, Buffer<T>, or similar can be passed instead of a Func. If this
 *  is done and no bounds are given, the boundaries will be taken from the
 *  min and extent methods of the passed object.
 *
 *  (This is similar to setting GL_TEXTURE_WRAP_* to GL_CLAMP_TO_EDGE.)
 *
 *  You may pass undefined Exprs for dimensions that you do not wish
 *  to bound.
 */
// @{
Func repeat_edge(const Func &source, const Region &bounds);

template<typename T>
HALIDE_NO_USER_CODE_INLINE Func repeat_edge(const T &func_like, const Region &bounds) {
    return repeat_edge(Internal::func_like_to_func(func_like), bounds);
}

template<typename T>
HALIDE_NO_USER_CODE_INLINE Func repeat_edge(const T &func_like) {
    Region object_bounds;
    for (int i = 0; i < func_like.dimensions(); i++) {
        object_bounds.emplace_back(Expr(func_like.dim(i).min()), Expr(func_like.dim(i).extent()));
    }

    return repeat_edge(Internal::func_like_to_func(func_like), object_bounds);
}
// @}

/** Impose a boundary condition such that the entire coordinate space is
 *  tiled with copies of the image abutted against each other.
 *
 *  An ImageParam, Buffer<T>, or similar can be passed instead of a Func. If this
 *  is done and no bounds are given, the boundaries will be taken from the
 *  min and extent methods of the passed object.
 *
 *  (This is similar to setting GL_TEXTURE_WRAP_* to GL_REPEAT.)
 *
 *  You may pass undefined Exprs for dimensions that you do not wish
 *  to bound.
 */
// @{
Func repeat_image(const Func &source, const Region &bounds);

template<typename T>
HALIDE_NO_USER_CODE_INLINE Func repeat_image(const T &func_like, const Region &bounds) {
    return repeat_image(Internal::func_like_to_func(func_like), bounds);
}

template<typename T>
HALIDE_NO_USER_CODE_INLINE Func repeat_image(const T &func_like) {
    Region object_bounds;
    for (int i = 0; i < func_like.dimensions(); i++) {
        object_bounds.emplace_back(Expr(func_like.dim(i).min()), Expr(func_like.dim(i).extent()));
    }

    return repeat_image(Internal::func_like_to_func(func_like), object_bounds);
}

/** Impose a boundary condition such that the entire coordinate space is
 *  tiled with copies of the image abutted against each other, but mirror
 *  them such that adjacent edges are the same.
 *
 *  An ImageParam, Buffer<T>, or similar can be passed instead of a Func. If this
 *  is done and no bounds are given, the boundaries will be taken from the
 *  min and extent methods of the passed object.
 *
 *  (This is similar to setting GL_TEXTURE_WRAP_* to GL_MIRRORED_REPEAT.)
 *
 *  You may pass undefined Exprs for dimensions that you do not wish
 *  to bound.
 */
// @{
Func mirror_image(const Func &source, const Region &bounds);

template<typename T>
HALIDE_NO_USER_CODE_INLINE Func mirror_image(const T &func_like, const Region &bounds) {
    return mirror_image(Internal::func_like_to_func(func_like), bounds);
}

template<typename T>
HALIDE_NO_USER_CODE_INLINE Func mirror_image(const T &func_like) {
    Region object_bounds;
    for (int i = 0; i < func_like.dimensions(); i++) {
        object_bounds.emplace_back(Expr(func_like.dim(i).min()), Expr(func_like.dim(i).extent()));
    }

    return mirror_image(Internal::func_like_to_func(func_like), object_bounds);
}

// @}

/** Impose a boundary condition such that the entire coordinate space is
 *  tiled with copies of the image abutted against each other, but mirror
 *  them such that adjacent edges are the same and then overlap the edges.
 *
 *  This produces an error if any extent is 1 or less. (TODO: check this.)
 *
 *  An ImageParam, Buffer<T>, or similar can be passed instead of a Func. If this
 *  is done and no bounds are given, the boundaries will be taken from the
 *  min and extent methods of the passed object.
 *
 *  (I do not believe there is a direct GL_TEXTURE_WRAP_* equivalent for this.)
 *
 *  You may pass undefined Exprs for dimensions that you do not wish
 *  to bound.
 */
// @{
Func mirror_interior(const Func &source, const Region &bounds);

template<typename T>
HALIDE_NO_USER_CODE_INLINE Func mirror_interior(const T &func_like, const Region &bounds) {
    return mirror_interior(Internal::func_like_to_func(func_like), bounds);
}

template<typename T>
HALIDE_NO_USER_CODE_INLINE Func mirror_interior(const T &func_like) {
    Region object_bounds;
    for (int i = 0; i < func_like.dimensions(); i++) {
        object_bounds.emplace_back(Expr(func_like.dim(i).min()), Expr(func_like.dim(i).extent()));
    }

    return mirror_interior(Internal::func_like_to_func(func_like), object_bounds);
}

// @}

}  // namespace BoundaryConditions

}  // namespace Halide

#endif
#ifndef HALIDE_BOUNDS_INFERENCE_H
#define HALIDE_BOUNDS_INFERENCE_H

/** \file
 * Defines the bounds_inference lowering pass.
 */

#include <map>
#include <string>
#include <vector>


namespace Halide {

struct Target;

namespace Internal {

class Function;

/** Take a partially lowered statement that includes symbolic
 * representations of the bounds over which things should be realized,
 * and inject expressions defining those bounds.
 */
Stmt bounds_inference(Stmt,
                      const std::vector<Function> &outputs,
                      const std::vector<std::string> &realization_order,
                      const std::vector<std::vector<std::string>> &fused_groups,
                      const std::map<std::string, Function> &environment,
                      const std::map<std::pair<std::string, int>, Interval> &func_bounds,
                      const Target &target);

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_CPLUSPLUS_MANGLE_H
#define HALIDE_CPLUSPLUS_MANGLE_H

/** \file
 *
 * A simple function to get a C++ mangled function name for a function.
 */
#include <string>
#include <vector>


namespace Halide {

struct ExternFuncArgument;
struct Target;

namespace Internal {

/** Return the mangled C++ name for a function.
 * The target parameter is used to decide on the C++
 * ABI/mangling style to use.
 */
std::string cplusplus_function_mangled_name(const std::string &name,
                                            const std::vector<std::string> &namespaces,
                                            Type return_type,
                                            const std::vector<ExternFuncArgument> &args,
                                            const Target &target);

void cplusplus_mangle_test();

}  // namespace Internal

}  // namespace Halide

#endif
#ifndef HALIDE_INTERNAL_CSE_H
#define HALIDE_INTERNAL_CSE_H

/** \file
 * Defines a pass for introducing let expressions to wrap common sub-expressions. */


namespace Halide {
namespace Internal {

/** Replace each common sub-expression in the argument with a
 * variable, and wrap the resulting expr in a let statement giving a
 * value to that variable.
 *
 * This is important to do within Halide (instead of punting to llvm),
 * because exprs that come in from the front-end are small when
 * considered as a graph, but combinatorially large when considered as
 * a tree. For an example of a such a case, see
 * test/code_explosion.cpp
 *
 * The last parameter determines whether all common subexpressions are
 * lifted, or only those that the simplifier would not subsitute back
 * in (e.g. addition of a constant).
 */
Expr common_subexpression_elimination(const Expr &, bool lift_all = false);

/** Do common-subexpression-elimination on each expression in a
 * statement. Does not introduce let statements. */
Stmt common_subexpression_elimination(const Stmt &, bool lift_all = false);

void cse_test();

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_CANONICALIZE_GPU_VARS_H
#define HALIDE_CANONICALIZE_GPU_VARS_H

/** \file
 * Defines the lowering pass that canonicalize the GPU var names over.
 */


namespace Halide {
namespace Internal {

/** Canonicalize GPU var names into some pre-determined block/thread names
 * (i.e. __block_id_x, __thread_id_x, etc.). The x/y/z/w order is determined
 * by the nesting order: innermost is assigned to x and so on. */
Stmt canonicalize_gpu_vars(Stmt s);

/** Names for the thread and block id variables. Includes the leading
 * dot. Indexed from inside out, so 0 gives you the innermost loop. */
// @{
const std::string &gpu_thread_name(int index);
const std::string &gpu_block_name(int index);
// @}

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_CLAMPUNSAFEACCESSES_H
#define HALIDE_CLAMPUNSAFEACCESSES_H

/** \file
 * Defines the clamp_unsafe_accesses lowering pass.
 */


namespace Halide::Internal {

/** Inject clamps around func calls h(...) when all the following conditions hold:
 * 1. The call is in an indexing context, such as: f(x) = g(h(x));
 * 2. The FuncValueBounds of h are smaller than those of its type
 * 3. The allocation bounds of h might be wider than its compute bounds.
 */
Stmt clamp_unsafe_accesses(const Stmt &s, const std::map<std::string, Function> &env, FuncValueBounds &func_bounds);

}  // namespace Halide::Internal

#endif  // HALIDE_CLAMPUNSAFEACCESSES_H
#ifndef HALIDE_CLOSURE_H
#define HALIDE_CLOSURE_H

/** \file
 *
 * Provides Closure class.
 */
#include <map>
#include <string>


namespace Halide {

namespace Internal {

/** A helper class to manage closures. Walks over a statement and
 * retrieves all the references within it to external symbols
 * (variables and allocations). It then helps you build a struct
 * containing the current values of these symbols that you can use as
 * a closure if you want to migrate the body of the statement to its
 * own function (e.g. because it's the body of a parallel for loop. */
class Closure : public IRVisitor {
protected:
    Scope<> ignore;

    using IRVisitor::visit;

    void visit(const Let *op) override;
    void visit(const LetStmt *op) override;
    void visit(const For *op) override;
    void visit(const Load *op) override;
    void visit(const Store *op) override;
    void visit(const Allocate *op) override;
    void visit(const Variable *op) override;
    void visit(const Atomic *op) override;

public:
    /** Information about a buffer reference from a closure. */
    struct Buffer {
        /** The type of the buffer referenced. */
        Type type;

        /** The dimensionality of the buffer. */
        uint8_t dimensions = 0;

        /** The buffer is read from. */
        bool read = false;

        /** The buffer is written to. */
        bool write = false;

        /** The buffer is a texture */
        MemoryType memory_type = MemoryType::Auto;

        /** The size of the buffer if known, otherwise zero. */
        size_t size = 0;

        Buffer() = default;
    };

protected:
    void found_buffer_ref(const std::string &name, Type type,
                          bool read, bool written, const Halide::Buffer<> &image);

public:
    Closure() = default;

    // Movable but not copyable.
    Closure(const Closure &) = delete;
    Closure &operator=(const Closure &) = delete;
    Closure(Closure &&) = default;
    Closure &operator=(Closure &&) = default;

    /** Traverse a statement and find all references to external
     * symbols.
     *
     * When the closure encounters a read or write to 'foo', it
     * assumes that the host pointer is found in the symbol table as
     * 'foo.host', and any halide_buffer_t pointer is found under
     * 'foo.buffer'.
     *
     * Calling this multiple times (on multiple statements) is legal
     * (and will produce a unified closure).
     **/
    void include(const Stmt &s, const std::string &loop_variable = "");

    /** External variables referenced. There's code that assumes iterating over
     * this repeatedly gives a consistent order, so don't swap out the data type
     * for something non-deterministic. */
    std::map<std::string, Type> vars;

    /** External allocations referenced. */
    std::map<std::string, Buffer> buffers;

    /** Pack a closure into a struct. */
    Expr pack_into_struct() const;

    /** Unpack a closure around a Stmt, putting all the names in scope. */
    Stmt unpack_from_struct(const Expr &, const Stmt &) const;
};

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_CODEGEN_C_H
#define HALIDE_CODEGEN_C_H

/** \file
 *
 * Defines an IRPrinter that emits C++ code equivalent to a halide stmt
 */

#ifndef HALIDE_IR_PRINTER_H
#define HALIDE_IR_PRINTER_H

/** \file
 * This header file defines operators that let you dump a Halide
 * expression, statement, or type directly into an output stream
 * in a human readable form.
 * E.g:
 \code
 Expr foo = ...
 std::cout << "Foo is " << foo << "\n";
 \endcode
 *
 * These operators are implemented using \ref Halide::Internal::IRPrinter
 */

#include <ostream>


namespace Halide {

/** Emit an expression on an output stream (such as std::cout) in
 * human-readable form */
std::ostream &operator<<(std::ostream &stream, const Expr &);

/** Emit a tuple on an output stream (such as std::cout) in
 * human-readable form */
std::ostream &operator<<(std::ostream &stream, const Tuple &);

/** Emit a halide type on an output stream (such as std::cout) in
 * human-readable form */
std::ostream &operator<<(std::ostream &stream, const Type &);

/** Emit a halide Module on an output stream (such as std::cout) in
 * human-readable form */
std::ostream &operator<<(std::ostream &stream, const Module &);

/** Emit a halide device api type in human-readable form */
std::ostream &operator<<(std::ostream &stream, const DeviceAPI &);

/** Emit a halide memory type in human-readable form */
std::ostream &operator<<(std::ostream &stream, const MemoryType &);

/** Emit a halide tail strategy in human-readable form */
std::ostream &operator<<(std::ostream &stream, const TailStrategy &);

/** Emit a halide loop partitioning policy in human-readable form */
std::ostream &operator<<(std::ostream &stream, const Partition &);

/** Emit a halide LoopLevel in human-readable form */
std::ostream &operator<<(std::ostream &stream, const LoopLevel &);

struct Target;
/** Emit a halide Target in a human readable form */
std::ostream &operator<<(std::ostream &stream, const Target &);

namespace Internal {

struct AssociativePattern;
struct AssociativeOp;
class Closure;
struct Interval;
struct ConstantInterval;
struct ModulusRemainder;
enum class IRNodeType;

/** Emit a halide node type on an output stream (such as std::cout) in
 * human-readable form */
std::ostream &operator<<(std::ostream &stream, IRNodeType);

/** Emit a halide associative pattern on an output stream (such as std::cout)
 * in a human-readable form */
std::ostream &operator<<(std::ostream &stream, const AssociativePattern &);

/** Emit a halide associative op on an output stream (such as std::cout) in a
 * human-readable form */
std::ostream &operator<<(std::ostream &stream, const AssociativeOp &);

/** Emit a halide statement on an output stream (such as std::cout) in
 * a human-readable form */
std::ostream &operator<<(std::ostream &stream, const Stmt &);

/** Emit a halide for loop type (vectorized, serial, etc) in a human
 * readable form */
std::ostream &operator<<(std::ostream &stream, const ForType &);

/** Emit a horizontal vector reduction op in human-readable form. */
std::ostream &operator<<(std::ostream &stream, const VectorReduce::Operator &);

/** Emit a halide name mangling value in a human readable format */
std::ostream &operator<<(std::ostream &stream, const NameMangling &);

/** Emit a halide LoweredFunc in a human readable format */
std::ostream &operator<<(std::ostream &stream, const LoweredFunc &);

/** Emit a halide linkage value in a human readable format */
std::ostream &operator<<(std::ostream &stream, const LinkageType &);

/** Emit a halide dimension type in human-readable format */
std::ostream &operator<<(std::ostream &stream, const DimType &);

/** Emit a Closure in human-readable form */
std::ostream &operator<<(std::ostream &out, const Closure &c);

/** Emit an Interval in human-readable form */
std::ostream &operator<<(std::ostream &out, const Interval &c);

/** Emit a ConstantInterval in human-readable form */
std::ostream &operator<<(std::ostream &out, const ConstantInterval &c);

/** Emit a ModulusRemainder in human-readable form */
std::ostream &operator<<(std::ostream &out, const ModulusRemainder &c);

struct Indentation {
    int indent;
};
std::ostream &operator<<(std::ostream &stream, const Indentation &);

template<typename T>
struct Ansi {
    const T &cnt;
    const char *open, *close;
};

template<typename T>
std::ostream &operator<<(std::ostream &out, const Ansi<T> &a) {
    if (a.open) {
        out << a.open;
    }
    out << a.cnt;
    if (a.close) {
        out << a.close;
    }
    return out;
}

/** An IRVisitor that emits IR to the given output stream in a human
 * readable form. Can be subclassed if you want to modify the way in
 * which it prints.
 */
class IRPrinter : public IRVisitor {
public:
    /** Construct an IRPrinter pointed at a given output stream
     * (e.g. std::cout, or a std::ofstream) */
    explicit IRPrinter(std::ostream &);

    /** emit an expression on the output stream */
    void print(const Expr &);

    /** Emit an expression on the output stream without enclosing parens */
    void print_no_parens(const Expr &);

    /** emit a statement on the output stream */
    void print(const Stmt &);

    /** emit a statement summary on the output stream */
    void print_summary(const Stmt &);

    /** emit a comma delimited list of exprs, without any leading or
     * trailing punctuation. */
    void print_list(const std::vector<Expr> &exprs);

    static void test();

protected:
    Indentation get_indent() const {
        return Indentation{indent};
    }

    /** The stream on which we're outputting */
    std::ostream &stream;

    /** The current indentation level, useful for pretty-printing
     * statements */
    int indent = 0;

    /** Certain expressions do not need parens around them, e.g. the
     * args to a call are already separated by commas and a
     * surrounding set of parens. */
    bool implicit_parens = false;

    /** Print only a summary of a statement, with sub-statements replaced by
     * ellipses (...). */
    bool is_summary = false;

    bool ansi = false;
    int paren_depth = 0;

    const char *ansi_hl = "";
    const char *ansi_dim = "";
    const char *ansi_kw = "";
    const char *ansi_imm_int = "";
    const char *ansi_imm_float = "";
    const char *ansi_imm_str = "";
    const char *ansi_var = "";
    const char *ansi_buf = "";
    const char *ansi_fn = "";
    const char *ansi_type = "";
    const char *ansi_reset_col = "";
    const char *ansi_reset = "";

    // clang-format off
    template<typename T> Ansi<T> hl(const T &t);
    template<typename T> Ansi<T> kw(const T &t);
    template<typename T> Ansi<T> imm_int(const T &t);
    template<typename T> Ansi<T> imm_float(const T &t);
    template<typename T> Ansi<T> imm_str(const T &t);
    template<typename T> Ansi<T> var(const T &t);
    template<typename T> Ansi<T> buf(const T &t);
    template<typename T> Ansi<T> fn(const T &t);
    template<typename T> Ansi<T> type(const T &t);
    template<typename T> Ansi<T> typep(const T &t);
    template<typename T> Ansi<T> paren(const T &t, bool bold = true, int d = -1);
    // clang-format on

    /** Either emits "(" or "", depending on the value of implicit_parens */
    void open();

    /** Either emits ")" or "", depending on the value of implicit_parens */
    void close();

    /** Emits "(" always */
    void openf();

    /** Emits "name(" always */
    void openf(const char *name);

    /** Emits ")" always */
    void closef();

    /** The symbols whose types can be inferred from values printed
     * already. */
    Scope<> known_type;

    /** A helper for printing a chain of lets with line breaks */
    void print_lets(const Let *let);

    /** A helper for printing a braced statement */
    void print_braced_stmt(const Stmt &, int extra_indent = 2);

    void visit(const IntImm *) override;
    void visit(const UIntImm *) override;
    void visit(const FloatImm *) override;
    void visit(const StringImm *) override;
    void visit(const Cast *) override;
    void visit(const Reinterpret *) override;
    void visit(const Variable *) override;
    void visit(const Add *) override;
    void visit(const Sub *) override;
    void visit(const Mul *) override;
    void visit(const Div *) override;
    void visit(const Mod *) override;
    void visit(const Min *) override;
    void visit(const Max *) override;
    void visit(const EQ *) override;
    void visit(const NE *) override;
    void visit(const LT *) override;
    void visit(const LE *) override;
    void visit(const GT *) override;
    void visit(const GE *) override;
    void visit(const And *) override;
    void visit(const Or *) override;
    void visit(const Not *) override;
    void visit(const Select *) override;
    void visit(const Load *) override;
    void visit(const Ramp *) override;
    void visit(const Broadcast *) override;
    void visit(const Call *) override;
    void visit(const Let *) override;
    void visit(const LetStmt *) override;
    void visit(const AssertStmt *) override;
    void visit(const ProducerConsumer *) override;
    void visit(const For *) override;
    void visit(const Acquire *) override;
    void visit(const Store *) override;
    void visit(const Provide *) override;
    void visit(const Allocate *) override;
    void visit(const Free *) override;
    void visit(const Realize *) override;
    void visit(const Block *) override;
    void visit(const Fork *) override;
    void visit(const IfThenElse *) override;
    void visit(const Evaluate *) override;
    void visit(const Shuffle *) override;
    void visit(const VectorReduce *) override;
    void visit(const Prefetch *) override;
    void visit(const Atomic *) override;
    void visit(const HoistedStorage *) override;
};

/** Debugging helpers for LLDB */
/// @{
std::string lldb_string(const Expr &);
std::string lldb_string(const Internal::BaseExprNode *);
std::string lldb_string(const Stmt &);
/// @}

}  // namespace Internal
}  // namespace Halide

#endif

#include <unordered_map>

namespace Halide {

struct Argument;
class Module;

namespace Internal {

struct LoweredFunc;

/** This class emits C++ code equivalent to a halide Stmt. It's
 * mostly the same as an IRPrinter, but it's wrapped in a function
 * definition, and some things are handled differently to be valid
 * C++.
 */
class CodeGen_C : public IRPrinter {
public:
    enum OutputKind {
        CHeader,
        CPlusPlusHeader,
        CImplementation,
        CPlusPlusImplementation,
        CExternDecl,
        CPlusPlusExternDecl,
        CPlusPlusFunctionInfoHeader,
    };

    /** Initialize a C code generator pointing at a particular output
     * stream (e.g. a file, or std::cout) */
    CodeGen_C(std::ostream &dest,
              const Target &target,
              OutputKind output_kind = CImplementation,
              const std::string &include_guard = "");
    ~CodeGen_C() override;

    /** Emit the declarations contained in the module as C code. */
    void compile(const Module &module);

    /** The target we're generating code for */
    const Target &get_target() const {
        return target;
    }

    static void test();

protected:
    enum class IntegerSuffixStyle {
        PlainC = 0,
        OpenCL = 1,
        HLSL = 2
    };

    /** How to emit 64-bit integer constants */
    IntegerSuffixStyle integer_suffix_style = IntegerSuffixStyle::PlainC;

    /** Emit a declaration. */
    // @{
    void compile(const LoweredFunc &func, const MetadataNameMap &metadata_name_map);
    void compile(const Buffer<> &buffer);
    // @}

    /** This is a hook that subclasses can use to transform a function body
     * just before it is emitted -- e.g., to transform the IR to code that
     * is easier to recognize and emit. The default implementation simply
     * returns the input unchanged.
     *
     * This hook will always be called after the function declaration and
     * opening brace is emitted, so in addition to (possibly) returning
     * a modified Stmt, this function may also emit C++ code to the default
     * stream if it wishes to add some prologue at the start of the function.
     */
    virtual Stmt preprocess_function_body(const Stmt &stmt);

    /** An ID for the most recently generated ssa variable */
    std::string id;

    /** The target being generated for. */
    Target target;

    /** Controls whether this instance is generating declarations or
     * definitions and whether the interface us extern "C" or C++. */
    OutputKind output_kind;

    /** A cache of generated values in scope */
    std::map<std::string, std::string> cache;

    /** Emit an expression as an assignment, then return the id of the
     * resulting var */
    std::string print_expr(const Expr &);

    /** Like print_expr, but cast the Expr to the given Type */
    std::string print_cast_expr(const Type &, const Expr &);

    /** Emit a statement */
    void print_stmt(const Stmt &);

    void create_assertion(const std::string &id_cond, const Expr &message);
    void create_assertion(const Expr &cond, const Expr &message);

    Expr scalarize_vector_reduce(const VectorReduce *op);
    enum AppendSpaceIfNeeded {
        DoNotAppendSpace,
        AppendSpace,
    };

    /** Emit the C name for a halide type. If space_option is AppendSpace,
     *  and there should be a space between the type and the next token,
     *  one is appended. (This allows both "int foo" and "Foo *foo" to be
     *  formatted correctly. Otherwise the latter is "Foo * foo".)
     */
    virtual std::string print_type(Type, AppendSpaceIfNeeded space_option = DoNotAppendSpace);

    /** Emit a statement to reinterpret an expression as another type */
    virtual std::string print_reinterpret(Type, const Expr &);

    /** Emit a version of a string that is a valid identifier in C (. is replaced with _) */
    virtual std::string print_name(const std::string &);

    /** Add platform specific prologue */
    virtual void add_platform_prologue();

    /** Add typedefs for vector types. Not needed for OpenCL, might
     * use different syntax for other C-like languages. */
    virtual void add_vector_typedefs(const std::set<Type> &vector_types);

    std::unordered_map<std::string, std::string> extern_function_name_map;

    /** Bottleneck to allow customization of calls to generic Extern/PureExtern calls.  */
    virtual std::string print_extern_call(const Call *op);

    /** Convert a vector Expr into a series of scalar Exprs, then reassemble into vector of original type.  */
    std::string print_scalarized_expr(const Expr &e);

    /** Emit an SSA-style assignment, and set id to the freshly generated name. Return id. */
    virtual std::string print_assignment(Type t, const std::string &rhs);

    /** Emit free for the heap allocation. **/
    void print_heap_free(const std::string &alloc_name);

    /** Return true if only generating an interface, which may be extern "C" or C++ */
    bool is_header() {
        return output_kind == CHeader ||
               output_kind == CPlusPlusHeader ||
               output_kind == CPlusPlusFunctionInfoHeader;
    }

    /** Return true if only generating an interface, which may be extern "C" or C++ */
    bool is_extern_decl() {
        return output_kind == CExternDecl ||
               output_kind == CPlusPlusExternDecl;
    }

    /** Return true if only generating an interface, which may be extern "C" or C++ */
    bool is_header_or_extern_decl() {
        return is_header() || is_extern_decl();
    }

    /** Return true if generating C++ linkage. */
    bool is_c_plus_plus_interface() {
        return output_kind == CPlusPlusHeader ||
               output_kind == CPlusPlusImplementation ||
               output_kind == CPlusPlusExternDecl ||
               output_kind == CPlusPlusFunctionInfoHeader;
    }

    /** Open a new C scope (i.e. throw in a brace, increase the indent) */
    void open_scope();

    /** Close a C scope (i.e. throw in an end brace, decrease the indent) */
    void close_scope(const std::string &comment);

    struct Allocation {
        Type type;
    };

    /** Track the types of allocations to avoid unnecessary casts. */
    Scope<Allocation> allocations;

    /** Track which allocations actually went on the heap. */
    Scope<> heap_allocations;

    /** True if there is a void * __user_context parameter in the arguments. */
    bool have_user_context;

    /** Track current calling convention scope. */
    bool extern_c_open = false;

    /** True if at least one gpu-based for loop is used. */
    bool uses_gpu_for_loops;

    /** Track which handle types have been forward-declared already. */
    std::set<const halide_handle_cplusplus_type *> forward_declared;

    /** If the Type is a handle type, emit a forward-declaration for it
     * if we haven't already. */
    void forward_declare_type_if_needed(const Type &t);

    void set_name_mangling_mode(NameMangling mode);

    using IRPrinter::visit;

    void visit(const Variable *) override;
    void visit(const IntImm *) override;
    void visit(const UIntImm *) override;
    void visit(const StringImm *) override;
    void visit(const FloatImm *) override;
    void visit(const Cast *) override;
    void visit(const Reinterpret *) override;
    void visit(const Add *) override;
    void visit(const Sub *) override;
    void visit(const Mul *) override;
    void visit(const Div *) override;
    void visit(const Mod *) override;
    void visit(const Max *) override;
    void visit(const Min *) override;
    void visit(const EQ *) override;
    void visit(const NE *) override;
    void visit(const LT *) override;
    void visit(const LE *) override;
    void visit(const GT *) override;
    void visit(const GE *) override;
    void visit(const And *) override;
    void visit(const Or *) override;
    void visit(const Not *) override;
    void visit(const Call *) override;
    void visit(const Select *) override;
    void visit(const Load *) override;
    void visit(const Store *) override;
    void visit(const Let *) override;
    void visit(const LetStmt *) override;
    void visit(const AssertStmt *) override;
    void visit(const ProducerConsumer *) override;
    void visit(const For *) override;
    void visit(const Ramp *) override;
    void visit(const Broadcast *) override;
    void visit(const Provide *) override;
    void visit(const Allocate *) override;
    void visit(const Free *) override;
    void visit(const Realize *) override;
    void visit(const IfThenElse *) override;
    void visit(const Evaluate *) override;
    void visit(const Shuffle *) override;
    void visit(const Prefetch *) override;
    void visit(const Fork *) override;
    void visit(const Acquire *) override;
    void visit(const Atomic *) override;
    void visit(const VectorReduce *) override;

    void visit_binop(Type t, const Expr &a, const Expr &b, const char *op);
    void visit_relop(Type t, const Expr &a, const Expr &b, const char *scalar_op, const char *vector_op);

    template<typename T>
    static std::string with_sep(const std::vector<T> &v, const std::string &sep) {
        std::ostringstream o;
        for (size_t i = 0; i < v.size(); ++i) {
            if (i > 0) {
                o << sep;
            }
            o << v[i];
        }
        return o.str();
    }

    template<typename T>
    static std::string with_commas(const std::vector<T> &v) {
        return with_sep<T>(v, ", ");
    }

    /** Are we inside an atomic node that uses mutex locks?
        This is used for detecting deadlocks from nested atomics. */
    bool inside_atomic_mutex_node = false;

    /** Emit atomic store instructions? */
    bool emit_atomic_stores = false;

    /** true if add_vector_typedefs() has been called. */
    bool using_vector_typedefs = false;

    /** Some architectures have private memory for the call stack; this
     * means a thread cannot hand pointers to stack memory to another
     * thread. Returning true here flag forces heap allocation of
     * things that might be shared, such as closures and any buffer
     * that may be used in a parallel context. */
    virtual bool is_stack_private_to_thread() const;

    void emit_argv_wrapper(const std::string &function_name,
                           const std::vector<LoweredArgument> &args);
    void emit_metadata_getter(const std::string &function_name,
                              const std::vector<LoweredArgument> &args,
                              const MetadataNameMap &metadata_name_map);
    void emit_constexpr_function_info(const std::string &function_name,
                                      const std::vector<LoweredArgument> &args,
                                      const MetadataNameMap &metadata_name_map);
    void emit_halide_free_helper(const std::string &alloc_name, const std::string &free_function);
};

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_CODEGEN_D3D12_COMPUTE_DEV_H
#define HALIDE_CODEGEN_D3D12_COMPUTE_DEV_H

/** \file
 * Defines the code-generator for producing D3D12-compatible HLSL kernel code
 */

#include <memory>

namespace Halide {

struct Target;

namespace Internal {

struct CodeGen_GPU_Dev;

std::unique_ptr<CodeGen_GPU_Dev> new_CodeGen_D3D12Compute_Dev(const Target &target);

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_CODEGEN_GPU_DEV_H
#define HALIDE_CODEGEN_GPU_DEV_H

/** \file
 * Defines the code-generator interface for producing GPU device code
 */
#include <string>
#include <vector>

#ifndef HALIDE_DEVICE_ARGUMENT_H
#define HALIDE_DEVICE_ARGUMENT_H

/** \file
 * Defines helpers for passing arguments to separate devices, such as GPUs.
 */
#include <string>


namespace Halide {
namespace Internal {

/** A DeviceArgument looks similar to an Halide::Argument, but has behavioral
 * differences that make it specific to the GPU pipeline; the fact that
 * neither is-a nor has-a Halide::Argument is deliberate. In particular, note
 * that a Halide::Argument that is a buffer can be read or write, but not both,
 * while a DeviceArgument that is a buffer can be read *and* write for some GPU
 * backends. */
struct DeviceArgument {
    /** The name of the argument */
    std::string name;

    /** An argument is either a primitive type (for parameters), or a
     * buffer pointer.
     *
     * If is_buffer == false, then type fully encodes the expected type
     * of the scalar argument.
     *
     * If is_buffer == true, then type.bytes() should be used to determine
     * elem_size of the buffer; additionally, type.code *should* reflect
     * the expected interpretation of the buffer data (e.g. float vs int),
     * but there is no runtime enforcement of this at present.
     */
    bool is_buffer = false;

    /** If is_buffer == true and memory_type == GPUTexture, this argument should be
     * passed and accessed through texture sampler operations instead of
     * directly as a memory array
     */
    MemoryType memory_type = MemoryType::Auto;

    /** If is_buffer is true, this is the dimensionality of the buffer.
     * If is_buffer is false, this value is ignored (and should always be set to zero) */
    uint8_t dimensions = 0;

    /** If this is a scalar parameter, then this is its type.
     *
     * If this is a buffer parameter, this is used to determine elem_size
     * of the halide_buffer_t.
     *
     * Note that type.lanes() should always be 1 here. */
    Type type;

    /** The static size of the argument if known, or zero otherwise. */
    size_t size = 0;

    /** The index of the first element of the argument when packed into a wider
     * type, such as packing scalar floats into vec4 for GLSL. */
    size_t packed_index = 0;

    /** For buffers, these two variables can be used to specify whether the
     * buffer is read or written. By default, we assume that the argument
     * buffer is read-write and set both flags. */
    bool read = false;
    bool write = false;

    /** Alignment information for integer parameters. */
    ModulusRemainder alignment;

    DeviceArgument() = default;

    DeviceArgument(const std::string &_name,
                   bool _is_buffer,
                   MemoryType _mem,
                   Type _type,
                   uint8_t _dimensions,
                   size_t _size = 0)
        : name(_name),
          is_buffer(_is_buffer),
          memory_type(_mem),
          dimensions(_dimensions),
          type(_type),
          size(_size),

          read(_is_buffer),
          write(_is_buffer) {
    }
};

/** A Closure modified to inspect GPU-specific memory accesses, and
 * produce a vector of DeviceArgument objects. */
class HostClosure : public Closure {
public:
    HostClosure() = default;

    /** Get a description of the captured arguments. */
    std::vector<DeviceArgument> arguments();

protected:
    using Internal::Closure::visit;

    void visit(const For *loop) override;
    void visit(const Call *op) override;
};

}  // namespace Internal
}  // namespace Halide

#endif

namespace Halide {
namespace Internal {

/** A code generator that emits GPU code from a given Halide stmt. */
struct CodeGen_GPU_Dev {
    virtual ~CodeGen_GPU_Dev();

    /** Compile a GPU kernel into the module. This may be called many times
     * with different kernels, which will all be accumulated into a single
     * source module shared by a given Halide pipeline. */
    virtual void add_kernel(Stmt stmt,
                            const std::string &name,
                            const std::vector<DeviceArgument> &args) = 0;

    /** (Re)initialize the GPU kernel module. This is separate from compile,
     * since a GPU device module will often have many kernels compiled into it
     * for a single pipeline. */
    virtual void init_module() = 0;

    virtual std::vector<char> compile_to_src() = 0;

    virtual std::string get_current_kernel_name() = 0;

    virtual void dump() = 0;

    /** This routine returns the GPU API name that is combined into
     *  runtime routine names to ensure each GPU API has a unique
     *  name.
     */
    virtual std::string api_unique_name() = 0;

    /** Returns the specified name transformed by the variable naming rules
     * for the GPU language backend. Used to determine the name of a parameter
     * during host codegen. */
    virtual std::string print_gpu_name(const std::string &name) = 0;

    /** Allows the GPU device specific code to request halide_type_t
     * values to be passed to the kernel_run routine rather than just
     * argument type sizes.
     */
    virtual bool kernel_run_takes_types() const {
        return false;
    }

    /** Checks if expr is block uniform, i.e. does not depend on a thread
     * var. */
    static bool is_block_uniform(const Expr &expr);
    /** Checks if the buffer is a candidate for constant storage. Most
     * GPUs (APIs) support a constant memory storage class that cannot be
     * written to and performs well for block uniform accesses. A buffer is a
     * candidate for constant storage if it is never written to, and loads are
     * uniform within the workgroup. */
    static bool is_buffer_constant(const Stmt &kernel, const std::string &buffer);

    /** Modifies predicated loads and stores to be non-predicated, since most
     * GPU backends do not support predication. */
    static Stmt scalarize_predicated_loads_stores(Stmt &s);

    /** An mask describing which type of memory fence to use for the gpu_thread_barrier()
     * intrinsic.  Not all GPUs APIs support all types.
     */
    enum MemoryFenceType {
        None = 0,    // No fence required (just a sync)
        Device = 1,  // Device/global memory fence
        Shared = 2   // Threadgroup/shared memory fence
    };
};

/** A base class for GPU backends that require C-like shader output.
 * GPU backends derive from and specialize this class. */
class CodeGen_GPU_C : public CodeGen_C {
public:
    /** OpenCL and WGSL use different syntax than C for immediate vectors. This
    enum defines which style should be used by the backend. */
    enum class VectorDeclarationStyle {
        CLikeSyntax = 0,
        OpenCLSyntax = 1,
        WGSLSyntax = 2,
    };

    CodeGen_GPU_C(std::ostream &s, Target t)
        : CodeGen_C(s, t) {
    }

protected:
    using CodeGen_C::visit;
    void visit(const Shuffle *op) override;
    void visit(const Call *op) override;

    std::string print_extern_call(const Call *op) override;

    VectorDeclarationStyle vector_declaration_style = VectorDeclarationStyle::CLikeSyntax;
    bool abs_returns_unsigned_type{false};
};

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_CODEGEN_INTERNAL_H
#define HALIDE_CODEGEN_INTERNAL_H

/** \file
 *
 * Defines functionality that's useful to multiple target-specific
 * CodeGen paths, but shouldn't live in CodeGen_LLVM.h (because that's the
 * front-end-facing interface to CodeGen).
 */

#include <memory>
#include <string>


namespace llvm {
class ConstantFolder;
class ElementCount;
class Function;
class IRBuilderDefaultInserter;
class LLVMContext;
class Module;
class StructType;
class TargetMachine;
class TargetOptions;
class Type;
class Value;
template<typename, typename>
class IRBuilder;
}  // namespace llvm

namespace Halide {

struct Target;

namespace Internal {

/** Get the scalar type of an llvm vector type. Returns the argument
 * if it's not a vector type. */
llvm::Type *get_vector_element_type(llvm::Type *);

/** Which built-in functions require a user-context first argument? */
bool function_takes_user_context(const std::string &name);

/** Given a size (in bytes), return True if the allocation size can fit
 * on the stack; otherwise, return False. This routine asserts if size is
 * non-positive. */
bool can_allocation_fit_on_stack(int64_t size);

/** Does a {div/mod}_round_to_zero using binary long division for int/uint.
 *  max_abs is the maximum absolute value of (a/b).
 *  Returns the pair {div_round_to_zero, mod_round_to_zero}. */
std::pair<Expr, Expr> long_div_mod_round_to_zero(const Expr &a, const Expr &b,
                                                 std::optional<uint64_t> max_abs = std::nullopt);

/** Given a Halide Euclidean division/mod operation, do constant optimizations
 * and possibly call lower_euclidean_div/lower_euclidean_mod if necessary.
 * Can introduce mulhi_shr and sorted_avg intrinsics as well as those from the
 * lower_euclidean_ operation -- div_round_to_zero or mod_round_to_zero. */
///@{
Expr lower_int_uint_div(const Expr &a, const Expr &b, bool round_to_zero = false);
Expr lower_int_uint_mod(const Expr &a, const Expr &b);
///@}

/** Given a Halide Euclidean division/mod operation, define it in terms of
 * div_round_to_zero or mod_round_to_zero. */
///@{
Expr lower_euclidean_div(Expr a, Expr b);
Expr lower_euclidean_mod(Expr a, Expr b);
///@}

/** Given a Halide shift operation with a signed shift amount (may be negative), define
 * an equivalent expression using only shifts by unsigned amounts. */
///@{
Expr lower_signed_shift_left(const Expr &a, const Expr &b);
Expr lower_signed_shift_right(const Expr &a, const Expr &b);
///@}

/** Reduce a mux intrinsic to a select tree */
Expr lower_mux(const Call *mux);

/** Reduce bit extraction and concatenation to bit ops */
///@{
Expr lower_extract_bits(const Call *c);
Expr lower_concat_bits(const Call *c);
///@}

/** An vectorizable implementation of Halide::round that doesn't depend on any
 * standard library being present. */
Expr lower_round_to_nearest_ties_to_even(const Expr &);

/** Given an llvm::Module, set llvm:TargetOptions information */
void get_target_options(const llvm::Module &module, llvm::TargetOptions &options);

/** Given two llvm::Modules, clone target options from one to the other */
void clone_target_options(const llvm::Module &from, llvm::Module &to);

/** Given an llvm::Module, get or create an llvm:TargetMachine */
std::unique_ptr<llvm::TargetMachine> make_target_machine(const llvm::Module &module);

/** Set the appropriate llvm Function attributes given the Halide Target. */
void set_function_attributes_from_halide_target_options(llvm::Function &);

/** Save a copy of the llvm IR currently represented by the module as
 * data in the __LLVM,__bitcode section. Emulates clang's
 * -fembed-bitcode flag and is useful to satisfy Apple's bitcode
 * inclusion requirements.  */
void embed_bitcode(llvm::Module *M, const std::string &halide_command);

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_CODEGEN_LLVM_H
#define HALIDE_CODEGEN_LLVM_H

/** \file
 *
 * Defines the base-class for all architecture-specific code
 * generators that use llvm.
 */

namespace llvm {
class Value;
class Module;
class Function;
class FunctionType;
class IRBuilderDefaultInserter;
class ConstantFolder;
template<typename, typename>
class IRBuilder;
class LLVMContext;
class Type;
class PointerType;
class StructType;
class Instruction;
class CallInst;
class ExecutionEngine;
class AllocaInst;
class Constant;
class Triple;
class MDNode;
class NamedMDNode;
class DataLayout;
class BasicBlock;
class GlobalVariable;
}  // namespace llvm

#include <map>
#include <memory>
#include <optional>
#include <string>
#include <variant>
#include <vector>


namespace Halide {

struct ExternSignature;

namespace Internal {

/** A code generator abstract base class. Actual code generators
 * (e.g. CodeGen_X86) inherit from this. This class is responsible
 * for taking a Halide Stmt and producing llvm bitcode, machine
 * code in an object file, or machine code accessible through a
 * function pointer.
 */
class CodeGen_LLVM : public IRVisitor {
public:
    /** Create an instance of CodeGen_LLVM suitable for the target. */
    static std::unique_ptr<CodeGen_LLVM> new_for_target(const Target &target, llvm::LLVMContext &context);

    /** Takes a halide Module and compiles it to an llvm Module. */
    virtual std::unique_ptr<llvm::Module> compile(const Module &module);

    /** The target we're generating code for */
    const Target &get_target() const {
        return target;
    }

    /** Tell the code generator which LLVM context to use. */
    void set_context(llvm::LLVMContext &context);

    /** Initialize internal llvm state for the enabled targets. */
    static void initialize_llvm();

    static std::unique_ptr<llvm::Module> compile_trampolines(
        const Target &target,
        llvm::LLVMContext &context,
        const std::string &suffix,
        const std::vector<std::pair<std::string, ExternSignature>> &externs);

    size_t get_requested_alloca_total() const {
        return requested_alloca_total;
    }

protected:
    CodeGen_LLVM(const Target &t);

    /** Compile a specific halide declaration into the llvm Module. */
    // @{
    virtual void compile_func(const LoweredFunc &func, const std::string &simple_name, const std::string &extern_name);
    virtual void compile_buffer(const Buffer<> &buffer);
    // @}

    /** Helper functions for compiling Halide functions to llvm
     * functions. begin_func performs all the work necessary to begin
     * generating code for a function with a given argument list with
     * the IRBuilder. A call to begin_func should be a followed by a
     * call to end_func with the same arguments, to generate the
     * appropriate cleanup code. */
    // @{
    virtual void begin_func(LinkageType linkage, const std::string &simple_name,
                            const std::string &extern_name, const std::vector<LoweredArgument> &args);
    virtual void end_func(const std::vector<LoweredArgument> &args);
    // @}

    /** What should be passed as -mcpu (warning: implies attrs!), -mattrs,
     *  and related for compilation. The architecture-specific code generator
     *  should define these.
     *
     *  `mcpu_target()` - target this specific CPU, in the sense of the allowed
     *  ISA sets *and* the CPU-specific tuning/assembly instruction scheduling.
     *
     *  `mcpu_tune()` - expect that we will be running on this specific CPU,
     *  so perform CPU-specific tuning/assembly instruction scheduling, *but*
     *  DON'T sacrifice the portability, support running on other CPUs, only
     *  make use of the ISAs that are enabled by `mcpu_target()`+`mattrs()`.
     */
    // @{
    virtual std::string mcpu_target() const = 0;
    virtual std::string mcpu_tune() const = 0;
    virtual std::string mattrs() const = 0;
    virtual std::string mabi() const;
    virtual bool use_soft_float_abi() const = 0;
    virtual bool use_pic() const;
    // @}

    /** Should indexing math be promoted to 64-bit on platforms with
     * 64-bit pointers? */
    virtual bool promote_indices() const {
        return true;
    }

    /** What's the natural vector bit-width to use for loads, stores, etc. */
    virtual int native_vector_bits() const = 0;

    /** Used to decide whether to break a vector up into multiple smaller
     * operations. This is the largest size the architecture supports. */
    virtual int maximum_vector_bits() const {
        return native_vector_bits();
    }
    /** For architectures that have vscale vectors, return the constant vscale to use.
     * Default of 0 means do not use vscale vectors. Generally will depend on
     * the target flags and vector_bits settings.
     */
    virtual int target_vscale() const {
        return 0;
    }

    /** Return the type in which arithmetic should be done for the
     * given storage type. */
    virtual Type upgrade_type_for_arithmetic(const Type &) const;

    /** Return the type that a given Halide type should be
     * stored/loaded from memory as. */
    virtual Type upgrade_type_for_storage(const Type &) const;

    /** Return the type that a Halide type should be passed in and out
     * of functions as. */
    virtual Type upgrade_type_for_argument_passing(const Type &) const;

    std::unique_ptr<llvm::Module> module;
    llvm::Function *function = nullptr;
    llvm::LLVMContext *context = nullptr;
    std::unique_ptr<llvm::IRBuilder<llvm::ConstantFolder, llvm::IRBuilderDefaultInserter>> builder;
    llvm::Value *value = nullptr;
    llvm::MDNode *very_likely_branch = nullptr;
    llvm::MDNode *fast_fp_math_md = nullptr;
    llvm::MDNode *strict_fp_math_md = nullptr;
    std::vector<LoweredArgument> current_function_args;

    bool in_strict_float = false;
    bool any_strict_float = false;

    /** Change floating-point math op emission to use fast flags. */
    void set_fast_fp_math();

    /** Change floating-point math op emission to use strict flags. */
    void set_strict_fp_math();

    /** If any_strict_float is true, sets fast math flags for the lifetime of
     * this object, then sets them to strict on destruction. If any_strict_float
     * is false, does nothing.  Any call to an IRBuilder method that starts with
     * "CreateF" should probably be wrapped in one of these, but it's safe to
     * miss one - we just miss out on some optimizations. In this way codegen is
     * designed to fail safe. */
    struct ScopedFastMath {
        CodeGen_LLVM *codegen;
        ScopedFastMath(CodeGen_LLVM *);
        ~ScopedFastMath();
    };

    /** The target we're generating code for */
    Halide::Target target;

    /** Grab all the context specific internal state. */
    virtual void init_context();
    /** Initialize the CodeGen_LLVM internal state to compile a fresh
     * module. This allows reuse of one CodeGen_LLVM object to compiled
     * multiple related modules (e.g. multiple device kernels). */
    virtual void init_module();

    /** Run all of llvm's optimization passes on the module. */
    void optimize_module();

    /** Add an entry to the symbol table, hiding previous entries with
     * the same name. Call this when new values come into scope. */
    void sym_push(const std::string &name, llvm::Value *value);

    /** Remove an entry for the symbol table, revealing any previous
     * entries with the same name. Call this when values go out of
     * scope. */
    void sym_pop(const std::string &name);

    /** Fetch an entry from the symbol table. If the symbol is not
     * found, it either errors out (if the second arg is true), or
     * returns nullptr. */
    llvm::Value *sym_get(const std::string &name,
                         bool must_succeed = true) const;

    /** Test if an item exists in the symbol table. */
    bool sym_exists(const std::string &name) const;

    /** Given a Halide ExternSignature, return the equivalent llvm::FunctionType. */
    llvm::FunctionType *signature_to_type(const ExternSignature &signature);

    /** Some useful llvm types */
    // @{
    llvm::Type *void_t = nullptr, *i1_t = nullptr, *i8_t = nullptr, *i16_t = nullptr, *i32_t = nullptr, *i64_t = nullptr, *f16_t = nullptr, *f32_t = nullptr, *f64_t = nullptr;
    llvm::PointerType *ptr_t = nullptr;
    llvm::StructType *halide_buffer_t_type = nullptr,
                     *type_t_type,
                     *dimension_t_type,
                     *metadata_t_type = nullptr,
                     *argument_t_type = nullptr,
                     *scalar_value_t_type = nullptr,
                     *device_interface_t_type = nullptr,
                     *pseudostack_slot_t_type = nullptr,
                     *semaphore_t_type;

    // @}

    /** Some wildcard variables used for peephole optimizations in
     * subclasses */
    // @{
    Expr wild_u1x_, wild_i8x_, wild_u8x_, wild_i16x_, wild_u16x_;
    Expr wild_i32x_, wild_u32x_, wild_i64x_, wild_u64x_;
    Expr wild_f32x_, wild_f64x_;

    // Wildcards for scalars.
    Expr wild_u1_, wild_i8_, wild_u8_, wild_i16_, wild_u16_;
    Expr wild_i32_, wild_u32_, wild_i64_, wild_u64_;
    Expr wild_f32_, wild_f64_;
    // @}

    /** Emit code that evaluates an expression, and return the llvm
     * representation of the result of the expression. */
    llvm::Value *codegen(const Expr &);

    /** Emit code that runs a statement. */
    void codegen(const Stmt &);

    /** Codegen a vector Expr by codegenning each lane and combining. */
    void scalarize(const Expr &);

    /** Some destructors should always be called. Others should only
     * be called if the pipeline is exiting with an error code. */
    enum DestructorType { Always,
                          OnError,
                          OnSuccess };

    /* Call this at the location of object creation to register how an
     * object should be destroyed. This does three things:
     * 1) Emits code here that puts the object in a unique
     * null-initialized stack slot
     * 2) Adds an instruction to the destructor block that calls the
     * destructor on that stack slot if it's not null.
     * 3) Returns that stack slot, so you can neuter the destructor
     * (by storing null to the stack slot) or destroy the object early
     * (by calling trigger_destructor).
     */
    llvm::Value *register_destructor(llvm::Function *destructor_fn, llvm::Value *obj, DestructorType when);

    /** Call a destructor early. Pass in the value returned by register destructor. */
    void trigger_destructor(llvm::Function *destructor_fn, llvm::Value *stack_slot);

    /** Retrieves the block containing the error handling
     * code. Creates it if it doesn't already exist for this
     * function. */
    llvm::BasicBlock *get_destructor_block();

    /** Codegen an assertion. If false, returns the error code (if not
     * null), or evaluates and returns the message, which must be an
     * Int(32) expression. */
    // @{
    void create_assertion(llvm::Value *condition, const Expr &message, llvm::Value *error_code = nullptr);
    // @}

    /** Codegen a block of asserts with pure conditions */
    void codegen_asserts(const std::vector<const AssertStmt *> &asserts);

    /** Return the the pipeline with the given error code. Will run
     * the destructor block. */
    void return_with_error_code(llvm::Value *error_code);

    /** Put a string constant in the module as a global variable and return a pointer to it. */
    llvm::Constant *create_string_constant(const std::string &str);

    /** Put a binary blob in the module as a global variable and return a pointer to it. */
    llvm::Constant *create_binary_blob(const std::vector<char> &data, const std::string &name, bool constant = true);

    /** Widen an llvm scalar into an llvm vector with the given number of lanes. */
    llvm::Value *create_broadcast(llvm::Value *, int lanes);

    /** Generate a pointer into a named buffer at a given index, of a
     * given type. The index counts according to the scalar type of
     * the type passed in. */
    // @{
    llvm::Value *codegen_buffer_pointer(const std::string &buffer, Type type, llvm::Value *index);
    llvm::Value *codegen_buffer_pointer(const std::string &buffer, Type type, Expr index);
    llvm::Value *codegen_buffer_pointer(llvm::Value *base_address, Type type, Expr index);
    llvm::Value *codegen_buffer_pointer(llvm::Value *base_address, Type type, llvm::Value *index);
    // @}

    /** Return type string for LLVM type using LLVM IR intrinsic type mangling.
     * E.g. ".i32 or ".f32" for scalars, ".p0" for pointers,
     * ".nxv4i32" for a scalable vector of four 32-bit integers,
     * or ".v4f32" for a fixed vector of four 32-bit floats.
     * The dot is included in the result.
     */
    std::string mangle_llvm_type(llvm::Type *type);

    /** Turn a Halide Type into an llvm::Value representing a constant halide_type_t */
    llvm::Value *make_halide_type_t(const Type &);

    /** Mark a load or store with type-based-alias-analysis metadata
     * so that llvm knows it can reorder loads and stores across
     * different buffers */
    void add_tbaa_metadata(llvm::Instruction *inst, std::string buffer, const Expr &index);

    /** Get a unique name for the actual block of memory that an
     * allocate node uses. Used so that alias analysis understands
     * when multiple Allocate nodes shared the same memory. */
    virtual std::string get_allocation_name(const std::string &n) {
        return n;
    }

    /** Add the appropriate function attribute to tell LLVM that the function
     * doesn't access memory. */
    void function_does_not_access_memory(llvm::Function *fn);

    using IRVisitor::visit;

    /** Generate code for various IR nodes. These can be overridden by
     * architecture-specific code to perform peephole
     * optimizations. The result of each is stored in \ref value */
    // @{
    void visit(const IntImm *) override;
    void visit(const UIntImm *) override;
    void visit(const FloatImm *) override;
    void visit(const StringImm *) override;
    void visit(const Cast *) override;
    void visit(const Reinterpret *) override;
    void visit(const Variable *) override;
    void visit(const Add *) override;
    void visit(const Sub *) override;
    void visit(const Mul *) override;
    void visit(const Div *) override;
    void visit(const Mod *) override;
    void visit(const Min *) override;
    void visit(const Max *) override;
    void visit(const EQ *) override;
    void visit(const NE *) override;
    void visit(const LT *) override;
    void visit(const LE *) override;
    void visit(const GT *) override;
    void visit(const GE *) override;
    void visit(const And *) override;
    void visit(const Or *) override;
    void visit(const Not *) override;
    void visit(const Select *) override;
    void visit(const Load *) override;
    void visit(const Ramp *) override;
    void visit(const Broadcast *) override;
    void visit(const Call *) override;
    void visit(const Let *) override;
    void visit(const LetStmt *) override;
    void visit(const AssertStmt *) override;
    void visit(const ProducerConsumer *) override;
    void visit(const For *) override;
    void visit(const Store *) override;
    void visit(const Block *) override;
    void visit(const IfThenElse *) override;
    void visit(const Evaluate *) override;
    void visit(const Shuffle *) override;
    void visit(const VectorReduce *) override;
    void visit(const Prefetch *) override;
    void visit(const Atomic *) override;
    // @}

    /** Generate code for an allocate node. It has no default
     * implementation - it must be handled in an architecture-specific
     * way. */
    void visit(const Allocate *) override = 0;

    /** Generate code for a free node. It has no default
     * implementation and must be handled in an architecture-specific
     * way. */
    void visit(const Free *) override = 0;

    /** These IR nodes should have been removed during
     * lowering. CodeGen_LLVM will error out if they are present */
    // @{
    void visit(const Provide *) override;
    void visit(const Realize *) override;
    // @}

    /** Get the llvm type equivalent to the given halide type in the
     * current context. */
    virtual llvm::Type *llvm_type_of(const Type &) const;

    /** Get the llvm type equivalent to a given halide type. If
     * effective_vscale is nonzero and the type is a vector type with lanes
     * a multiple of effective_vscale, a scalable vector type is generated
     * with total lanes divided by effective_vscale. That is a scalable
     * vector intended to be used with a fixed vscale of effective_vscale.
     */
    llvm::Type *llvm_type_of(llvm::LLVMContext *context, Halide::Type t,
                             int effective_vscale) const;

    /** Perform an alloca at the function entrypoint. Will be cleaned
     * on function exit. */
    llvm::Value *create_alloca_at_entry(llvm::Type *type, int n,
                                        bool zero_initialize = false,
                                        const std::string &name = "");

    /** A (very) conservative guess at the size of all alloca() storage requested
     * (including alignment padding). It's currently meant only to be used as
     * a very coarse way to ensure there is enough stack space when testing
     * on the WebAssembly backend.
     *
     * It is *not* meant to be a useful proxy for "stack space needed", for a
     * number of reasons:
     * - allocas with non-overlapping lifetimes will share space
     * - on some backends, LLVM may promote register-sized allocas into registers
     * - while this accounts for alloca() calls we know about, it doesn't attempt
     *   to account for stack spills, function call overhead, etc.
     */
    size_t requested_alloca_total = 0;

    /** The user_context argument. May be a constant null if the
     * function is being compiled without a user context. */
    llvm::Value *get_user_context() const;

    /** Implementation of the intrinsic call to
     * interleave_vectors. This implementation allows for interleaving
     * an arbitrary number of vectors.*/
    virtual llvm::Value *interleave_vectors(const std::vector<llvm::Value *> &);

    /** Description of an intrinsic function overload. Overloads are resolved
     * using both argument and return types. The scalar types of the arguments
     * and return type must match exactly for an overload resolution to succeed. */
    struct Intrinsic {
        Type result_type;
        std::vector<Type> arg_types;
        llvm::Function *impl;

        Intrinsic(Type result_type, std::vector<Type> arg_types, llvm::Function *impl)
            : result_type(result_type), arg_types(std::move(arg_types)), impl(impl) {
        }
    };
    /** Mapping of intrinsic functions to the various overloads implementing it. */
    std::map<std::string, std::vector<Intrinsic>> intrinsics;

    /** Get an LLVM intrinsic declaration. If it doesn't exist, it will be created. */
    llvm::Function *get_llvm_intrin(const Type &ret_type, const std::string &name, const std::vector<Type> &arg_types, bool scalars_are_vectors = false);
    llvm::Function *get_llvm_intrin(llvm::Type *ret_type, const std::string &name, const std::vector<llvm::Type *> &arg_types);
    /** Declare an intrinsic function that participates in overload resolution. */
    llvm::Function *declare_intrin_overload(const std::string &name, const Type &ret_type, const std::string &impl_name, std::vector<Type> arg_types, bool scalars_are_vectors = false);
    void declare_intrin_overload(const std::string &name, const Type &ret_type, llvm::Function *impl, std::vector<Type> arg_types);
    /** Call an overloaded intrinsic function. Returns nullptr if no suitable overload is found. */
    llvm::Value *call_overloaded_intrin(const Type &result_type, const std::string &name, const std::vector<Expr> &args);

    /** Generate a call to a vector intrinsic or runtime inlined
     * function. The arguments are sliced up into vectors of the width
     * given by 'intrin_lanes', the intrinsic is called on each
     * piece, then the results (if any) are concatenated back together
     * into the original type 't'. For the version that takes an
     * llvm::Type *, the type may be void, so the vector width of the
     * arguments must be specified explicitly as
     * 'called_lanes'. */
    // @{
    llvm::Value *call_intrin(const Type &t, int intrin_lanes,
                             const std::string &name, std::vector<Expr>);
    llvm::Value *call_intrin(const Type &t, int intrin_lanes,
                             llvm::Function *intrin, std::vector<Expr>);
    llvm::Value *call_intrin(const llvm::Type *t, int intrin_lanes,
                             const std::string &name, std::vector<llvm::Value *>,
                             bool scalable_vector_result = false, bool is_reduction = false);
    llvm::Value *call_intrin(const llvm::Type *t, int intrin_lanes,
                             llvm::Function *intrin, std::vector<llvm::Value *>,
                             bool is_reduction = false);
    // @}

    /** Take a slice of lanes out of an llvm vector. Pads with undefs
     * if you ask for more lanes than the vector has. */
    virtual llvm::Value *slice_vector(llvm::Value *vec, int start, int extent);

    /** Concatenate a bunch of llvm vectors. Must be of the same type. */
    virtual llvm::Value *concat_vectors(const std::vector<llvm::Value *> &);

    /** Reverse elements in a vector */
    llvm::Value *reverse_vector(llvm::Value *vec);

    /** Create an LLVM shuffle vectors instruction. Takes a combination of
     * fixed or scalable vectors as input, so long as the effective lengths match,
     * but always returns a fixed vector. */
    virtual llvm::Value *shuffle_vectors(llvm::Value *a, llvm::Value *b,
                                         const std::vector<int> &indices);
    /** Shorthand for shuffling a single vector. */
    llvm::Value *shuffle_vectors(llvm::Value *v, const std::vector<int> &indices);

    /** Go looking for a vector version of a runtime function. Will
     * return the best match. Matches in the following order:
     *
     * 1) The requested vector width.
     *
     * 2) The width which is the smallest power of two
     * greater than or equal to the vector width.
     *
     * 3) All the factors of 2) greater than one, in decreasing order.
     *
     * 4) The smallest power of two not yet tried.
     *
     * So for a 5-wide vector, it tries: 5, 8, 4, 2, 16.
     *
     * If there's no match, returns (nullptr, 0).
     */
    std::pair<llvm::Function *, int> find_vector_runtime_function(const std::string &name, int lanes);

    virtual bool supports_atomic_add(const Type &t) const;

    /** Compile a horizontal reduction that starts with an explicit
     * initial value. There are lots of complex ways to peephole
     * optimize this pattern, especially with the proliferation of
     * dot-product instructions, and they can usefully share logic
     * across backends. */
    virtual void codegen_vector_reduce(const VectorReduce *op, const Expr &init);

    /** Are we inside an atomic node that uses mutex locks?
        This is used for detecting deadlocks from nested atomics & illegal vectorization. */
    bool inside_atomic_mutex_node = false;

    /** Emit atomic store instructions? */
    bool emit_atomic_stores = false;

    /** Can we call this operation with float16 type?
        This is used to avoid "emulated" equivalent code-gen in case target has FP16 feature **/
    virtual bool supports_call_as_float16(const Call *op) const;

    /** call_intrin does far too much to be useful and generally breaks things
     * when one has carefully set things up for a specific architecture. This
     * just does the bare minimum. call_intrin should be refactored and could
     * call this, possibly with renaming of the methods. */
    llvm::Value *simple_call_intrin(const std::string &intrin,
                                    const std::vector<llvm::Value *> &args,
                                    llvm::Type *result_type);

    /** Ensure that a vector value is either fixed or vscale depending to match desired_type.
     */
    llvm::Value *normalize_fixed_scalable_vector_type(llvm::Type *desired_type, llvm::Value *result);

    /** Convert between two LLVM vectors of potentially different scalable/fixed and size.
     * Used to handle converting to/from fixed vectors that are smaller than the minimum
     * size scalable vector. */
    llvm::Value *convert_fixed_or_scalable_vector_type(llvm::Value *arg,
                                                       llvm::Type *desired_type);

    /** Convert an LLVM fixed vector value to the corresponding vscale vector value. */
    llvm::Value *fixed_to_scalable_vector_type(llvm::Value *fixed);

    /** Convert an LLVM vscale vector value to the corresponding fixed vector value. */
    llvm::Value *scalable_to_fixed_vector_type(llvm::Value *scalable);

    /** Get number of vector elements, taking into account scalable vectors. Returns 1 for scalars. */
    int get_vector_num_elements(const llvm::Type *t);

    /** Interface to abstract vector code generation as LLVM is now
     * providing multiple options to express even simple vector
     * operations. Specifically traditional fixed length vectors, vscale
     * based variable length vectors, and the vector predicate based approach
     * where an explict length is passed with each instruction.
     */
    // @{
    enum class VectorTypeConstraint {
        None,    /// Use default for current target.
        Fixed,   /// Force use of fixed size vectors.
        VScale,  /// For use of scalable vectors.
    };
    llvm::Type *get_vector_type(llvm::Type *, int n,
                                VectorTypeConstraint type_constraint = VectorTypeConstraint::None) const;
    // @}

    llvm::Constant *get_splat(int lanes, llvm::Constant *value,
                              VectorTypeConstraint type_constraint = VectorTypeConstraint::None) const;

    /** Make sure a value type has the same scalable/fixed vector type as a guide. */
    // @{
    llvm::Value *match_vector_type_scalable(llvm::Value *value, VectorTypeConstraint constraint);
    llvm::Value *match_vector_type_scalable(llvm::Value *value, llvm::Type *guide);
    llvm::Value *match_vector_type_scalable(llvm::Value *value, llvm::Value *guide);
    // @}

    /** Support for generating LLVM vector predication intrinsics
     * ("@llvm.vp.*" and "@llvm.experimental.vp.*")
     */
    // @{
    /** Struct to hold descriptor for an argument to a vector
     *  predicated intrinsic. This includes the value, whether the
     *  type of the argument should be mangled into the intrisic name
     *  and if so, where, and the alignment for pointer arguments. */
    struct VPArg {
        llvm::Value *value;
        // If provided, put argument's type into the intrinsic name via LLVM IR type mangling.
        std::optional<size_t> mangle_index;
        int alignment;
        VPArg(llvm::Value *value, std::optional<size_t> mangle_index = std::nullopt, int32_t alignment = 0)
            : value(value), mangle_index(mangle_index), alignment(alignment) {
        }
    };

    /** Type indicating an intrinsic does not take a mask. */
    struct NoMask {
    };

    /** Type indicating mask to use is all true -- all lanes enabled. */
    struct AllEnabledMask {
    };

    /** Predication mask using the above two types for special cases
     *   and an llvm::Value for the general one. */
    using MaskVariant = std::variant<NoMask, AllEnabledMask, llvm::Value *>;

    /** Generate a vector predicated comparison intrinsic call if
     * use_llvm_vp_intrinsics is true and result_type is a vector
     * type. If generated, assigns result of vp intrinsic to value and
     * returns true if it an instuction is generated, otherwise
     * returns false. */
    bool try_vector_predication_comparison(const std::string &name, const Type &result_type,
                                           MaskVariant mask, llvm::Value *a, llvm::Value *b,
                                           const char *cmp_op);

    struct VPResultType {
        llvm::Type *type;
        std::optional<size_t> mangle_index;
        VPResultType(llvm::Type *type, std::optional<size_t> mangle_index = std::nullopt)
            : type(type), mangle_index(mangle_index) {
        }
    };

    /** Generate an intrisic call if use_llvm_vp_intrinsics is true
     * and length is greater than 1. If generated, assigns result
     * of vp intrinsic to value and returns true if it an instuction
     * is generated, otherwise returns false. */
    bool try_vector_predication_intrinsic(const std::string &name, VPResultType result_type,
                                          int32_t length, MaskVariant mask, std::vector<VPArg> args);

    /** Controls use of vector predicated intrinsics for vector operations.
     * Will be set by certain backends (e.g. RISC V) to control codegen. */
    bool use_llvm_vp_intrinsics = false;
    // @}

    /** Generate a basic dense vector load, with an optional predicate and
     * control over whether or not we should slice the load into native
     * vectors. Used by CodeGen_ARM to help with vld2/3/4 emission. */
    llvm::Value *codegen_dense_vector_load(const Load *load, llvm::Value *vpred = nullptr, bool slice_to_native = true);

    /** Warning messages which we want to avoid displaying number of times */
    enum class WarningKind {
        EmulatedFloat16,
    };
    std::map<WarningKind, std::string> onetime_warnings;

private:
    /** All the values in scope at the current code location during
     * codegen. Use sym_push and sym_pop to access. */
    Scope<llvm::Value *> symbol_table;

    /** String constants already emitted to the module. Tracked to
     * prevent emitting the same string many times. */
    std::map<std::string, llvm::Constant *> string_constants;

    /** A basic block to branch to on error that triggers all
     * destructors. As destructors are registered, code gets added
     * to this block. */
    llvm::BasicBlock *destructor_block = nullptr;

    /** Turn off all unsafe math flags in scopes while this is set. */
    bool strict_float;

    /** Use the LLVM large code model when this is set. */
    bool llvm_large_code_model;

    /** Cache the result of target_vscale from architecture specific implementation
     * as this is used on every Halide to LLVM type conversion.
     */
    int effective_vscale = 0;

    /** Assign a unique ID to each producer-consumer and for-loop node. The IDs
     * are printed as comments in assembly and used to link visualizations with
     * the generated assembly code within `StmtToViz`
     */
    int producer_consumer_id = 0;
    int for_loop_id = 0;

    /** Embed an instance of halide_filter_metadata_t in the code, using
     * the given name (by convention, this should be ${FUNCTIONNAME}_metadata)
     * as extern "C" linkage. Note that the return value is a function-returning-
     * pointer-to-constant-data.
     */
    llvm::Function *embed_metadata_getter(const std::string &metadata_getter_name,
                                          const std::string &function_name, const std::vector<LoweredArgument> &args,
                                          const MetadataNameMap &metadata_name_map);

    /** Embed a constant expression as a global variable. */
    llvm::Constant *embed_constant_expr(Expr e, llvm::Type *t);
    llvm::Constant *embed_constant_scalar_value_t(const Expr &e);

    llvm::Function *add_argv_wrapper(llvm::Function *fn, const std::string &name,
                                     bool result_in_argv, std::vector<bool> &arg_is_buffer);

    llvm::Value *codegen_vector_load(const Type &type, const std::string &name, const Expr &base,
                                     const Buffer<> &image, const Parameter &param, const ModulusRemainder &alignment,
                                     llvm::Value *vpred = nullptr, bool slice_to_native = true, llvm::Value *stride = nullptr);

    virtual void codegen_predicated_load(const Load *op);
    virtual void codegen_predicated_store(const Store *op);

    void codegen_atomic_rmw(const Store *op);

    void init_codegen(const std::string &name);
    std::unique_ptr<llvm::Module> finish_codegen();

    /** A helper routine for generating folded vector reductions. */
    template<typename Op>
    bool try_to_fold_vector_reduce(const Expr &a, Expr b);

    /** Records the StructType for pointer values returned from
     * make_struct intrinsic. Required for opaque pointer support.
     * This map should never grow without bound as each entry
     * represents a unique struct type created by a closure or similar.
     */
    std::map<llvm::Value *, llvm::Type *> struct_type_recovery;
};

}  // namespace Internal

/** Given a Halide module, generate an llvm::Module. */
std::unique_ptr<llvm::Module> codegen_llvm(const Module &module,
                                           llvm::LLVMContext &context);

}  // namespace Halide

#endif
#ifndef HALIDE_CODEGEN_METAL_DEV_H
#define HALIDE_CODEGEN_METAL_DEV_H

/** \file
 * Defines the code-generator for producing Apple Metal shading language kernel code
 */

#include <memory>
#include <string>

namespace Halide {

struct Target;

/** Set the Metal compiler and linker commands to use for generating
 * precompiled Metal shaders (embedded metallibs instead of source code).
 * If both are set to non-empty strings, the Metal code generator will invoke
 * these tools to precompile shaders instead of embedding source code.
 * The compiler should typically be set to something like "xcrun -sdk macosx metal"
 * and the linker to "xcrun -sdk macosx metallib".
 * @param compiler_path The path/command for the Metal compiler
 * @param linker_path The path/command for the Metal linker
 */
void set_metal_compiler_and_linker(const std::string &compiler_path,
                                   const std::string &linker_path);

namespace Internal {

struct CodeGen_GPU_Dev;

std::unique_ptr<CodeGen_GPU_Dev> new_CodeGen_Metal_Dev(const Target &target);

/** Get the Metal compiler command that was set via set_metal_compiler_and_linker().
 * Returns empty string if not set. */
std::string get_metal_compiler();

/** Get the Metal linker command that was set via set_metal_compiler_and_linker().
 * Returns empty string if not set. */
std::string get_metal_linker();

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_CODEGEN_OPENCL_DEV_H
#define HALIDE_CODEGEN_OPENCL_DEV_H

/** \file
 * Defines the code-generator for producing OpenCL C kernel code
 */

#include <memory>

namespace Halide {

struct Target;

namespace Internal {

struct CodeGen_GPU_Dev;

std::unique_ptr<CodeGen_GPU_Dev> new_CodeGen_OpenCL_Dev(const Target &target);

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_CODEGEN_PTX_DEV_H
#define HALIDE_CODEGEN_PTX_DEV_H

/** \file
 * Defines the code-generator for producing CUDA host code
 */

#include <memory>

namespace Halide {

struct Target;

namespace Internal {

struct CodeGen_GPU_Dev;

std::unique_ptr<CodeGen_GPU_Dev> new_CodeGen_PTX_Dev(const Target &target);

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_CODEGEN_POSIX_H
#define HALIDE_CODEGEN_POSIX_H

/** \file
 * Defines a base-class for code-generators on posixy cpu platforms
 */


namespace Halide {
namespace Internal {

/** A code generator that emits posix code from a given Halide stmt. */
class CodeGen_Posix : public CodeGen_LLVM {
public:
    /** Create an posix code generator. Processor features can be
     * enabled using the appropriate arguments */
    CodeGen_Posix(const Target &t);

protected:
    using CodeGen_LLVM::visit;

    /** Posix implementation of Allocate. Small constant-sized allocations go
     * on the stack. The rest go on the heap by calling "halide_malloc"
     * and "halide_free" in the standard library. */
    // @{
    void visit(const Allocate *) override;
    void visit(const Free *) override;
    // @}

    /** A struct describing heap or stack allocations. */
    struct Allocation {
        /** The memory */
        llvm::Value *ptr = nullptr;

        /** Destructor stack slot for this allocation. */
        llvm::Value *destructor = nullptr;

        /** Function to accomplish the destruction. */
        llvm::Function *destructor_function = nullptr;

        /** Pseudostack slot for this allocation. Non-null for
         * allocations of type Stack with dynamic size. */
        llvm::Value *pseudostack_slot = nullptr;

        /** The (Halide) type of the allocation. */
        Type type;

        /** How many bytes this allocation is, or 0 if not
         * constant. */
        int constant_bytes = 0;

        /** How many bytes of stack space used. 0 implies it was a
         * heap allocation. */
        int stack_bytes = 0;

        /** A unique name for this allocation. May not be equal to the
         * Allocate node name in cases where we detect multiple
         * Allocate nodes can share a single allocation. */
        std::string name;
    };

    /** The allocations currently in scope. The stack gets pushed when
     * we enter a new function. */
    Scope<Allocation> allocations;

    std::string get_allocation_name(const std::string &n) override;

private:
    /** Stack allocations that were freed, but haven't gone out of
     * scope yet.  This allows us to re-use stack allocations when
     * they aren't being used. */
    std::vector<Allocation> free_stack_allocs;

    /** current size of all alloca instances in use; this is tracked only
     * for debug output purposes. */
    size_t cur_stack_alloc_total{0};

    /** Generates code for computing the size of an allocation from a
     * list of its extents and its size. Fires a runtime assert
     * (halide_error) if the size overflows 2^31 -1, the maximum
     * positive number an int32_t can hold. */
    llvm::Value *codegen_allocation_size(const std::string &name, Type type, const std::vector<Expr> &extents, const Expr &condition);

    /** Allocates some memory on either the stack or the heap, and
     * returns an Allocation object describing it. For heap
     * allocations this calls halide_malloc in the runtime, and for
     * stack allocations it either reuses an existing block from the
     * free_stack_blocks list, or it saves the stack pointer and calls
     * alloca.
     *
     * This call returns the allocation, pushes it onto the
     * 'allocations' map, and adds an entry to the symbol table called
     * name.host that provides the base pointer.
     *
     * When the allocation can be freed call 'free_allocation', and
     * when it goes out of scope call 'destroy_allocation'. */
    Allocation create_allocation(const std::string &name, Type type, MemoryType memory_type,
                                 const std::vector<Expr> &extents, const Expr &condition,
                                 const Expr &new_expr, std::string free_function, int padding);

    /** Free an allocation previously allocated with
     * create_allocation */
    void free_allocation(const std::string &name);
};

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_CODEGEN_PYTORCH_H
#define HALIDE_CODEGEN_PYTORCH_H

/** \file
 *
 * Defines an IRPrinter that emits C++ code that:
 * 1. wraps PyTorch's C++ tensor into Halide * buffers,
 * 2. calls the corresponding Halide operator.
 * 3. maps the output buffer back to a PyTorch tensor.
 *
 * The generated code checks for runtime errors and raises PyTorch exception
 * accordingly. It also makes sure the GPU device and stream are consistent when
 * the PyTorch input, when applicable.
 */


namespace Halide {

class Module;

namespace Internal {

struct LoweredFunc;

/** This class emits C++ code to wrap a Halide pipeline so that it can
 * be used as a C++ extension operator in PyTorch.
 */
class CodeGen_PyTorch : public IRPrinter {
public:
    CodeGen_PyTorch(std::ostream &dest);
    ~CodeGen_PyTorch() override = default;

    /** Emit the PyTorch C++ wrapper for the Halide pipeline. */
    void compile(const Module &module);

private:
    void compile(const LoweredFunc &func, bool is_cuda);
};

}  // namespace Internal
}  // namespace Halide

#endif  // HALIDE_CODEGEN_PYTORCH_H
#ifndef HALIDE_CODEGEN_TARGETS_H
#define HALIDE_CODEGEN_TARGETS_H

/** \file
 * Provides constructors for code generators for various targets.
 */

#include <memory>

namespace Halide {

struct Target;

namespace Internal {

class CodeGen_Posix;

/** Construct CodeGen object for a variety of targets. */
std::unique_ptr<CodeGen_Posix> new_CodeGen_ARM(const Target &target);
std::unique_ptr<CodeGen_Posix> new_CodeGen_Hexagon(const Target &target);
std::unique_ptr<CodeGen_Posix> new_CodeGen_PowerPC(const Target &target);
std::unique_ptr<CodeGen_Posix> new_CodeGen_RISCV(const Target &target);
std::unique_ptr<CodeGen_Posix> new_CodeGen_X86(const Target &target);
std::unique_ptr<CodeGen_Posix> new_CodeGen_WebAssembly(const Target &target);

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_CODEGEN_VULKAN_DEV_H
#define HALIDE_CODEGEN_VULKAN_DEV_H

/** \file
 * Defines the code-generator for producing SPIR-V binary modules for
 * use with the Vulkan runtime
 */

#include <memory>

namespace Halide {

struct Target;

namespace Internal {

struct CodeGen_GPU_Dev;

std::unique_ptr<CodeGen_GPU_Dev> new_CodeGen_Vulkan_Dev(const Target &target);

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_CODEGEN_WEBGPU_DEV_H
#define HALIDE_CODEGEN_WEBGPU_DEV_H

/** \file
 * Defines the code-generator for producing WebGPU shader code (WGSL)
 */

#include <memory>

namespace Halide {

struct Target;

namespace Internal {

struct CodeGen_GPU_Dev;

std::unique_ptr<CodeGen_GPU_Dev> new_CodeGen_WebGPU_Dev(const Target &target);

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_COMPILER_LOGGER_H_
#define HALIDE_COMPILER_LOGGER_H_

/** \file
 * Defines an interface used to gather and log compile-time information, stats, etc
 * for use in evaluating internal Halide compilation rules and efficiency.
 *
 * The 'standard' implementation simply logs all gathered data to
 * a local file (in JSON form), but the entire implementation can be
 * replaced by custom definitions if you have unusual logging needs.
 */

#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <utility>


namespace Halide {
namespace Internal {

class CompilerLogger {
public:
    /** The "Phase" of compilation, used for some calls */
    enum class Phase {
        HalideLowering,
        LLVM,
    };

    CompilerLogger() = default;
    virtual ~CompilerLogger() = default;

    /** Record when a particular simplifier rule matches.
     */
    virtual void record_matched_simplifier_rule(const std::string &rulename, Expr expr) = 0;

    /** Record when an expression is non-monotonic in a loop variable.
     */
    virtual void record_non_monotonic_loop_var(const std::string &loop_var, Expr expr) = 0;

    /** Record when can_prove() fails, but cannot find a counterexample.
     */
    virtual void record_failed_to_prove(Expr failed_to_prove, Expr original_expr) = 0;

    /** Record total size (in bytes) of final generated object code (e.g., file size of .o output).
     */
    virtual void record_object_code_size(uint64_t bytes) = 0;

    /** Record the compilation time (in seconds) for a given phase.
     */
    virtual void record_compilation_time(Phase phase, double duration) = 0;

    /**
     * Emit all the gathered data to the given stream. This may be called multiple times.
     */
    virtual std::ostream &emit_to_stream(std::ostream &o) = 0;
};

/** Set the active CompilerLogger object, replacing any existing one.
 * It is legal to pass in a nullptr (which means "don't do any compiler logging").
 * Returns the previous CompilerLogger (if any). */
std::unique_ptr<CompilerLogger> set_compiler_logger(std::unique_ptr<CompilerLogger> compiler_logger);

/** Return the currently active CompilerLogger object. If set_compiler_logger()
 * has never been called, a nullptr implementation will be returned.
 * Do not save the pointer returned! It is intended to be used for immediate
 * calls only. */
CompilerLogger *get_compiler_logger();

/** JSONCompilerLogger is a basic implementation of the CompilerLogger interface
 * that saves logged data, then logs it all in JSON format in emit_to_stream().
 */
class JSONCompilerLogger : public CompilerLogger {
public:
    JSONCompilerLogger() = default;

    JSONCompilerLogger(
        const std::string &generator_name,
        const std::string &function_name,
        const std::string &autoscheduler_name,
        const Target &target,
        const std::string &generator_args,
        bool obfuscate_exprs);

    void record_matched_simplifier_rule(const std::string &rulename, Expr expr) override;
    void record_non_monotonic_loop_var(const std::string &loop_var, Expr expr) override;
    void record_failed_to_prove(Expr failed_to_prove, Expr original_expr) override;
    void record_object_code_size(uint64_t bytes) override;
    void record_compilation_time(Phase phase, double duration) override;

    std::ostream &emit_to_stream(std::ostream &o) override;

protected:
    const std::string generator_name;
    const std::string function_name;
    const std::string autoscheduler_name;
    const Target target = Target();
    const std::string generator_args;
    const bool obfuscate_exprs{false};

    // Maps from string representing rewrite rule -> list of Exprs that matched that rule
    std::map<std::string, std::vector<Expr>> matched_simplifier_rules;

    // Maps loop_var -> list of Exprs that were nonmonotonic for that loop_var
    std::map<std::string, std::vector<Expr>> non_monotonic_loop_vars;

    // List of (unprovable simplified Expr, original version of that Expr passed to can_prove()).
    std::vector<std::pair<Expr, Expr>> failed_to_prove_exprs;

    // Total code size generated, in bytes.
    uint64_t object_code_size{0};

    // Map of the time take for each phase of compilation.
    std::map<Phase, double> compilation_time;

    void obfuscate();
    void emit();
};

}  // namespace Internal
}  // namespace Halide

#endif  // HALIDE_COMPILER_LOGGER_H_
#ifndef HALIDE_CONCISE_CASTS_H
#define HALIDE_CONCISE_CASTS_H


/** \file
 *
 * Defines concise cast and saturating cast operators to make it
 * easier to read cast-heavy code. Think carefully about the
 * readability implications before using these. They could make your
 * code better or worse. Often it's better to add extra Funcs to your
 * pipeline that do the upcasting and downcasting.
 */

namespace Halide {
namespace ConciseCasts {

inline Expr f64(Expr e) {
    Type t = Float(64, e.type().lanes());
    return cast(t, std::move(e));
}

inline Expr f32(Expr e) {
    Type t = Float(32, e.type().lanes());
    return cast(t, std::move(e));
}

inline Expr f16(Expr e) {
    Type t = Float(16, e.type().lanes());
    return cast(t, std::move(e));
}

inline Expr bf16(Expr e) {
    Type t = BFloat(16, e.type().lanes());
    return cast(t, std::move(e));
}

inline Expr i64(Expr e) {
    Type t = Int(64, e.type().lanes());
    return cast(t, std::move(e));
}

inline Expr i32(Expr e) {
    Type t = Int(32, e.type().lanes());
    return cast(t, std::move(e));
}

inline Expr i16(Expr e) {
    Type t = Int(16, e.type().lanes());
    return cast(t, std::move(e));
}

inline Expr i8(Expr e) {
    Type t = Int(8, e.type().lanes());
    return cast(t, std::move(e));
}

inline Expr u64(Expr e) {
    Type t = UInt(64, e.type().lanes());
    return cast(t, std::move(e));
}

inline Expr u32(Expr e) {
    Type t = UInt(32, e.type().lanes());
    return cast(t, std::move(e));
}

inline Expr u16(Expr e) {
    Type t = UInt(16, e.type().lanes());
    return cast(t, std::move(e));
}

inline Expr u8(Expr e) {
    Type t = UInt(8, e.type().lanes());
    return cast(t, std::move(e));
}

inline Expr i8_sat(Expr e) {
    Type t = Int(8, e.type().lanes());
    return saturating_cast(t, std::move(e));
}

inline Expr u8_sat(Expr e) {
    Type t = UInt(8, e.type().lanes());
    return saturating_cast(t, std::move(e));
}

inline Expr i16_sat(Expr e) {
    Type t = Int(16, e.type().lanes());
    return saturating_cast(t, std::move(e));
}

inline Expr u16_sat(Expr e) {
    Type t = UInt(16, e.type().lanes());
    return saturating_cast(t, std::move(e));
}

inline Expr i32_sat(Expr e) {
    Type t = Int(32, e.type().lanes());
    return saturating_cast(t, std::move(e));
}

inline Expr u32_sat(Expr e) {
    Type t = UInt(32, e.type().lanes());
    return saturating_cast(t, std::move(e));
}

inline Expr i64_sat(Expr e) {
    Type t = Int(64, e.type().lanes());
    return saturating_cast(t, std::move(e));
}

inline Expr u64_sat(Expr e) {
    Type t = UInt(64, e.type().lanes());
    return saturating_cast(t, std::move(e));
}

};  // namespace ConciseCasts
};  // namespace Halide

#endif
#ifndef HALIDE_CONSTANT_BOUNDS_H
#define HALIDE_CONSTANT_BOUNDS_H


/** \file
 * Methods for computing compile-time constant int64_t upper and lower bounds of
 * an expression. Cheaper than symbolic bounds inference, and useful for things
 * like instruction selection.
 */

namespace Halide {
namespace Internal {

/** Deduce constant integer bounds on an expression. This can be useful to
 * decide if, for example, the expression can be cast to another type, be
 * negated, be incremented, etc without risking overflow.
 *
 * Also optionally accepts a scope containing the integer bounds of any
 * variables that may be referenced, and a cache of constant integer bounds on
 * known Exprs, which this function will update. The cache is helpful to
 * short-circuit large numbers of redundant queries, but it should not be used
 * in contexts where the same Expr object may take on different values within a
 * single Expr (i.e. before uniquify_variable_names).
 */
ConstantInterval constant_integer_bounds(const Expr &e,
                                         const Scope<ConstantInterval> &scope = Scope<ConstantInterval>::empty_scope(),
                                         std::map<Expr, ConstantInterval, ExprCompare> *cache = nullptr);

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_INTERNAL_DEBUG_ARGUMENTS_H
#define HALIDE_INTERNAL_DEBUG_ARGUMENTS_H


/** \file
 *
 * Defines a lowering pass that injects debug statements inside a
 * LoweredFunc. Intended to be used when Target::Debug is on.
 */

namespace Halide {
namespace Internal {

struct LoweredFunc;

/** Injects debug prints in a LoweredFunc that describe the target and
 * arguments. Mutates the given func. */
void debug_arguments(LoweredFunc *func, const Target &t);

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_DEBUG_TO_FILE_H
#define HALIDE_DEBUG_TO_FILE_H

/** \file
 * Defines the lowering pass that injects code at the end of
 * every realization to dump functions to a file for debugging.  */

#include <map>
#include <vector>


namespace Halide {
namespace Internal {

class Function;

/** Takes a statement with Realize nodes still unlowered. If the
 * corresponding functions have a debug_file set, then inject code
 * that will dump the contents of those functions to a file after the
 * realization. */
Stmt debug_to_file(Stmt s,
                   const std::vector<Function> &outputs,
                   const std::map<std::string, Function> &env);

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef DEINTERLEAVE_H
#define DEINTERLEAVE_H

/** \file
 *
 * Defines methods for splitting up a vector into the even lanes and
 * the odd lanes. Useful for optimizing expressions such as select(x %
 * 2, f(x/2), g(x/2))
 */


namespace Halide {
namespace Internal {

/** Extract the odd-numbered lanes in a vector */
Expr extract_odd_lanes(const Expr &a);

/** Extract the even-numbered lanes in a vector */
Expr extract_even_lanes(const Expr &a);

/** Extract the nth lane of a vector */
Expr extract_lane(const Expr &vec, int lane);

/** Look through a statement for expressions of the form select(ramp %
 * 2 == 0, a, b) and replace them with calls to an interleave
 * intrinsic */
Stmt rewrite_interleavings(const Stmt &s);

void deinterleave_vector_test();

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_DERIVATIVE_H
#define HALIDE_DERIVATIVE_H

/** \file
 *  Automatic differentiation
 */


#include <map>
#include <string>
#include <vector>

namespace Halide {

/**
 *  Helper structure storing the adjoints Func.
 *  Use d(func) or d(buffer) to obtain the derivative Func.
 */
class Derivative {
public:
    // function name & update_id, for initialization update_id == -1
    using FuncKey = std::pair<std::string, int>;

    explicit Derivative(const std::map<FuncKey, Func> &adjoints_in)
        : adjoints(adjoints_in) {
    }
    explicit Derivative(std::map<FuncKey, Func> &&adjoints_in)
        : adjoints(std::move(adjoints_in)) {
    }

    // These all return an undefined Func if no derivative is found
    // (typically, if the input Funcs aren't differentiable)
    Func operator()(const Func &func, int update_id = -1) const;
    Func operator()(const Buffer<> &buffer) const;
    Func operator()(const Param<> &param) const;
    Func operator()(const std::string &name) const;

private:
    const std::map<FuncKey, Func> adjoints;
};

/**
 *  Given a Func and a corresponding adjoint, (back)propagate the
 *  adjoint to all dependent Funcs, buffers, and parameters.
 *  The bounds of output and adjoint need to be specified with pair {min, extent}
 *  For each Func the output depends on, and for the pure definition and
 *  each update of that Func, it generates a derivative Func stored in
 *  the Derivative.
 */
Derivative propagate_adjoints(const Func &output,
                              const Func &adjoint,
                              const Region &output_bounds);
/**
 *  Given a Func and a corresponding adjoint buffer, (back)propagate the
 *  adjoint to all dependent Funcs, buffers, and parameters.
 *  For each Func the output depends on, and for the pure definition and
 *  each update of that Func, it generates a derivative Func stored in
 *  the Derivative.
 */
Derivative propagate_adjoints(const Func &output,
                              const Buffer<float> &adjoint);
/**
 *  Given a scalar Func with size 1, (back)propagate the gradient
 *  to all dependent Funcs, buffers, and parameters.
 *  For each Func the output depends on, and for the pure definition and
 *  each update of that Func, it generates a derivative Func stored in
 *  the Derivative.
 */
Derivative propagate_adjoints(const Func &output);

}  // namespace Halide

#endif
#ifndef HALIDE_INTERNAL_DERIVATIVE_UTILS_H
#define HALIDE_INTERNAL_DERIVATIVE_UTILS_H

#include <set>


namespace Halide {
namespace Internal {

/**
 * Remove all let definitions of expr
 */
Expr remove_let_definitions(const Expr &expr);

/**
 * Return a list of variables' indices that expr depends on and are in the filter
 */
std::vector<int> gather_variables(const Expr &expr,
                                  const std::vector<std::string> &filter);
std::vector<int> gather_variables(const Expr &expr,
                                  const std::vector<Var> &filter);

/**
 * Return a list of reduction variables the expression or tuple depends on
 */
struct ReductionVariableInfo {
    Expr min, extent;
    int index;
    ReductionDomain domain;
    std::string name;
};
std::map<std::string, ReductionVariableInfo> gather_rvariables(const Expr &expr);
std::map<std::string, ReductionVariableInfo> gather_rvariables(const Tuple &tuple);
/**
 * Add necessary let expressions to expr
 */
Expr add_let_expression(const Expr &expr,
                        const std::map<std::string, Expr> &let_var_mapping,
                        const std::vector<std::string> &let_variables);
/**
 * Topologically sort the expression graph expressed by expr
 */
std::vector<Expr> sort_expressions(const Expr &expr);
/**
 * Compute the bounds of funcs. The bounds represent a conservative region
 * that is used by the "consumers" of the function, except of itself.
 */
std::map<std::string, Box> inference_bounds(const std::vector<Func> &funcs,
                                            const std::vector<Box> &output_bounds);
std::map<std::string, Box> inference_bounds(const Func &func,
                                            const Box &output_bounds);
/**
 * Convert Box to vector of (min, extent)
 */
std::vector<std::pair<Expr, Expr>> box_to_vector(const Box &bounds);
/**
 * Return true if bounds0 and bounds1 represent the same bounds.
 */
bool equal(const RDom &bounds0, const RDom &bounds1);
/**
 * Return a list of variable names
 */
std::vector<std::string> vars_to_strings(const std::vector<Var> &vars);
/**
 * Return the reduction domain used by expr
 */
ReductionDomain extract_rdom(const Expr &expr);
/**
 * expr is new_var == f(var), solve for var == g(new_var)
 * if multiple new_var corresponds to same var, introduce a RDom
 */
std::pair<bool, Expr> solve_inverse(Expr expr,
                                    const std::string &new_var,
                                    const std::string &var);
/**
 * Find all calls to image buffers and parameters in the function
 */
struct BufferInfo {
    int dimension;
    Type type;
};
std::map<std::string, BufferInfo> find_buffer_param_calls(const Func &func);
/**
 * Find all implicit variables in expr
 */
std::set<std::string> find_implicit_variables(const Expr &expr);
/**
 * Substitute the variable. Also replace all occurrences in rdom.where() predicates.
 */
Expr substitute_rdom_predicate(
    const std::string &name, const Expr &replacement, const Expr &expr);

/**
 * Return true if expr contains call to func_name
 */
bool is_calling_function(
    const std::string &func_name, const Expr &expr,
    const std::map<std::string, Expr> &let_var_mapping);
/**
 * Return true if expr depends on any function or buffer
 */
bool is_calling_function(
    const Expr &expr,
    const std::map<std::string, Expr> &let_var_mapping);

/**
 * Replaces call to Func f in Expr e such that the call argument at variable_id
 * is the pure argument.
 */
Expr substitute_call_arg_with_pure_arg(Func f,
                                       int variable_id,
                                       const Expr &e);

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_DESERIALIZATION_H
#define HALIDE_DESERIALIZATION_H

#include <istream>
#include <string>

namespace Halide {

/// @brief Deserialize a Halide pipeline from a file.
/// @param filename The location of the file to deserialize.  Must use .hlpipe extension.
/// @param user_params Map of named input/output parameters to bind with the resulting pipeline (used to avoid deserializing specific objects and enable the use of externally defined ones instead).
/// @return Returns a newly constructed deserialized Pipeline object/
Pipeline deserialize_pipeline(const std::string &filename, const std::map<std::string, Parameter> &user_params);

/// @brief Deserialize a Halide pipeline from an input stream.
/// @param in The input stream to read from containing a serialized Halide pipeline
/// @param user_params Map of named input/output parameters to bind with the resulting pipeline (used to avoid deserializing specific objects and enable the use of externally defined ones instead).
/// @return Returns a newly constructed deserialized Pipeline object/
Pipeline deserialize_pipeline(std::istream &in, const std::map<std::string, Parameter> &user_params);

/// @brief Deserialize a Halide pipeline from a byte buffer containing a serialized pipeline in binary format
/// @param data The data buffer containing a serialized Halide pipeline
/// @param user_params Map of named input/output parameters to bind with the resulting pipeline (used to avoid deserializing specific objects and enable the use of externally defined ones instead).
/// @return Returns a newly constructed deserialized Pipeline object/
Pipeline deserialize_pipeline(const std::vector<uint8_t> &data, const std::map<std::string, Parameter> &user_params);

/// @brief Deserialize the external parameters for the Halide pipeline from a file.
///        This method allows a minimal deserialization of just the external pipeline parameters, so they can be
///        remapped and overridden with user parameters prior to deserializing the pipeline definition.
/// @param filename The location of the file to deserialize.  Must use .hlpipe extension.
/// @return Returns a map containing the names and description of external parameters referenced in the pipeline
std::map<std::string, Parameter> deserialize_parameters(const std::string &filename);

/// @brief Deserialize the external parameters for the Halide pipeline from input stream.
///        This method allows a minimal deserialization of just the external pipeline parameters, so they can be
///        remapped and overridden with user parameters prior to deserializing the pipeline definition.
/// @param in The input stream to read from containing a serialized Halide pipeline
/// @return Returns a map containing the names and description of external parameters referenced in the pipeline
std::map<std::string, Parameter> deserialize_parameters(std::istream &in);

/// @brief Deserialize the external parameters for the Halide pipeline from a byte buffer containing a serialized
///        pipeline in binary format.  This method allows a minimal deserialization of just the external pipeline
///        parameters, so they can be remapped and overridden with user parameters prior to deserializing the
///        pipeline definition.
/// @param data The data buffer containing a serialized Halide pipeline
/// @return Returns a map containing the names and description of external parameters referenced in the pipeline
std::map<std::string, Parameter> deserialize_parameters(const std::vector<uint8_t> &data);

}  // namespace Halide

#endif
#ifndef HALIDE_DIMENSION_H
#define HALIDE_DIMENSION_H

/** \file
 * Defines the Dimension utility class for Halide pipelines
 */

#include <utility>


namespace Halide {
namespace Internal {

class Dimension {
public:
    /** Get an expression representing the minimum coordinates of this image
     * parameter in the given dimension. */
    Expr min() const;

    /** Get an expression representing the extent of this image
     * parameter in the given dimension */
    Expr extent() const;

    /** Get an expression representing the maximum coordinates of
     * this image parameter in the given dimension. */
    Expr max() const;

    /** Get an expression representing the stride of this image in the
     * given dimension */
    Expr stride() const;

    /** Set the min in a given dimension to equal the given
     * expression. Setting the mins to zero may simplify some
     * addressing math. */
    Dimension set_min(const Expr &min);

    /** Set the extent in a given dimension to equal the given
     * expression. Images passed in that fail this check will generate
     * a runtime error. Returns a reference to the ImageParam so that
     * these calls may be chained.
     *
     * This may help the compiler generate better
     * code. E.g:
     \code
     im.dim(0).set_extent(100);
     \endcode
     * tells the compiler that dimension zero must be of extent 100,
     * which may result in simplification of boundary checks. The
     * value can be an arbitrary expression:
     \code
     im.dim(0).set_extent(im.dim(1).extent());
     \endcode
     * declares that im is a square image (of unknown size), whereas:
     \code
     im.dim(0).set_extent((im.dim(0).extent()/32)*32);
     \endcode
     * tells the compiler that the extent is a multiple of 32. */
    Dimension set_extent(const Expr &extent);

    /** Set the stride in a given dimension to equal the given
     * value. This is particularly helpful to set when
     * vectorizing. Known strides for the vectorized dimension
     * generate better code. */
    Dimension set_stride(const Expr &stride);

    /** Set the min and extent in one call. */
    Dimension set_bounds(const Expr &min, const Expr &extent);

    /** Set the min and extent estimates in one call. These values are only
     * used by the auto-scheduler and/or the RunGen tool/ */
    Dimension set_estimate(const Expr &min, const Expr &extent);

    Expr min_estimate() const;
    Expr extent_estimate() const;

    /** Get a different dimension of the same buffer */
    // @{
    Dimension dim(int i) const;
    // @}

private:
    friend class ::Halide::OutputImageParam;

    /** Construct a Dimension representing dimension d of some
     * Parameter p. Only friends may construct
     * these. */
    Dimension(const Parameter &p, int d, Func f);

    Parameter param;
    int d;
    Func f;
};

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_DISTRIBUTE_SHIFTS_H
#define HALIDE_DISTRIBUTE_SHIFTS_H

/** \file
 * A tool to distribute shifts as multiplies, useful for some backends. (e.g. ARM, HVX).
 */


namespace Halide {
namespace Internal {

// Distributes shifts as multiplies. If `multiply_adds` is set,
// then only distributes the patterns `a + widening_shl(b, c)` /
// `a - widening_shl(b, c)` and `a + b << c` / `a - b << c`, to
// produce `a (+/-) widening_mul(b, 1 << c)` and `a (+/-) b * (1 << c)`,
// respectively
Stmt distribute_shifts(const Stmt &stmt, bool multiply_adds);

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_EARLY_FREE_H
#define HALIDE_EARLY_FREE_H

/** \file
 * Defines the lowering pass that injects markers just after
 * the last use of each buffer so that they can potentially be freed
 * earlier.
 */


namespace Halide {
namespace Internal {

/** Take a statement with allocations and inject markers (of the form
 * of calls to "mark buffer dead") after the last use of each
 * allocation. Targets may use this to free buffers earlier than the
 * close of their Allocate node. */
Stmt inject_early_frees(const Stmt &s);

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_ELF_H
#define HALIDE_ELF_H

#include <algorithm>
#include <cstdint>
#include <iterator>
#include <list>
#include <memory>
#include <string>
#include <utility>
#include <vector>

namespace Halide {
namespace Internal {
namespace Elf {

// This ELF parser mostly deserializes the object into a graph
// structure in memory. It replaces indices into tables (sections,
// symbols, etc.) with a weakly referenced graph of pointers. The
// Object datastructure owns all of the objects. This namespace exists
// because it is very difficult to use LLVM's object parser to modify
// an object (it's fine for parsing only). This was built using
// http://www.skyfree.org/linux/references/ELF_Format.pdf as a reference
// for the ELF structs and constants.

class Object;
class Symbol;
class Section;
class Relocation;

// Helpful wrapper to allow range-based for loops.
template<typename T>
class iterator_range {
    T b, e;

public:
    iterator_range(T b, T e)
        : b(b), e(e) {
    }

    T begin() const {
        return b;
    }
    T end() const {
        return e;
    }
};

/** Describes a symbol */
class Symbol {
public:
    enum Binding : uint8_t {
        STB_LOCAL = 0,
        STB_GLOBAL = 1,
        STB_WEAK = 2,
        STB_LOPROC = 13,
        STB_HIPROC = 15,
    };

    enum Type : uint8_t {
        STT_NOTYPE = 0,
        STT_OBJECT = 1,
        STT_FUNC = 2,
        STT_SECTION = 3,
        STT_FILE = 4,
        STT_LOPROC = 13,
        STT_HIPROC = 15,
    };

    enum Visibility : uint8_t {
        STV_DEFAULT = 0,
        STV_INTERNAL = 1,
        STV_HIDDEN = 2,
        STV_PROTECTED = 3,
    };

private:
    std::string name;
    const Section *definition = nullptr;
    uint64_t offset = 0;
    uint32_t size = 0;
    Binding binding = STB_LOCAL;
    Type type = STT_NOTYPE;
    Visibility visibility = STV_DEFAULT;

public:
    Symbol() = default;
    Symbol(const std::string &name)
        : name(name) {
    }

    /** Accesses the name of this symbol. */
    ///@{
    Symbol &set_name(const std::string &name) {
        this->name = name;
        return *this;
    }
    const std::string &get_name() const {
        return name;
    }
    ///@}

    /** Accesses the type of this symbol. */
    ///@{
    Symbol &set_type(Type type) {
        this->type = type;
        return *this;
    }
    Type get_type() const {
        return type;
    }
    ///@}

    /** Accesses the properties that describe the definition of this symbol. */
    ///@{
    Symbol &define(const Section *section, uint64_t offset, uint32_t size) {
        this->definition = section;
        this->offset = offset;
        this->size = size;
        return *this;
    }
    bool is_defined() const {
        return definition != nullptr;
    }
    const Section *get_section() const {
        return definition;
    }
    uint64_t get_offset() const {
        return offset;
    }
    uint32_t get_size() const {
        return size;
    }
    ///@}

    /** Access the binding and visibility of this symbol. See the ELF
     * spec for more information about these properties. */
    ///@{
    Symbol &set_binding(Binding binding) {
        this->binding = binding;
        return *this;
    }
    Symbol &set_visibility(Visibility visibility) {
        this->visibility = visibility;
        return *this;
    }
    Binding get_binding() const {
        return binding;
    }
    Visibility get_visibility() const {
        return visibility;
    }
    ///@}
};

/** Describes a relocation to be applied to an offset of a section in
 * an Object. */
class Relocation {
    uint32_t type = 0;
    uint64_t offset = 0;
    int64_t addend = 0;
    const Symbol *symbol = nullptr;

public:
    Relocation() = default;
    Relocation(uint32_t type, uint64_t offset, int64_t addend, const Symbol *symbol)
        : type(type), offset(offset), addend(addend), symbol(symbol) {
    }

    /** The type of relocation to be applied. The meaning of this
     * value depends on the machine of the object. */
    ///@{
    Relocation &set_type(uint32_t type) {
        this->type = type;
        return *this;
    }
    uint32_t get_type() const {
        return type;
    }
    ///@}

    /** Where to apply the relocation. This is relative to the section
     * the relocation belongs to. */
    ///@{
    Relocation &set_offset(uint64_t offset) {
        this->offset = offset;
        return *this;
    }
    uint64_t get_offset() const {
        return offset;
    }
    ///@}

    /** The value to replace with the relocation is the address of the symbol plus the addend. */
    ///@{
    Relocation &set_symbol(const Symbol *symbol) {
        this->symbol = symbol;
        return *this;
    }
    Relocation &set_addend(int64_t addend) {
        this->addend = addend;
        return *this;
    }
    const Symbol *get_symbol() const {
        return symbol;
    }
    int64_t get_addend() const {
        return addend;
    }
    ///@}
};

/** Describes a section of an object file. */
class Section {
public:
    enum Type : uint32_t {
        SHT_NULL = 0,
        SHT_PROGBITS = 1,
        SHT_SYMTAB = 2,
        SHT_STRTAB = 3,
        SHT_RELA = 4,
        SHT_HASH = 5,
        SHT_DYNAMIC = 6,
        SHT_NOTE = 7,
        SHT_NOBITS = 8,
        SHT_REL = 9,
        SHT_SHLIB = 10,
        SHT_DYNSYM = 11,
        SHT_LOPROC = 0x70000000u,
        SHT_HIPROC = 0x7fffffffu,
        SHT_LOUSER = 0x80000000u,
        SHT_HIUSER = 0xffffffffu,
    };

    enum Flag : uint32_t {
        SHF_WRITE = 0x1,
        SHF_ALLOC = 0x2,
        SHF_EXECINSTR = 0x4,
        SHF_MASKPROC = 0xf0000000u,
    };

    typedef std::vector<Relocation> RelocationList;
    typedef RelocationList::iterator relocation_iterator;
    typedef RelocationList::const_iterator const_relocation_iterator;

    typedef std::vector<char>::iterator contents_iterator;
    typedef std::vector<char>::const_iterator const_contents_iterator;

private:
    std::string name;
    Type type = SHT_NULL;
    uint32_t flags = 0;
    std::vector<char> contents;
    // Sections may have a size larger than the contents.
    uint64_t size = 0;
    uint64_t alignment = 1;
    RelocationList relocs;

public:
    Section() = default;
    Section(const std::string &name, Type type)
        : name(name), type(type) {
    }

    Section &set_name(const std::string &name) {
        this->name = name;
        return *this;
    }
    const std::string &get_name() const {
        return name;
    }

    Section &set_type(Type type) {
        this->type = type;
        return *this;
    }
    Type get_type() const {
        return type;
    }

    Section &set_flag(Flag flag) {
        this->flags |= flag;
        return *this;
    }
    Section &remove_flag(Flag flag) {
        this->flags &= ~flag;
        return *this;
    }
    Section &set_flags(uint32_t flags) {
        this->flags = flags;
        return *this;
    }
    uint32_t get_flags() const {
        return flags;
    }
    bool is_alloc() const {
        return (flags & SHF_ALLOC) != 0;
    }
    bool is_writable() const {
        return (flags & SHF_WRITE) != 0;
    }

    /** Get or set the size of the section. The size may be larger
     * than the content. */
    ///@{
    Section &set_size(uint64_t size) {
        this->size = size;
        return *this;
    }
    uint64_t get_size() const {
        return std::max((uint64_t)size, (uint64_t)contents.size());
    }
    ///@}

    Section &set_alignment(uint64_t alignment) {
        this->alignment = alignment;
        return *this;
    }
    uint64_t get_alignment() const {
        return alignment;
    }

    Section &set_contents(std::vector<char> contents) {
        this->contents = std::move(contents);
        return *this;
    }
    template<typename It>
    Section &set_contents(It begin, It end) {
        this->contents.assign(begin, end);
        return *this;
    }
    template<typename It>
    Section &append_contents(It begin, It end) {
        this->contents.insert(this->contents.end(), begin, end);
        return *this;
    }
    template<typename It>
    Section &prepend_contents(It begin, It end) {
        typedef typename std::iterator_traits<It>::value_type T;
        uint64_t size_bytes = std::distance(begin, end) * sizeof(T);
        this->contents.insert(this->contents.begin(), begin, end);

        // When we add data to the start of the section, we need to fix up
        // the offsets of the relocations linked to this section.
        for (Relocation &r : relocations()) {
            r.set_offset(r.get_offset() + size_bytes);
        }

        return *this;
    }
    /** Set, append or prepend an object to the contents, assuming T is a
     * trivially copyable datatype. */
    template<typename T>
    Section &set_contents(const std::vector<T> &contents) {
        this->contents.assign((const char *)contents.data(), (const char *)(contents.data() + contents.size()));
        return *this;
    }
    template<typename T>
    Section &append_contents(const T &x) {
        return append_contents((const char *)&x, (const char *)(&x + 1));
    }
    template<typename T>
    Section &prepend_contents(const T &x) {
        return prepend_contents((const char *)&x, (const char *)(&x + 1));
    }
    const std::vector<char> &get_contents() const {
        return contents;
    }
    contents_iterator contents_begin() {
        return contents.begin();
    }
    contents_iterator contents_end() {
        return contents.end();
    }
    const_contents_iterator contents_begin() const {
        return contents.begin();
    }
    const_contents_iterator contents_end() const {
        return contents.end();
    }
    const char *contents_data() const {
        return contents.data();
    }
    size_t contents_size() const {
        return contents.size();
    }
    bool contents_empty() const {
        return contents.empty();
    }

    Section &set_relocations(std::vector<Relocation> relocs) {
        this->relocs = std::move(relocs);
        return *this;
    }
    template<typename It>
    Section &set_relocations(It begin, It end) {
        this->relocs.assign(begin, end);
        return *this;
    }
    void add_relocation(const Relocation &reloc) {
        relocs.push_back(reloc);
    }
    relocation_iterator relocations_begin() {
        return relocs.begin();
    }
    relocation_iterator relocations_end() {
        return relocs.end();
    }
    iterator_range<relocation_iterator> relocations() {
        return {relocs.begin(), relocs.end()};
    }
    const_relocation_iterator relocations_begin() const {
        return relocs.begin();
    }
    const_relocation_iterator relocations_end() const {
        return relocs.end();
    }
    iterator_range<const_relocation_iterator> relocations() const {
        return {relocs.begin(), relocs.end()};
    }
    size_t relocations_size() const {
        return relocs.size();
    }
};

/** Base class for a target architecture to implement the target
 * specific aspects of linking. */
class Linker {
public:
    virtual ~Linker() = default;

    virtual uint16_t get_machine() = 0;
    virtual uint32_t get_flags() = 0;
    virtual uint32_t get_version() = 0;
    virtual void append_dynamic(Section &dynamic) = 0;

    /** Add or get an entry to the global offset table (GOT) with a
     * relocation pointing to sym. */
    virtual uint64_t get_got_entry(Section &got, const Symbol &sym) = 0;

    /** Check to see if this relocation should go through the PLT. */
    virtual bool needs_plt_entry(const Relocation &reloc) = 0;

    /** Add a PLT entry for a symbol sym defined externally. Returns a
     * symbol representing the PLT entry. */
    virtual Symbol add_plt_entry(const Symbol &sym, Section &plt, Section &got,
                                 const Symbol &got_sym) = 0;

    /** Perform a relocation. This function may opt to not apply the
     * relocation, and return a new relocation to be performed at
     * runtime. This requires that the section to apply the relocation
     * to is writable at runtime. */
    virtual Relocation relocate(uint64_t fixup_offset, char *fixup_addr, uint64_t type,
                                const Symbol *sym, uint64_t sym_offset, int64_t addend,
                                Section &got) = 0;
};

/** Holds all of the relevant sections and symbols for an object. */
class Object {
public:
    enum Type : uint16_t {
        ET_NONE = 0,
        ET_REL = 1,
        ET_EXEC = 2,
        ET_DYN = 3,
        ET_CORE = 4,
        ET_LOPROC = 0xff00u,
        ET_HIPROC = 0xffffu,
    };

    // We use lists for sections and symbols to avoid iterator
    // invalidation when we modify the containers.
    typedef std::list<Section> SectionList;
    typedef typename SectionList::iterator section_iterator;
    typedef typename SectionList::const_iterator const_section_iterator;

    typedef std::list<Symbol> SymbolList;
    typedef typename SymbolList::iterator symbol_iterator;
    typedef typename SymbolList::const_iterator const_symbol_iterator;

private:
    SectionList secs;
    SymbolList syms;

    Type type = ET_NONE;
    uint16_t machine = 0;
    uint32_t version = 0;
    uint64_t entry = 0;
    uint32_t flags = 0;

    Object(const Object &);
    void operator=(const Object &);

public:
    Object() = default;

    Type get_type() const {
        return type;
    }
    uint16_t get_machine() const {
        return machine;
    }
    uint32_t get_version() const {
        return version;
    }
    uint64_t get_entry() const {
        return entry;
    }
    uint32_t get_flags() const {
        return flags;
    }

    Object &set_type(Type type) {
        this->type = type;
        return *this;
    }
    Object &set_machine(uint16_t machine) {
        this->machine = machine;
        return *this;
    }
    Object &set_version(uint32_t version) {
        this->version = version;
        return *this;
    }
    Object &set_entry(uint64_t entry) {
        this->entry = entry;
        return *this;
    }
    Object &set_flags(uint32_t flags) {
        this->flags = flags;
        return *this;
    }

    /** Parse an object in memory to an Object. */
    static std::unique_ptr<Object> parse_object(const char *data, size_t size);

    /** Write a shared object in memory. */
    std::vector<char> write_shared_object(Linker *linker, const std::vector<std::string> &depedencies = {},
                                          const std::string &soname = "");

    section_iterator sections_begin() {
        return secs.begin();
    }
    section_iterator sections_end() {
        return secs.end();
    }
    iterator_range<section_iterator> sections() {
        return {secs.begin(), secs.end()};
    }
    const_section_iterator sections_begin() const {
        return secs.begin();
    }
    const_section_iterator sections_end() const {
        return secs.end();
    }
    iterator_range<const_section_iterator> sections() const {
        return {secs.begin(), secs.end()};
    }
    size_t sections_size() const {
        return secs.size();
    }
    section_iterator find_section(const std::string &name);

    section_iterator add_section(const std::string &name, Section::Type type);
    section_iterator add_relocation_section(const Section &for_section);
    section_iterator erase_section(section_iterator i) {
        return secs.erase(i);
    }

    section_iterator merge_sections(const std::vector<section_iterator> &sections);
    section_iterator merge_text_sections();

    symbol_iterator symbols_begin() {
        return syms.begin();
    }
    symbol_iterator symbols_end() {
        return syms.end();
    }
    iterator_range<symbol_iterator> symbols() {
        return {syms.begin(), syms.end()};
    }
    const_symbol_iterator symbols_begin() const {
        return syms.begin();
    }
    const_symbol_iterator symbols_end() const {
        return syms.end();
    }
    iterator_range<const_symbol_iterator> symbols() const {
        return {syms.begin(), syms.end()};
    }
    size_t symbols_size() const {
        return syms.size();
    }
    symbol_iterator find_symbol(const std::string &name);
    const_symbol_iterator find_symbol(const std::string &name) const;

    symbol_iterator add_symbol(const std::string &name);
};

}  // namespace Elf
}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_IR_ELIMINATE_BOOL_VECTORS_H
#define HALIDE_IR_ELIMINATE_BOOL_VECTORS_H

/** \file
 * Method to eliminate vectors of booleans from IR.
 */


namespace Halide {
namespace Internal {

/** Some targets treat vectors of bools as integers of the same type that the
 * boolean operation is being used to operate on. For example, instead of
 * select(i1x8, u16x8, u16x8), the target would prefer to see select(u16x8,
 * u16x8, u16x8), where the first argument is a vector of integers representing
 * a mask. This pass converts vectors of bools to vectors of integers to meet
 * this requirement. This is done by injecting intrinsics to convert bools to
 * architecture-specific masks, and using a select_mask intrinsic instead of a
 * Select node. This also converts any intrinsics that operate on vectorized
 * conditions to a *_mask equivalent (if_then_else, require). Because the masks
 * are architecture specific, they may not be stored or loaded. On Stores, the
 * masks are converted to UInt(8) with a value of 0 or 1, which is our canonical
 * in-memory representation of a bool. */
///@{
Stmt eliminate_bool_vectors(const Stmt &s);
Expr eliminate_bool_vectors(const Expr &s);
///@}

/** If a type is a boolean vector, find the type that it has been
 * changed to by eliminate_bool_vectors. */
inline Type eliminated_bool_type(Type bool_type, Type other_type) {
    if (bool_type.is_vector() && bool_type.bits() == 1) {
        bool_type = bool_type.with_code(Type::Int).with_bits(other_type.bits());
    }
    return bool_type;
}

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_EMULATE_FLOAT16_MATH_H
#define HALIDE_EMULATE_FLOAT16_MATH_H

/** \file
 * Methods for dealing with float16 arithmetic using float32 math, by
 * casting back and forth with bit tricks.
 */


namespace Halide {
namespace Internal {

/** Check if a call is a float16 transcendental (e.g. sqrt_f16) */
bool is_float16_transcendental(const Call *);

/** Implement a float16 transcendental using the float32 equivalent. */
Expr lower_float16_transcendental_to_float32_equivalent(const Call *);

/** Cast to/from float and bfloat using bitwise math. */
//@{
Expr float32_to_bfloat16(Expr e);
Expr float32_to_float16(Expr e);
Expr float16_to_float32(Expr e);
Expr bfloat16_to_float32(Expr e);
Expr lower_float16_cast(const Cast *op);
//@}

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_EXPR_USES_VAR_H
#define HALIDE_EXPR_USES_VAR_H

/** \file
 * Defines a method to determine if an expression depends on some variables.
 */


namespace Halide {
namespace Internal {

template<typename T = void>
class ExprUsesVars : public IRGraphVisitor {
    using IRGraphVisitor::visit;

    const Scope<T> &vars;
    Scope<Expr> scope;

    void include(const Expr &e) override {
        if (result) {
            return;
        }
        IRGraphVisitor::include(e);
    }

    void include(const Stmt &s) override {
        if (result) {
            return;
        }
        IRGraphVisitor::include(s);
    }

    void visit_name(const std::string &name) {
        if (vars.contains(name)) {
            result = true;
        } else if (const Expr *e = scope.find(name)) {
            IRGraphVisitor::include(*e);
        }
    }

    void visit(const Variable *op) override {
        visit_name(op->name);
    }

    void visit(const Load *op) override {
        visit_name(op->name);
        IRGraphVisitor::visit(op);
    }

    void visit(const Store *op) override {
        visit_name(op->name);
        IRGraphVisitor::visit(op);
    }

    void visit(const Call *op) override {
        visit_name(op->name);
        IRGraphVisitor::visit(op);
    }

    void visit(const Provide *op) override {
        visit_name(op->name);
        IRGraphVisitor::visit(op);
    }

    void visit(const LetStmt *op) override {
        visit_name(op->name);
        IRGraphVisitor::visit(op);
    }

    void visit(const Let *op) override {
        visit_name(op->name);
        IRGraphVisitor::visit(op);
    }

    void visit(const Realize *op) override {
        visit_name(op->name);
        IRGraphVisitor::visit(op);
    }

    void visit(const Allocate *op) override {
        visit_name(op->name);
        IRGraphVisitor::visit(op);
    }

public:
    ExprUsesVars(const Scope<T> &v, const Scope<Expr> *s = nullptr)
        : vars(v) {
        scope.set_containing_scope(s);
    }
    bool result = false;
};

/** Test if a statement or expression references or defines any of the
 *  variables in a scope, additionally considering variables bound to
 *  Expr's in the scope provided in the final argument.
 */
template<typename StmtOrExpr, typename T>
inline bool stmt_or_expr_uses_vars(const StmtOrExpr &e, const Scope<T> &v,
                                   const Scope<Expr> &s = Scope<Expr>::empty_scope()) {
    ExprUsesVars<T> uses(v, &s);
    e.accept(&uses);
    return uses.result;
}

/** Test if a statement or expression references or defines the given
 * variable, additionally considering variables bound to Expr's in the
 * scope provided in the final argument.
 */
template<typename StmtOrExpr>
inline bool stmt_or_expr_uses_var(const StmtOrExpr &e, const std::string &v,
                                  const Scope<Expr> &s = Scope<Expr>::empty_scope()) {
    Scope<> vars;
    vars.push(v);
    return stmt_or_expr_uses_vars<StmtOrExpr, void>(e, vars, s);
}

/** Test if an expression references or defines the given variable,
 *  additionally considering variables bound to Expr's in the scope
 *  provided in the final argument.
 */
inline bool expr_uses_var(const Expr &e, const std::string &v,
                          const Scope<Expr> &s = Scope<Expr>::empty_scope()) {
    return stmt_or_expr_uses_var(e, v, s);
}

/** Test if a statement references or defines the given variable,
 *  additionally considering variables bound to Expr's in the scope
 *  provided in the final argument.
 */
inline bool stmt_uses_var(const Stmt &stmt, const std::string &v,
                          const Scope<Expr> &s = Scope<Expr>::empty_scope()) {
    return stmt_or_expr_uses_var(stmt, v, s);
}

/** Test if an expression references or defines any of the variables
 *  in a scope, additionally considering variables bound to Expr's in
 *  the scope provided in the final argument.
 */
template<typename T>
inline bool expr_uses_vars(const Expr &e, const Scope<T> &v,
                           const Scope<Expr> &s = Scope<Expr>::empty_scope()) {
    return stmt_or_expr_uses_vars(e, v, s);
}

/** Test if a statement references or defines any of the variables in
 *  a scope, additionally considering variables bound to Expr's in the
 *  scope provided in the final argument.
 */
template<typename T>
inline bool stmt_uses_vars(const Stmt &stmt, const Scope<T> &v,
                           const Scope<Expr> &s = Scope<Expr>::empty_scope()) {
    return stmt_or_expr_uses_vars(stmt, v, s);
}

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_EXTERN_H
#define HALIDE_EXTERN_H

/** \file
 *
 * Convenience macros that lift functions that take C types into
 * functions that take and return exprs, and call the original
 * function at runtime under the hood. See test/c_function.cpp for
 * example usage.
 */


#define _halide_check_arg_type(t, name, e, n)                                      \
    _halide_user_assert((e).type() == (t))                                         \
        << "Type mismatch for argument " << (n) << " to extern function " << #name \
        << ". Type expected is " << (t) << " but the argument " << (e)             \
        << " has type " << (e).type() << ".\n";

#define HalideExtern_1(rt, name, t1)                                                                             \
    Halide::Expr name(const Halide::Expr &a1) {                                                                  \
        _halide_check_arg_type(Halide::type_of<t1>(), name, a1, 1);                                              \
        return Halide::Internal::Call::make(Halide::type_of<rt>(), #name, {a1}, Halide::Internal::Call::Extern); \
    }

#define HalideExtern_2(rt, name, t1, t2)                                                                             \
    Halide::Expr name(const Halide::Expr &a1, const Halide::Expr &a2) {                                              \
        _halide_check_arg_type(Halide::type_of<t1>(), name, a1, 1);                                                  \
        _halide_check_arg_type(Halide::type_of<t2>(), name, a2, 2);                                                  \
        return Halide::Internal::Call::make(Halide::type_of<rt>(), #name, {a1, a2}, Halide::Internal::Call::Extern); \
    }

#define HalideExtern_3(rt, name, t1, t2, t3)                                                                             \
    Halide::Expr name(const Halide::Expr &a1, const Halide::Expr &a2, const Halide::Expr &a3) {                          \
        _halide_check_arg_type(Halide::type_of<t1>(), name, a1, 1);                                                      \
        _halide_check_arg_type(Halide::type_of<t2>(), name, a2, 2);                                                      \
        _halide_check_arg_type(Halide::type_of<t3>(), name, a3, 3);                                                      \
        return Halide::Internal::Call::make(Halide::type_of<rt>(), #name, {a1, a2, a3}, Halide::Internal::Call::Extern); \
    }

#define HalideExtern_4(rt, name, t1, t2, t3, t4)                                                                             \
    Halide::Expr name(const Halide::Expr &a1, const Halide::Expr &a2, const Halide::Expr &a3, const Halide::Expr &a4) {      \
        _halide_check_arg_type(Halide::type_of<t1>(), name, a1, 1);                                                          \
        _halide_check_arg_type(Halide::type_of<t2>(), name, a2, 2);                                                          \
        _halide_check_arg_type(Halide::type_of<t3>(), name, a3, 3);                                                          \
        _halide_check_arg_type(Halide::type_of<t4>(), name, a4, 4);                                                          \
        return Halide::Internal::Call::make(Halide::type_of<rt>(), #name, {a1, a2, a3, a4}, Halide::Internal::Call::Extern); \
    }

#define HalideExtern_5(rt, name, t1, t2, t3, t4, t5)                                                                                            \
    Halide::Expr name(const Halide::Expr &a1, const Halide::Expr &a2, const Halide::Expr &a3, const Halide::Expr &a4, const Halide::Expr &a5) { \
        _halide_check_arg_type(Halide::type_of<t1>(), name, a1, 1);                                                                             \
        _halide_check_arg_type(Halide::type_of<t2>(), name, a2, 2);                                                                             \
        _halide_check_arg_type(Halide::type_of<t3>(), name, a3, 3);                                                                             \
        _halide_check_arg_type(Halide::type_of<t4>(), name, a4, 4);                                                                             \
        _halide_check_arg_type(Halide::type_of<t5>(), name, a5, 5);                                                                             \
        return Halide::Internal::Call::make(Halide::type_of<rt>(), #name, {a1, a2, a3, a4, a5}, Halide::Internal::Call::Extern);                \
    }

#define HalidePureExtern_1(rt, name, t1)                                                                             \
    Halide::Expr name(const Halide::Expr &a1) {                                                                      \
        _halide_check_arg_type(Halide::type_of<t1>(), name, a1, 1);                                                  \
        return Halide::Internal::Call::make(Halide::type_of<rt>(), #name, {a1}, Halide::Internal::Call::PureExtern); \
    }

#define HalidePureExtern_2(rt, name, t1, t2)                                                                             \
    Halide::Expr name(const Halide::Expr &a1, const Halide::Expr &a2) {                                                  \
        _halide_check_arg_type(Halide::type_of<t1>(), name, a1, 1);                                                      \
        _halide_check_arg_type(Halide::type_of<t2>(), name, a2, 2);                                                      \
        return Halide::Internal::Call::make(Halide::type_of<rt>(), #name, {a1, a2}, Halide::Internal::Call::PureExtern); \
    }

#define HalidePureExtern_3(rt, name, t1, t2, t3)                                                                             \
    Halide::Expr name(const Halide::Expr &a1, const Halide::Expr &a2, const Halide::Expr &a3) {                              \
        _halide_check_arg_type(Halide::type_of<t1>(), name, a1, 1);                                                          \
        _halide_check_arg_type(Halide::type_of<t2>(), name, a2, 2);                                                          \
        _halide_check_arg_type(Halide::type_of<t3>(), name, a3, 3);                                                          \
        return Halide::Internal::Call::make(Halide::type_of<rt>(), #name, {a1, a2, a3}, Halide::Internal::Call::PureExtern); \
    }

#define HalidePureExtern_4(rt, name, t1, t2, t3, t4)                                                                             \
    Halide::Expr name(const Halide::Expr &a1, const Halide::Expr &a2, const Halide::Expr &a3, const Halide::Expr &a4) {          \
        _halide_check_arg_type(Halide::type_of<t1>(), name, a1, 1);                                                              \
        _halide_check_arg_type(Halide::type_of<t2>(), name, a2, 2);                                                              \
        _halide_check_arg_type(Halide::type_of<t3>(), name, a3, 3);                                                              \
        _halide_check_arg_type(Halide::type_of<t4>(), name, a4, 4);                                                              \
        return Halide::Internal::Call::make(Halide::type_of<rt>(), #name, {a1, a2, a3, a4}, Halide::Internal::Call::PureExtern); \
    }

#define HalidePureExtern_5(rt, name, t1, t2, t3, t4, t5)                                                                                        \
    Halide::Expr name(const Halide::Expr &a1, const Halide::Expr &a2, const Halide::Expr &a3, const Halide::Expr &a4, const Halide::Expr &a5) { \
        _halide_check_arg_type(Halide::type_of<t1>(), name, a1, 1);                                                                             \
        _halide_check_arg_type(Halide::type_of<t2>(), name, a2, 2);                                                                             \
        _halide_check_arg_type(Halide::type_of<t3>(), name, a3, 3);                                                                             \
        _halide_check_arg_type(Halide::type_of<t4>(), name, a4, 4);                                                                             \
        _halide_check_arg_type(Halide::type_of<t5>(), name, a5, 5);                                                                             \
        return Halide::Internal::Call::make(Halide::type_of<rt>(), #name, {a1, a2, a3, a4, a5}, Halide::Internal::Call::PureExtern);            \
    }
#endif
#ifndef HALIDE_EXTRACT_TILE_OPERATIONS_H
#define HALIDE_EXTRACT_TILE_OPERATIONS_H

/** \file
 * Defines the lowering pass that injects calls to tile intrinsics that support
 * AMX instructions.
 */


namespace Halide {
namespace Internal {

/** Rewrite any AMX tile operations that have been stored in the AMXTile memory
 * type as intrinsic calls, to be used in the X86 backend. */
Stmt extract_tile_operations(const Stmt &s);

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_FAST_INTEGER_DIVIDE_H
#define HALIDE_FAST_INTEGER_DIVIDE_H


namespace Halide {

/** Integer division by small values can be done exactly as multiplies
 * and shifts. This function does integer division for numerators of
 * various integer types (8, 16, 32 bit signed and unsigned)
 * numerators and uint8 denominators. The type of the result is the
 * type of the numerator. The unsigned version is faster than the
 * signed version, so cast the numerator to an unsigned int if you
 * know it's positive.
 *
 * If your divisor is compile-time constant, Halide performs a
 * slightly better optimization automatically, so there's no need to
 * use this function (but it won't hurt).
 *
 * This function vectorizes well on arm, and well on x86 for 16 and 8
 * bit vectors. For 32-bit vectors on x86 you're better off using
 * native integer division.
 *
 * Also, this routine treats division by zero as division by
 * 256. I.e. it interprets the uint8 divisor as a number from 1 to 256
 * inclusive.
 */
Expr fast_integer_divide(const Expr &numerator, const Expr &denominator);

/** A variant of the above which rounds towards zero instead of rounding towards
 * negative infinity. */
Expr fast_integer_divide_round_to_zero(const Expr &numerator, const Expr &denominator);

/** Use the fast integer division tables to implement a modulo
 * operation via the Euclidean identity: a%b = a - (a/b)*b
 */
Expr fast_integer_modulo(const Expr &numerator, const Expr &denominator);

}  // namespace Halide

#endif
#ifndef FIND_CALLS_H
#define FIND_CALLS_H

/** \file
 *
 * Defines analyses to extract the functions called a function.
 */

#include <map>
#include <string>
#include <vector>


namespace Halide {
namespace Internal {

class Function;

/** Construct a map from name to Function definition object for all Halide
 *  functions called directly in the definition of the Function f, including
 *  in update definitions, update index expressions, and RDom extents. This map
 *  _does not_ include the Function f, unless it is called recursively by
 *  itself.
 */
std::map<std::string, Function> find_direct_calls(const Function &f);

/** Construct a map from name to Function definition object for all Halide
 *  functions called directly in the definition of the Function f, or
 *  indirectly in those functions' definitions, recursively. This map always
 *  _includes_ the Function f.
 */
std::map<std::string, Function> find_transitive_calls(const Function &f);

/** Find all Functions transitively referenced by any Function in `funcs` and return
 * a map of them. */
std::map<std::string, Function> build_environment(const std::vector<Function> &funcs);

/** Returns the same Functions as build_environment, but returns a vector of
 * Functions instead, where the order is the order in which the Functions were
 * first encountered. This is stable to changes in the names of the Functions. */
std::vector<Function> called_funcs_in_order_found(const std::vector<Function> &funcs);

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_FIND_INTRINSICS_H
#define HALIDE_FIND_INTRINSICS_H

/** \file
 * Tools to replace common patterns with more readily recognizable intrinsics.
 */


namespace Halide {
namespace Internal {

/** Implement intrinsics with non-intrinsic using equivalents. */
Expr lower_widen_right_add(const Expr &a, const Expr &b);
Expr lower_widen_right_mul(const Expr &a, const Expr &b);
Expr lower_widen_right_sub(const Expr &a, const Expr &b);
Expr lower_widening_add(const Expr &a, const Expr &b);
Expr lower_widening_mul(const Expr &a, const Expr &b);
Expr lower_widening_sub(const Expr &a, const Expr &b);
Expr lower_widening_shift_left(const Expr &a, const Expr &b);
Expr lower_widening_shift_right(const Expr &a, const Expr &b);

Expr lower_rounding_shift_left(const Expr &a, const Expr &b);
Expr lower_rounding_shift_right(const Expr &a, const Expr &b);

Expr lower_saturating_add(const Expr &a, const Expr &b);
Expr lower_saturating_sub(const Expr &a, const Expr &b);
Expr lower_saturating_cast(const Type &t, const Expr &a);

Expr lower_halving_add(const Expr &a, const Expr &b);
Expr lower_halving_sub(const Expr &a, const Expr &b);
Expr lower_rounding_halving_add(const Expr &a, const Expr &b);
Expr lower_sorted_avg(const Expr &a, const Expr &b);

Expr lower_mul_shift_right(const Expr &a, const Expr &b, const Expr &q);
Expr lower_rounding_mul_shift_right(const Expr &a, const Expr &b, const Expr &q);

/** Replace one of the above ops with equivalent arithmetic. */
Expr lower_intrinsic(const Call *op);

/** Replace common arithmetic patterns with intrinsics. */
Stmt find_intrinsics(const Stmt &s);
Expr find_intrinsics(const Expr &e);

/** The reverse of find_intrinsics. */
Expr lower_intrinsics(const Expr &e);
Stmt lower_intrinsics(const Stmt &s);

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_FLATTEN_NESTED_RAMPS_H
#define HALIDE_FLATTEN_NESTED_RAMPS_H

/** \file
 * Defines the lowering pass that flattens nested ramps and broadcasts.
 * */


namespace Halide {
namespace Internal {

/** Take a statement/expression and replace nested ramps and broadcasts. */
Stmt flatten_nested_ramps(const Stmt &s);
Expr flatten_nested_ramps(const Expr &e);

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_FUSE_GPU_THREAD_LOOPS_H
#define HALIDE_FUSE_GPU_THREAD_LOOPS_H

/** \file
 * Defines the lowering pass that fuses and normalizes loops over gpu
 * threads to target CUDA, OpenCL, and Metal.
 */


namespace Halide {
namespace Internal {

/** Rewrite all GPU loops to have a min of zero. */
Stmt zero_gpu_loop_mins(const Stmt &s);

/** Converts Halide's GPGPU IR to the OpenCL/CUDA/Metal model. Within
 * every loop over gpu block indices, fuse the inner loops over thread
 * indices into a single loop (with predication to turn off
 * threads). Push if conditions between GPU blocks to the innermost GPU threads.
 * Also injects synchronization points as needed, and hoists
 * shared allocations at the block level out into a single shared
 * memory array, and heap allocations into a slice of a global pool
 * allocated outside the kernel. */
Stmt fuse_gpu_thread_loops(Stmt s);

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef FUZZ_FLOAT_STORES_H
#define FUZZ_FLOAT_STORES_H


/** \file
 * Defines a lowering pass that messes with floating point stores.
 */

namespace Halide {
namespace Internal {

/** On every store of a floating point value, mask off the
 * least-significant-bit of the mantissa. We've found that whether or
 * not this dramatically changes the output of a pipeline correlates
 * very well with whether or not a pipeline will produce very
 * different outputs on different architectures (e.g. with and without
 * FMA). It's also a useful way to detect bad tests, such as those
 * that expect exact floating point equality across platforms. */
Stmt fuzz_float_stores(const Stmt &s);

}  // namespace Internal
}  // namespace Halide

#endif
#ifndef HALIDE_GENERATOR_H_
#define HALIDE_GENERATOR_H_

/** \file
 *
 * Generator is a class used to encapsulate the building of Funcs in user
 * pipelines. A Generator is agnostic to JIT vs AOT compilation; it can be used for
 * either purpose, but is especially convenient to use for AOT compilation.
 *
 * A Generator explicitly declares the Inputs and Outputs associated for a given
 * pipeline, and (optionally) separates the code for constructing the outputs from the code from
 * scheduling them. For instance:
 *
 * \code
 *     class Blur : public Generator<Blur> {
 *     public:
 *         Input<Func> input{"input", UInt(16), 2};
 *         Output<Func> output{"output", UInt(16), 2};
 *         void generate() {
 *             blur_x(x, y) = (input(x, y) + input(x+1, y) + input(x+2, y))/3;
 *             blur_y(x, y) = (blur_x(x, y) + blur_x(x, y+1) + blur_x(x, y+2))/3;
 *             output(x, y) = blur(x, y);
 *         }
 *         void schedule() {
 *             blur_y.split(y, y, yi, 8).parallel(y).vectorize(x, 8);
 *             blur_x.store_at(blur_y, y).compute_at(blur_y, yi).vectorize(x, 8);
 *         }
 *     private:
 *         Var x, y, xi, yi;
 *         Func blur_x, blur_y;
 *     };
 * \endcode
 *
 * Halide can compile a Generator into the correct pipeline by introspecting these
 * values and constructing an appropriate signature based on them.
 *
 * A Generator provides implementations of two methods:
 *
 *   - generate(), which must fill in all Output Func(s); it may optionally also do scheduling
 *   if no schedule() method is present.
 *   - schedule(), which (if present) should contain all scheduling code.
 *
 * Inputs can be any C++ scalar type:
 *
 * \code
 *     Input<float> radius{"radius"};
 *     Input<int32_t> increment{"increment"};
 * \endcode
 *
 * An Input<Func> is (essentially) like an ImageParam, except that it may (or may
 * not) not be backed by an actual buffer, and thus has no defined extents.
 *
 * \code
 *     Input<Func> input{"input", Float(32), 2};
 * \endcode
 *
 * You can optionally make the type and/or dimensions of Input<Func> unspecified,
 * in which case the value is simply inferred from the actual Funcs passed to them.
 * Of course, if you specify an explicit Type or Dimension, we still require the
 * input Func to match, or a compilation error results.
 *
 * \code
 *     Input<Func> input{ "input", 3 };  // require 3-dimensional Func,
 *                                       // but leave Type unspecified
 * \endcode
 *
 * A Generator must explicitly list the output(s) it produces:
 *
 * \code
 *     Output<Func> output{"output", Float(32), 2};
 * \endcode
 *
 * You can specify an output that returns a Tuple by specifying a list of Types:
 *
 * \code
 *     class Tupler : Generator<Tupler> {
 *       Input<Func> input{"input", Int(32), 2};
 *       Output<Func> output{"output", {Float(32), UInt(8)}, 2};
 *       void generate() {
 *         Var x, y;
 *         Expr a = cast<float>(input(x, y));
 *         Expr b = cast<uint8_t>(input(x, y));
 *         output(x, y) = Tuple(a, b);
 *       }
 *     };
 * \endcode
 *
 * You can also specify Output<X> for any scalar type (except for Handle types);
 * this is merely syntactic sugar on top of a zero-dimensional Func, but can be
 * quite handy, especially when used with multiple outputs:
 *
 * \code
 *     Output<float> sum{"sum"};  // equivalent to Output<Func> {"sum", Float(32), 0}
 * \endcode
 *
 * As with Input<Func>, you can optionally make the type and/or dimensions of an
 * Output<Func> unspecified; any unspecified types must be resolved via an
 * implicit GeneratorParam in order to use top-level compilation.
 *
 * You can also declare an *array* of Input or Output, by using an array type
 * as the type parameter:
 *
 * \code
 *     // Takes exactly 3 images and outputs exactly 3 sums.
 *     class SumRowsAndColumns : Generator<SumRowsAndColumns> {
 *       Input<Func[3]> inputs{"inputs", Float(32), 2};
 *       Input<int32_t[2]> extents{"extents"};
 *       Output<Func[3]> sums{"sums", Float(32), 1};
 *       void generate() {
 *         assert(inputs.size() == sums.size());
 *         // assume all inputs are same extent
 *         Expr width = extent[0];
 *         Expr height = extent[1];
 *         for (size_t i = 0; i < inputs.size(); ++i) {
 *           RDom r(0, width, 0, height);
 *           sums[i]() = 0.f;
 *           sums[i]() += inputs[i](r.x, r.y);
 *          }
 *       }
 *     };
 * \endcode
 *
 * You can also leave array size unspecified, with some caveats:
 *   - For ahead-of-time compilation, Inputs must have a concrete size specified
 *     via a GeneratorParam at build time (e.g., pyramid.size=3)
 *   - For JIT compilation via a Stub, Inputs array sizes will be inferred
 *     from the vector passed.
 *   - For ahead-of-time compilation, Outputs may specify a concrete size
 *     via a GeneratorParam at build time (e.g., pyramid.size=3), or the
 *     size can be specified via a resize() method.
 *
 * \code
 *     class Pyramid : public Generator<Pyramid> {
 *     public:
 *         GeneratorParam<int32_t> levels{"levels", 10};
 *         Input<Func> input{ "input", Float(32), 2 };
 *         Output<Func[]> pyramid{ "pyramid", Float(32), 2 };
 *         void generate() {
 *             pyramid.resize(levels);
 *             pyramid[0](x, y) = input(x, y);
 *             for (int i = 1; i < pyramid.size(); i++) {
 *                 pyramid[i](x, y) = (pyramid[i-1](2*x, 2*y) +
 *                                    pyramid[i-1](2*x+1, 2*y) +
 *                                    pyramid[i-1](2*x, 2*y+1) +
 *                                    pyramid[i-1](2*x+1, 2*y+1))/4;
 *             }
 *         }
 *     };
 * \endcode
 *
 * A Generator can also be customized via compile-time parameters (GeneratorParams),
 * which affect code generation.
 *
 * GeneratorParams, Inputs, and Outputs are (by convention) always
 * public and always declared at the top of the Generator class, in the order
 *
 * \code
 *     GeneratorParam(s)
 *     Input<Func>(s)
 *     Input<non-Func>(s)
 *     Output<Func>(s)
 * \endcode
 *
 * Note that the Inputs and Outputs will appear in the C function call in the order
 * they are declared. All Input<Func> and Output<Func> are represented as halide_buffer_t;
 * all other Input<> are the appropriate C++ scalar type. (GeneratorParams are
 * always referenced by name, not position, so their order is irrelevant.)
 *
 * All Inputs and Outputs must have explicit names, and all such names must match
 * the regex [A-Za-z][A-Za-z_0-9]* (i.e., essentially a C/C++ variable name, with
 * some extra restrictions on underscore use). By convention, the name should match
 * the member-variable name.
 *
 * You can dynamically add Inputs and Outputs to your Generator via adding a
 * configure() method; if present, it will be called before generate(). It can
 * examine GeneratorParams but it may not examine predeclared Inputs or Outputs;
 * the only thing it should do is call add_input<>() and/or add_output<>(), or call
 * set_type()/set_dimensions()/set_array_size() on an Input or Output with an unspecified type.
 * Added inputs will be appended (in order) after predeclared Inputs but before
 * any Outputs; added outputs will be appended after predeclared Outputs.
 *
 * Note that the pointers returned by add_input() and add_output() are owned
 * by the Generator and will remain valid for the Generator's lifetime; user code
 * should not attempt to delete or free them.
 *
 * \code
 *     class MultiSum : public Generator<MultiSum> {
 *     public:
 *         GeneratorParam<int32_t> input_count{"input_count", 10};
 *         Output<Func> output{ "output", Float(32), 2 };
 *
 *         void configure() {
 *             for (int i = 0; i < input_count; ++i) {
 *                 extra_inputs.push_back(
 *                     add_input<Func>("input_" + std::to_string(i), Float(32), 2);
 *             }
 *         }
 *
 *         void generate() {
 *             Expr sum = 0.f;
 *             for (int i = 0; i < input_count; ++i) {
 *                 sum += (*extra_inputs)[i](x, y);
 *             }
 *             output(x, y) = sum;
 *         }
 *     private:
 *         std::vector<Input<Func>* extra_inputs;
 *     };
 * \endcode
 *
 *  All Generators have two GeneratorParams that are implicitly provided
 *  by the base class:
 *
 *      GeneratorParam<Target> target{"target", Target()};
 *      GeneratorParam<AutoschedulerParams> autoscheduler{"autoscheduler", {}}
 *
 *  - 'target' is th