summaryrefslogtreecommitdiff
path: root/ggml-impl.h
diff options
context:
space:
mode:
authorKawrakow <48489457+ikawrakow@users.noreply.github.com>2024-07-27 07:55:01 +0200
committerGitHub <noreply@github.com>2024-07-27 07:55:01 +0200
commit154e0d75fccf1784fe9ff6fd76a630b66563da3d (patch)
tree81ce6dbb5b1900c1aa78a879f0593c694cab9d27 /ggml-impl.h
parent0684c3e9c70d49323b4fc517128cbe222cab7f96 (diff)
Merge mainline llama.cpp (#3)
* Merging mainline - WIP
* Merging mainline - WIP
  AVX2 and CUDA appear to work. CUDA performance seems slightly (~1-2%) lower, as is so often the case with llama.cpp/ggml after some "improvements" have been made.
* Merging mainline - fix Metal
* Remove check
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'ggml-impl.h')
-rw-r--r--ggml-impl.h651
1 files changed, 0 insertions, 651 deletions
diff --git a/ggml-impl.h b/ggml-impl.h
deleted file mode 100644
index 1d233619..00000000
--- a/ggml-impl.h
+++ /dev/null
@@ -1,651 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-
-// GGML internal header
-
-#include <assert.h>
-#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
-#include <stddef.h>
-#include <stdbool.h>
-#include <string.h> // memcpy
-#include <math.h> // fabsf
-
-#undef MIN
-#undef MAX
-/* Two-argument minimum / maximum. Each macro expands its arguments more than once, so do not pass expressions with side effects. */
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
-#if defined(_MSC_VER)
-/* Under MSVC the m512bh / m512i helpers pass their argument through unchanged. */
-#define m512bh(p) p
-#define m512i(p) p
-
-#else
-/* Other compilers apply an explicit cast to __m512bh / __m512i. */
-#define m512bh(p) (__m512bh)(p)
-#define m512i(p) (__m512i)(p)
-
-#endif
-
-/**
- * Converts brain16 to float32.
- *
- * The bfloat16 floating point format has the following structure:
- *
- * ┌sign
- * │
- * │ ┌exponent
- * │ │
- * │ │ ┌mantissa
- * │ │ │
- * │┌──┴───┐┌─┴───┐
- * 0b0000000000000000 brain16
- *
- * Since bf16 has the same number of exponent bits as a 32bit float,
- * encoding and decoding numbers becomes relatively straightforward.
- *
- * ┌sign
- * │
- * │ ┌exponent
- * │ │
- * │ │ ┌mantissa
- * │ │ │
- * │┌──┴───┐┌─┴───────────────────┐
- * 0b00000000000000000000000000000000 IEEE binary32
- *
- * For comparison, the standard fp16 format has fewer exponent bits.
- *
- * ┌sign
- * │
- * │ ┌exponent
- * │ │
- * │ │ ┌mantissa
- * │ │ │
- * │┌─┴─┐┌─┴──────┐
- * 0b0000000000000000 IEEE binary16
- *
- * @see IEEE 754-2008
- */
-static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
- union { /* lets the 32-bit pattern built below be read back as a float */
- float f;
- uint32_t i;
- } u;
- u.i = (uint32_t)h.bits << 16; /* bf16 bits become the upper half of the binary32 pattern; the lower 16 bits are zero */
- return u.f;
-}
-
-/**
- * Converts float32 to brain16.
- *
- * This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
- * Subnormals shall be flushed to zero, and NANs will be quiet.
- * This code should vectorize nicely if using modern compilers.
- *
- * Three cases are handled in order:
- *   1. NaN: the upper 16 bits are kept and bit 6 (the top bf16 mantissa bit) is set.
- *   2. Exponent field all zero (zero or subnormal): only the sign bit is kept.
- *   3. Everything else: the value is rounded to the upper 16 bits, ties to even.
- *
- * @param s binary32 value to convert
- * @return the bf16 value
- */
-static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
- ggml_bf16_t h;
- union {
- float f;
- uint32_t i;
- } u;
- u.f = s;
- if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan: magnitude bits exceed the infinity pattern */
- h.bits = (u.i >> 16) | 64; /* force to quiet */
- return h;
- }
- if (!(u.i & 0x7f800000)) { /* subnormal (or zero): exponent field is all zero */
- h.bits = (u.i & 0x80000000) >> 16; /* flush to zero, keeping only the sign */
- return h;
- }
- h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16; /* round to nearest even: add 0x7fff plus the lowest bit that is kept, then drop the low 16 bits */
- return h;
-}
-/* Macro aliases for the bf16 <-> fp32 conversion helpers defined above. */
-#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
-#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// static_assert should be a #define, but if it's not,
-// fall back to the _Static_assert C11 keyword.
-// if C99 - static_assert is noop
-// ref: https://stackoverflow.com/a/53923785/4039976
-#ifndef __cplusplus
-#ifndef static_assert
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
-#define static_assert(cond, msg) _Static_assert(cond, msg)
-#else
-#define static_assert(cond, msg) struct global_scope_noop_trick
-#endif
-#endif
-#endif
-
-// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
-#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
-#ifndef __FMA__
-#define __FMA__
-#endif
-#ifndef __F16C__
-#define __F16C__
-#endif
-#endif
-
-// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
-#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
-#ifndef __SSE3__
-#define __SSE3__
-#endif
-#ifndef __SSSE3__
-#define __SSSE3__
-#endif
-#endif
-
-#if defined(__ARM_FEATURE_SVE)
-#include <arm_sve.h>
-#endif
-
-// 16-bit float
-// on Arm, we use __fp16
-// on x86, we use uint16_t
-#if defined(__ARM_NEON)
-
-// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
-//
-// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
-//
-#include <arm_neon.h>
-
-#ifdef _MSC_VER
-/* MSVC: half-precision values are carried as plain uint16_t. */
-typedef uint16_t ggml_fp16_internal_t;
-/* Brace initializer with two 64-bit elements: w supplies the low and x the high 32 bits of the first, y and z likewise for the second. The halves are combined with +, so w and y are expected to fit in 32 bits. */
-#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
-
-#else
-/* Other compilers: half-precision values use the __fp16 type. */
-typedef __fp16 ggml_fp16_internal_t;
-/* Brace initializer listing the four values individually. */
-#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
-
-#endif // _MSC_VER
-
-#if !defined(__aarch64__)
-
-// 32-bit ARM compatibility
-
-// vaddvq_s16
-// vpaddq_s16
-// vpaddq_s32
-// vaddvq_s32
-// vaddvq_f32
-// vmaxvq_f32
-// vcvtnq_s32_f32
-// vzip1_u8
-// vzip2_u8
-/**
- * Stand-in for vaddvq_s16 on 32-bit ARM: adds all eight int16 lanes of v,
- * widening each lane to int32 before summing.
- */
-inline static int32_t vaddvq_s16(int16x8_t v) {
- return
- (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
- (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
- (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
- (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
-}
-/**
- * Stand-in for vpaddq_s16 on 32-bit ARM, built from two 64-bit pairwise adds:
- * the low four result lanes hold the sums of adjacent lane pairs of a,
- * the high four those of b.
- */
-inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
- int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a)); /* pair sums of a */
- int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b)); /* pair sums of b */
- return vcombine_s16(a0, b0);
-}
-/**
- * Stand-in for vpaddq_s32 on 32-bit ARM, built from two 64-bit pairwise adds:
- * the low two result lanes hold the sums of adjacent lane pairs of a,
- * the high two those of b.
- */
-inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
- int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a)); /* pair sums of a */
- int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b)); /* pair sums of b */
- return vcombine_s32(a0, b0);
-}
-/* Stand-in for vaddvq_s32 on 32-bit ARM: returns the sum of the four int32 lanes of v. */
-inline static int32_t vaddvq_s32(int32x4_t v) {
- return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
-}
-/* Stand-in for vaddvq_f32 on 32-bit ARM: returns the sum of the four float lanes of v, accumulated from lane 0 to lane 3. */
-inline static float vaddvq_f32(float32x4_t v) {
- return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
-}
-/* Stand-in for vmaxvq_f32 on 32-bit ARM: returns the largest of the four float lanes of v, reduced pairwise with the MAX macro. */
-inline static float vmaxvq_f32(float32x4_t v) {
- return
- MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
- MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
-}
-/**
- * Stand-in for vcvtnq_s32_f32 on 32-bit ARM: converts each of the four float
- * lanes of v to int32 through roundf().
- *
- * NOTE(review): roundf() rounds halfway cases away from zero, whereas the AArch64
- * vcvtnq_s32_f32 intrinsic rounds to nearest with ties to even, so inputs that lie
- * exactly on .5 can give different results on the two architectures - confirm
- * whether callers depend on this.
- */
-inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
- int32x4_t res;
-
- res[0] = roundf(vgetq_lane_f32(v, 0));
- res[1] = roundf(vgetq_lane_f32(v, 1));
- res[2] = roundf(vgetq_lane_f32(v, 2));
- res[3] = roundf(vgetq_lane_f32(v, 3));
-
- return res;
-}
-/* Stand-in for vzip1_u8 on 32-bit ARM: interleaves the low four bytes of a and b as a[0], b[0], a[1], b[1], ..., a[3], b[3]. */
-inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
- uint8x8_t res;
-
- res[0] = a[0]; res[1] = b[0];
- res[2] = a[1]; res[3] = b[1];
- res[4] = a[2]; res[5] = b[2];
- res[6] = a[3]; res[7] = b[3];
-
- return res;
-}
-/* Stand-in for vzip2_u8 on 32-bit ARM: interleaves the high four bytes of a and b as a[4], b[4], a[5], b[5], ..., a[7], b[7]. */
-inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
- uint8x8_t res;
-
- res[0] = a[4]; res[1] = b[4];
- res[2] = a[5]; res[3] = b[5];
- res[4] = a[6]; res[5] = b[6];
- res[6] = a[7]; res[7] = b[7];
-
- return res;
-}
-
-// vld1q_s16_x2
-// vld1q_u8_x2
-// vld1q_u8_x4
-// vld1q_s8_x2
-// vld1q_s8_x4
-// TODO: double-check these work correctly
-/* Pair of int16x8_t vectors (16 int16 values in total). */
-typedef struct ggml_int16x8x2_t {
- int16x8_t val[2];
-} ggml_int16x8x2_t;
-/* Loads 16 consecutive int16 values starting at ptr: val[0] from ptr + 0, val[1] from ptr + 8. */
-inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
- ggml_int16x8x2_t res;
-
- res.val[0] = vld1q_s16(ptr + 0);
- res.val[1] = vld1q_s16(ptr + 8);
-
- return res;
-}
-/* Pair of uint8x16_t vectors (32 bytes in total). */
-typedef struct ggml_uint8x16x2_t {
- uint8x16_t val[2];
-} ggml_uint8x16x2_t;
-/* Loads 32 consecutive bytes starting at ptr: val[0] from ptr + 0, val[1] from ptr + 16. */
-inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
- ggml_uint8x16x2_t res;
-
- res.val[0] = vld1q_u8(ptr + 0);
- res.val[1] = vld1q_u8(ptr + 16);
-
- return res;
-}
-/* Group of four uint8x16_t vectors (64 bytes in total). */
-typedef struct ggml_uint8x16x4_t {
- uint8x16_t val[4];
-} ggml_uint8x16x4_t;
-/* Loads 64 consecutive bytes starting at ptr, 16 bytes per element: val[k] from ptr + 16 * k. */
-inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
- ggml_uint8x16x4_t res;
-
- res.val[0] = vld1q_u8(ptr + 0);
- res.val[1] = vld1q_u8(ptr + 16);
- res.val[2] = vld1q_u8(ptr + 32);
- res.val[3] = vld1q_u8(ptr + 48);
-
- return res;
-}
-/* Pair of int8x16_t vectors (32 signed bytes in total). */
-typedef struct ggml_int8x16x2_t {
- int8x16_t val[2];
-} ggml_int8x16x2_t;
-/* Loads 32 consecutive signed bytes starting at ptr: val[0] from ptr + 0, val[1] from ptr + 16. */
-inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
- ggml_int8x16x2_t res;
-
- res.val[0] = vld1q_s8(ptr + 0);
- res.val[1] = vld1q_s8(ptr + 16);
-
- return res;
-}
-/* Group of four int8x16_t vectors (64 signed bytes in total). */
-typedef struct ggml_int8x16x4_t {
- int8x16_t val[4];
-} ggml_int8x16x4_t;
-/* Loads 64 consecutive signed bytes starting at ptr, 16 bytes per element: val[k] from ptr + 16 * k. */
-inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
- ggml_int8x16x4_t res;
-
- res.val[0] = vld1q_s8(ptr + 0);
- res.val[1] = vld1q_s8(ptr + 16);
- res.val[2] = vld1q_s8(ptr + 32);
- res.val[3] = vld1q_s8(ptr + 48);
-
- return res;
-}
-
-/*
- * Stand-in for vqtbl1q_s8 on 32-bit ARM: byte-wise table lookup, res[i] = a[b[i]]
- * for each of the 16 lanes.
- *
- * NOTE: not tested
- * NOTE(review): the AArch64 vqtbl1q instruction yields 0 for indices >= 16, while this
- * version indexes a[] directly without any range handling - confirm that callers only
- * pass indices in 0..15.
- */
-inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
- int8x16_t res;
-
- res[ 0] = a[b[ 0]];
- res[ 1] = a[b[ 1]];
- res[ 2] = a[b[ 2]];
- res[ 3] = a[b[ 3]];
- res[ 4] = a[b[ 4]];
- res[ 5] = a[b[ 5]];
- res[ 6] = a[b[ 6]];
- res[ 7] = a[b[ 7]];
- res[ 8] = a[b[ 8]];
- res[ 9] = a[b[ 9]];
- res[10] = a[b[10]];
- res[11] = a[b[11]];
- res[12] = a[b[12]];
- res[13] = a[b[13]];
- res[14] = a[b[14]];
- res[15] = a[b[15]];
-
- return res;
-}
-
-/*
- * Stand-in for vqtbl1q_u8 on 32-bit ARM: byte-wise table lookup, res[i] = a[b[i]]
- * for each of the 16 lanes.
- *
- * NOTE: not tested
- * NOTE(review): the AArch64 vqtbl1q instruction yields 0 for indices >= 16, while this
- * version indexes a[] directly without any range handling - confirm that callers only
- * pass indices in 0..15.
- */
-inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
- uint8x16_t res;
-
- res[ 0] = a[b[ 0]];
- res[ 1] = a[b[ 1]];
- res[ 2] = a[b[ 2]];
- res[ 3] = a[b[ 3]];
- res[ 4] = a[b[ 4]];
- res[ 5] = a[b[ 5]];
- res[ 6] = a[b[ 6]];
- res[ 7] = a[b[ 7]];
- res[ 8] = a[b[ 8]];
- res[ 9] = a[b[ 9]];
- res[10] = a[b[10]];
- res[11] = a[b[11]];
- res[12] = a[b[12]];
- res[13] = a[b[13]];
- res[14] = a[b[14]];
- res[15] = a[b[15]];
-
- return res;
-}
-
-#else
-/* AArch64: the ggml_-prefixed type and function names map directly onto the native NEON types and intrinsics. */
-#define ggml_int16x8x2_t int16x8x2_t
-#define ggml_uint8x16x2_t uint8x16x2_t
-#define ggml_uint8x16x4_t uint8x16x4_t
-#define ggml_int8x16x2_t int8x16x2_t
-#define ggml_int8x16x4_t int8x16x4_t
-
-#define ggml_vld1q_s16_x2 vld1q_s16_x2
-#define ggml_vld1q_u8_x2 vld1q_u8_x2
-#define ggml_vld1q_u8_x4 vld1q_u8_x4
-#define ggml_vld1q_s8_x2 vld1q_s8_x2
-#define ggml_vld1q_s8_x4 vld1q_s8_x4
-#define ggml_vqtbl1q_s8 vqtbl1q_s8
-#define ggml_vqtbl1q_u8 vqtbl1q_u8
-
-#endif // !defined(__aarch64__)
-
-#if !defined(__ARM_FEATURE_DOTPROD)
-/**
- * Fallback used when the dot-product extension is unavailable: multiplies the 16
- * int8 lanes of a and b into int16 products, pairwise-adds those into int32 lanes
- * and accumulates the result into acc.
- *
- * NOTE(review): output lane j here collects the products of input lanes 2j, 2j+1,
- * 8+2j and 8+2j+1, whereas the native vdotq_s32 groups four consecutive lanes
- * (4j..4j+3). Individual lanes therefore differ from the hardware instruction;
- * only the total over all four lanes is the same.
- */
-inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
- const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b)); /* products of lanes 0..7 */
- const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b)); /* products of lanes 8..15 */
-
- return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
-}
-
-#else
-/* With the dot-product extension the native intrinsic is used directly. */
-#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
-
-#endif // !defined(__ARM_FEATURE_DOTPROD)
-
-#endif // defined(__ARM_NEON)
-
-#if defined(__ARM_NEON) && !defined(_MSC_VER)
-/* NEON (non-MSVC) builds: fp16 <-> fp32 conversion goes through the __fp16-based helpers defined just below. */
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-/* Defining GGML_FP16_TO_FP32 here causes the table-lookup definition later in this header to be skipped for this configuration. */
-#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-/**
- * Converts a half-precision value to float: the bits of h are copied into a
- * ggml_fp16_internal_t (__fp16 in this non-MSVC NEON branch) and the widening
- * is done by the cast.
- */
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
- ggml_fp16_internal_t tmp;
- memcpy(&tmp, &h, sizeof(ggml_fp16_t)); /* bit copy, no value conversion */
- return (float)tmp;
-}
-/**
- * Converts a float to half precision: the narrowing is done by assigning f to a
- * ggml_fp16_internal_t (__fp16 in this non-MSVC NEON branch), whose bits are then
- * copied into the returned ggml_fp16_t.
- */
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
- ggml_fp16_t res;
- ggml_fp16_internal_t tmp = f; /* value conversion float -> __fp16 */
- memcpy(&res, &tmp, sizeof(ggml_fp16_t)); /* bit copy, no value conversion */
- return res;
-}
-
-#else
-
-#ifdef __wasm_simd128__
-#include <wasm_simd128.h>
-#else
-#ifdef __POWER9_VECTOR__
-#include <altivec.h>
-#undef bool
-#define bool _Bool
-#else
-#if defined(_MSC_VER) || defined(__MINGW32__)
-#include <intrin.h>
-#else
-#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
-#if !defined(__riscv)
-#include <immintrin.h>
-#endif
-#endif
-#endif
-#endif
-#endif
-
-#ifdef __riscv_v_intrinsic
-#include <riscv_vector.h>
-#endif
-
-#if defined(__loongarch64)
-#if defined(__loongarch_asx)
-#include <lasxintrin.h>
-#endif
-#if defined(__loongarch_sx)
-#include <lsxintrin.h>
-#endif
-#endif
-
-#if defined(__loongarch_asx)
-/* Overlay used to read the bit pattern of a float as an int32. */
-typedef union {
- int32_t i;
- float f;
-} ft_union;
-
-/* float type data load instructions: each helper hands the bit pattern of val to the integer word-replicate intrinsic and casts the resulting vector to the float vector type (presumably a splat of val across all lanes - confirm against the LSX/LASX intrinsic reference). NOTE(review): both helpers are static but not inline, and their names use the reserved double-underscore prefix. */
-static __m128 __lsx_vreplfr2vr_s(float val) {
- ft_union fi_tmpval = {.f = val};
- return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
-}
-
-static __m256 __lasx_xvreplfr2vr_s(float val) {
- ft_union fi_tmpval = {.f = val};
- return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
-}
-#endif
-
-#ifdef __F16C__
-/* F16C path: scalar fp16 <-> fp32 conversion through the x86 half-precision intrinsics. The literal 0 passed to _mm_cvtps_ph / _cvtss_sh is the rounding-control immediate (round to nearest even per the Intel Intrinsics Guide). MSVC builds use the packed intrinsics and take element 0. */
-#ifdef _MSC_VER
-#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
-#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
-#else
-#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
-#endif
-
-#elif defined(__POWER9_VECTOR__)
-/* POWER9 path: both the COMPUTE macros and the general GGML_FP16_TO_FP32 / GGML_FP32_TO_FP16 macros route to the inline-asm helpers defined just below, so the table lookup is not used on this target. */
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-/* the inline asm below is about 12% faster than the lookup method */
-#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
-#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
-/**
- * POWER9 fp16 -> fp32 conversion via inline asm: mtfprd moves h from a general-purpose
- * register into the temporary d, xscvhpdp converts the half-precision value in d to
- * double precision in place, and frsp rounds that to single precision into f.
- */
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
- register float f;
- register double d;
- __asm__(
- "mtfprd %0,%2\n"
- "xscvhpdp %0,%0\n"
- "frsp %1,%0\n" :
- /* temp */ "=d"(d),
- /* out */ "=f"(f):
- /* in */ "r"(h));
- return f;
-}
-/**
- * POWER9 fp32 -> fp16 conversion via inline asm: xscvdphp converts f to half precision
- * in the temporary d, and mffprd moves the result into the general-purpose register
- * backing r.
- */
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
- register double d;
- register ggml_fp16_t r;
- __asm__( /* xscvdphp can work on double or single precision */
- "xscvdphp %0,%2\n"
- "mffprd %1,%0\n" :
- /* temp */ "=d"(d),
- /* out */ "=r"(r):
- /* in */ "f"(f));
- return r;
-}
-
-#else
-
-// FP16 <-> FP32
-// ref: https://github.com/Maratyszcza/FP16
-/* Reinterprets the 32-bit pattern w as a float (type punning through a union, no numeric conversion). */
-static inline float fp32_from_bits(uint32_t w) {
- union {
- uint32_t as_bits;
- float as_value;
- } fp32;
- fp32.as_bits = w;
- return fp32.as_value;
-}
-/* Returns the 32-bit pattern of the float f (type punning through a union, no numeric conversion). */
-static inline uint32_t fp32_to_bits(float f) {
- union {
- float as_value;
- uint32_t as_bits;
- } fp32;
- fp32.as_value = f;
- return fp32.as_bits;
-}
-/**
- * Portable fp16 -> fp32 conversion using integer bit manipulation (see the FP16
- * library referenced above).
- *
- * Two candidate results are always computed - one valid when the half-precision
- * exponent field is non-zero, one valid when it is zero - and the final ternary
- * picks between them before the sign bit is OR-ed back in.
- *
- * @param h half-precision bit pattern
- * @return the corresponding float
- */
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
- const uint32_t w = (uint32_t) h << 16; /* half bits moved into the upper 16 bits */
- const uint32_t sign = w & UINT32_C(0x80000000);
- const uint32_t two_w = w + w; /* w << 1: the sign bit is shifted out, exponent and mantissa sit at the top */
-/* Non-zero exponent: shift exponent/mantissa into the binary32 field positions, add exp_offset to the exponent, then rescale by 2^-112. */
- const uint32_t exp_offset = UINT32_C(0xE0) << 23;
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
- const float exp_scale = 0x1.0p-112f;
-#else
- const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); /* 2^-112 written as a bit pattern for compilers without hex float literals */
-#endif
- const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
-/* Zero exponent: place the mantissa in the low bits of a float in [0.5, 1) (exponent field 126), then subtract 0.5 to leave mantissa * 2^-24. */
- const uint32_t magic_mask = UINT32_C(126) << 23;
- const float magic_bias = 0.5f;
- const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
-/* two_w < 2^27 exactly when the five exponent bits (bits 31..27 of two_w) are all zero. */
- const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
- const uint32_t result = sign |
- (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
- return fp32_from_bits(result);
-}
-/**
- * Portable fp32 -> fp16 conversion using float arithmetic plus integer bit
- * manipulation (see the FP16 library referenced above).
- *
- * Any NaN input (shl1_w > 0xFF000000) yields the pattern 0x7E00 combined with the
- * sign bit of the input.
- *
- * @param f value to convert
- * @return half-precision bit pattern
- */
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
- const float scale_to_inf = 0x1.0p+112f;
- const float scale_to_zero = 0x1.0p-110f;
-#else
- const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
- const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
-#endif
- float base = (fabsf(f) * scale_to_inf) * scale_to_zero; /* |f| * 2^112 * 2^-110; the first product overflows to infinity once |f| reaches 2^16 */
-
- const uint32_t w = fp32_to_bits(f);
- const uint32_t shl1_w = w + w; /* w << 1: the sign bit is shifted out */
- const uint32_t sign = w & UINT32_C(0x80000000);
- uint32_t bias = shl1_w & UINT32_C(0xFF000000); /* the eight exponent bits of f */
- if (bias < UINT32_C(0x71000000)) { /* clamp small exponents up to a fixed minimum */
- bias = UINT32_C(0x71000000);
- }
-
- base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; /* adding an exponent-dependent constant lets the float addition do the rounding */
- const uint32_t bits = fp32_to_bits(base);
- const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); /* five exponent bits at their fp16 position */
- const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); /* low 12 bits; these overlap exp_bits in bits 10..11 */
- const uint32_t nonsign = exp_bits + mantissa_bits; /* added rather than OR-ed, so the overlapping bits carry into the exponent */
- return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
-}
-/* Generic fallback: the COMPUTE macros route to the portable bit-manipulation helpers defined above. */
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-
-#endif // __F16C__
-
-#endif // defined(__ARM_NEON) && !defined(_MSC_VER)
-
-// precomputed f32 table for f16 (256 KB)
-// defined in ggml.c, initialized in ggml_init()
-extern float ggml_table_f32_f16[1 << 16];
-
-// On ARM NEON (non-MSVC) and POWER9 it is quicker to convert fp16 -> fp32 directly than to call
-// ggml_lookup_fp16_to_fp32, so GGML_FP16_TO_FP32 is already defined earlier in this header for those targets
-// (POWER9 also defines GGML_FP32_TO_FP16 there). Every other configuration uses the table lookup below.
-#if !defined(GGML_FP16_TO_FP32)
-inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
- uint16_t s;
- memcpy(&s, &f, sizeof(uint16_t)); /* take the raw 16-bit pattern of f as the table index */
- return ggml_table_f32_f16[s];
-}
-
-#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
-#endif
-/* Unless an earlier branch already provided it, fp32 -> fp16 uses the per-platform GGML_COMPUTE_FP32_TO_FP16 conversion. */
-#if !defined(GGML_FP32_TO_FP16)
-#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
-#endif
-/* Sentinel results for the hash-set functions below: (size_t)-1 and (size_t)-2, i.e. the two largest size_t values. */
-#define GGML_HASHTABLE_FULL ((size_t)-1)
-#define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)
-/* NOTE(review): presumably creates a hash set sized for `size` entries; the implementation is not visible in this header - confirm. */
-struct ggml_hash_set ggml_hash_set_new(size_t size);
-/* NOTE(review): presumably reports whether key is present in hash_set; the implementation is not visible in this header - confirm. */
-bool ggml_hash_contains (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
-
-// returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
-size_t ggml_hash_find (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
-
-// returns GGML_HASHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
-size_t ggml_hash_insert ( struct ggml_hash_set hash_set, struct ggml_tensor * key);
-
-// return index, asserts if table is full
-size_t ggml_hash_find_or_insert( struct ggml_hash_set hash_set, struct ggml_tensor * key);
-
-#ifdef __cplusplus
-}
-#endif