From dbf951df1594a3dec36eca9ab81a0f7ba81b11cd Mon Sep 17 00:00:00 2001 From: Kawrakow Date: Mon, 21 Oct 2024 12:16:54 +0200 Subject: Enable IQ4_NL for KV-cache in token generation using Flash Attention (#99) * Enable IQ4_NL for V-cache in token generation * We don't need these * Update printout of allowed quantized KV-cache combinations * Add IQ4_NL + IQ4_NL to FA This is a better alternative than Q4_0 + Q4_0 for the VRAM poor. * Remove file added by mistake * Fix typo, which is not really a bug --------- Co-authored-by: Iwan Kawrakow --- ggml/src/ggml-cuda/template-instances/generate_cu_files.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'ggml/src/ggml-cuda/template-instances/generate_cu_files.py') diff --git a/ggml/src/ggml-cuda/template-instances/generate_cu_files.py b/ggml/src/ggml-cuda/template-instances/generate_cu_files.py index d7874e6e..1186112e 100755 --- a/ggml/src/ggml-cuda/template-instances/generate_cu_files.py +++ b/ggml/src/ggml-cuda/template-instances/generate_cu_files.py @@ -3,7 +3,7 @@ from glob import glob import os -TYPES_KV = ["GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0", "GGML_TYPE_F16"] +TYPES_KV = ["GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0", "GGML_TYPE_IQ4_NL", "GGML_TYPE_F16"] SOURCE_FATTN_VEC = """// This file has been autogenerated by generate_cu_files.py, do not edit manually. -- cgit v1.2.3