hooks : setting up flake8 and pre-commit hooks (#1681)

Small, non-functional changes were made to non-compliant files. These include breaking up long lines, whitespace cleanup, and removal of unused imports. The maximum line length in Python files was set to a generous 125 characters in order to minimize the number of changes needed in scripts and general annoyance. The "txt" prompts directory is excluded from the checks, as it may contain oddly formatted files and strings for a good reason.

Signed-off-by: Jiri Podivin <jpodivin@gmail.com>
author: Jiří Podivín <66251151+jpodivin@users.noreply.github.com> 2023-06-17 12:32:48 +0200
committer: GitHub <noreply@github.com> 2023-06-17 13:32:48 +0300
commit: 5ddf7ea1fb42bac21026de2f77e0f9c069b92234 (patch)
tree: 4a2a5f1ad07c6135b421903c64bbb39ceaa49e83
parent: bac19927c302737465a1deb14ac0943a221863e8 (diff)
5 files changed, 42 insertions, 12 deletions
diff --git a/.flake8 b/.flake8
new file mode 100644
index 00000000..113ca5fd
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,2 @@
+[flake8]
+max-line-length = 125
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 00000000..65796fe2
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,15 @@
+# See https://pre-commit.com for more information
+# See https://pre-commit.com/hooks.html for more hooks
+exclude: prompts/.*.txt
+repos:
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  rev: v3.2.0
+  hooks:
+  - id: trailing-whitespace
+  - id: end-of-file-fixer
+  - id: check-yaml
+  - id: check-added-large-files
+- repo: https://github.com/PyCQA/flake8
+  rev: 6.0.0
+  hooks:
+  -   id: flake8
diff --git a/convert.py b/convert.py
index ece5a026..265c41fa 100644
--- a/convert.py
+++ b/convert.py
@@ -512,7 +512,11 @@ class LazyTensor:
             if not isinstance(self.data_type, QuantizedDataType):
                 raise Exception(f"Can't turn an unquantized tensor into a quantized type ({data_type})")
             if self.data_type.have_g_idx:
-                sys.stderr.write("Error: Input uses the newer GPTQ-for-LLaMa format (using g_idx), which is not yet natively supported by GGML.  For now you can still convert this model by passing `--outtype f16` to dequantize, but that will result in a much larger output file for no quality benefit.\n")
+                sys.stderr.write(
+                    "Error: Input uses the newer GPTQ-for-LLaMa format (using g_idx), "
+                    "which is not yet natively supported by GGML. "
+                    "For now you can still convert this model by passing `--outtype f16` to dequantize, "
+                    "but that will result in a much larger output file for no quality benefit.\n")
                 sys.exit(1)
             assert not data_type.have_g_idx and self.data_type.have_addends and data_type.have_addends
 
@@ -694,8 +698,9 @@ class LazyUnpickler(pickle.Unpickler):
         description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
         return LazyStorage(load=load, kind=pid[1], description=description)
 
-   # @staticmethod
-    def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,  # pyright: ignore[reportSelfClsParameterName]
+    # @staticmethod
+    def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,
+                               # pyright: ignore[reportSelfClsParameterName]
                                requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
         assert isinstance(storage, LazyStorage)
 
@@ -812,7 +817,7 @@ def lazy_load_ggml_file(fp: io.BufferedReader, path: Path) -> ModelPlus:
     # Use mmap for the actual data to avoid race conditions with the file offset.
     off = fp.raw.tell()
     mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
-    fp.raw.seek(off) # needed on Windows
+    fp.raw.seek(off)  # needed on Windows
 
     def read_tensor() -> None:  # this is a function so that variables captured in `load` don't change
         shape_len, name_len, ftype = struct.unpack("iii", must_read(fp, 12))
@@ -1054,7 +1059,7 @@ def load_some_model(path: Path) -> ModelPlus:
         files = list(path.glob("model-00001-of-*.safetensors"))
         if not files:
             # Try the PyTorch patterns too, with lower priority
-            globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin" ]
+            globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
             files = [file for glob in globs for file in path.glob(glob)]
         if not files:
             # Try GGML too, but with lower priority, since if both a non-GGML
@@ -1094,7 +1099,9 @@ def load_vocab(path: Path) -> SentencePieceVocab:
         elif path3.exists():
             path = path3
         else:
-            raise FileNotFoundError(f"Could not find tokenizer.model in {path} or its parent; if it's in another directory, pass the directory as --vocab-dir")
+            raise FileNotFoundError(
+                f"Could not find tokenizer.model in {path} or its parent; "
+                "if it's in another directory, pass the directory as --vocab-dir")
     added_tokens_path = path.parent / "added_tokens.json"
     print(f"Loading vocab file {path}")
     return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
@@ -1110,7 +1117,9 @@ def default_outfile(model_paths: List[Path], params: Params) -> Path:
     }[params.file_type]
     ret = model_paths[0].parent / f"ggml-model-{namestr}.bin"
     if ret in model_paths:
-        sys.stderr.write(f"Error: Default output path ({ret}) would overwrite the input.  Please explicitly specify a path using --outfile.\n")
+        sys.stderr.write(
+            f"Error: Default output path ({ret}) would overwrite the input. "
+            "Please explicitly specify a path using --outfile.\n")
         sys.exit(1)
     return ret
 
@@ -1131,7 +1140,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
     parser.add_argument("--outtype", choices=["f32", "f16", "q4_1", "q4_0"], help="output format (default: based on input)")
     parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
     parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
-    parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
+    parser.add_argument("model", type=Path,
+                        help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
     args = parser.parse_args(args_in)
 
     vocab: Vocab
diff --git a/examples/jeopardy/graph.py b/examples/jeopardy/graph.py
index d00b2865..1b6c54bf 100644
--- a/examples/jeopardy/graph.py
+++ b/examples/jeopardy/graph.py
@@ -1,5 +1,5 @@
 import matplotlib.pyplot as plt
-import sys, os
+import os
 import csv
 
 labels = []
@@ -8,6 +8,7 @@ numEntries = 1
 
 rows = []
 
+
 def bar_chart(numbers, labels, pos):
     plt.bar(pos, numbers, color='blue')
     plt.xticks(ticks=pos, labels=labels)
@@ -16,6 +17,7 @@ def bar_chart(numbers, labels, pos):
     plt.ylabel("Questions Correct")
     plt.show()
 
+
 def calculatecorrect():
     directory = os.fsencode("./examples/jeopardy/results/")
     csv_reader = csv.reader(open("./examples/jeopardy/qasheet.csv", 'rt'), delimiter=',')
@@ -38,14 +40,13 @@ def calculatecorrect():
                     print(line)
                 else:
                     print("Correct answer: " + rows[i][2] + "\n")
-                    i+=1
+                    i += 1
                     print("Did the AI get the question right? (y/n)")
                     if input() == "y":
                         totalcorrect += 1
             numbers.append(totalcorrect)
 
 
-
 if __name__ == '__main__':
     calculatecorrect()
     pos = list(range(numEntries))
diff --git a/scripts/verify-checksum-models.py b/scripts/verify-checksum-models.py
index 2ce57282..d1274828 100644
--- a/scripts/verify-checksum-models.py
+++ b/scripts/verify-checksum-models.py
@@ -1,9 +1,10 @@
 import os
 import hashlib
 
+
 def sha256sum(file):
     block_size = 16 * 1024 * 1024  # 16 MB block size
-    b  = bytearray(block_size)
+    b = bytearray(block_size)
     file_hash = hashlib.sha256()
     mv = memoryview(b)
     with open(file, 'rb', buffering=0) as f:
@@ -15,6 +16,7 @@ def sha256sum(file):
 
     return file_hash.hexdigest()
 
+
 # Define the path to the llama directory (parent folder of script directory)
 llama_path = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
author	Jiří Podivín <66251151+jpodivin@users.noreply.github.com>	2023-06-17 12:32:48 +0200
committer	GitHub <noreply@github.com>	2023-06-17 13:32:48 +0300
commit	5ddf7ea1fb42bac21026de2f77e0f9c069b92234 (patch)
tree	4a2a5f1ad07c6135b421903c64bbb39ceaa49e83
parent	bac19927c302737465a1deb14ac0943a221863e8 (diff)