diff options
author | Kawrakow <48489457+ikawrakow@users.noreply.github.com> | 2024-07-27 07:55:01 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-07-27 07:55:01 +0200 |
commit | 154e0d75fccf1784fe9ff6fd76a630b66563da3d (patch) | |
tree | 81ce6dbb5b1900c1aa78a879f0593c694cab9d27 /examples/server_embd.py | |
parent | 0684c3e9c70d49323b4fc517128cbe222cab7f96 (diff) |
Merge mainline llama.cpp (#3)
* Merging mainline - WIP
* Merging mainline - WIP
AVX2 and CUDA appear to work.
CUDA performance seems slightly (~1-2%) lower as it is so often
the case with llama.cpp/ggml after some "improvements" have been made.
* Merging mainline - fix Metal
* Remove check
---------
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Diffstat (limited to 'examples/server_embd.py')
-rw-r--r-- | examples/server_embd.py | 35 |
1 files changed, 35 insertions, 0 deletions
diff --git a/examples/server_embd.py b/examples/server_embd.py new file mode 100644 index 00000000..0e34c6ce --- /dev/null +++ b/examples/server_embd.py @@ -0,0 +1,35 @@ +import asyncio +import asyncio.threads +import requests +import numpy as np + + +n = 8 + +result = [] + +async def requests_post_async(*args, **kwargs): + return await asyncio.threads.to_thread(requests.post, *args, **kwargs) + +async def main(): + model_url = "http://127.0.0.1:6900" + responses: list[requests.Response] = await asyncio.gather(*[requests_post_async( + url= f"{model_url}/embedding", + json= {"content": str(0)*1024} + ) for i in range(n)]) + + for response in responses: + embedding = response.json()["embedding"] + print(embedding[-8:]) + result.append(embedding) + +asyncio.run(main()) + +# compute cosine similarity + +for i in range(n-1): + for j in range(i+1, n): + embedding1 = np.array(result[i]) + embedding2 = np.array(result[j]) + similarity = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2)) + print(f"Similarity between {i} and {j}: {similarity:.2f}") |