Overview

Set up an OpenAI-compatible API server with llama-cpp-python and query a Llama 2 13B chat model (GGUF) over HTTP.

Ubuntu 20.04
Install the prerequisites (run as root, or prefix with sudo):

apt -y install nvidia-cudnn python3-pip python3-dev python3-venv gcc g++ cmake jq
Install llama-cpp-python with the server extra (quoted so the brackets are not expanded by the shell):

pip3 install 'llama-cpp-python[server]'
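The default pip wheel builds the CPU backend only, so the --n_gpu_layers -1 option used below has no effect with it. A minimal sketch of a CUDA-enabled rebuild, assuming the CUDA toolkit is already installed (LLAMA_CUBLAS applies to llama-cpp-python releases from the Llama 2 era; newer releases use GGML_CUDA instead):

# Rebuild llama-cpp-python from source with cuBLAS (CUDA) support
CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip3 install --force-reinstall --no-cache-dir 'llama-cpp-python[server]'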
Download the quantized Llama 2 13B chat model in GGUF format from Hugging Face:

wget https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve/main/llama-2-13b-chat.Q4_K_M.gguf
Start the server in the background; --n_gpu_layers -1 offloads all layers to the GPU:

python3 -m llama_cpp.server --model ./llama-2-13b-chat.Q4_K_M.gguf --n_gpu_layers -1 --host 0.0.0.0 --port 8000 &
The interactive API documentation (Swagger UI) is served at:

http://localhost:8000/docs
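Besides the interactive docs, a quick check that the server is up is to list the loaded model through the OpenAI-compatible /v1/models endpoint (assumed to be exposed by this llama_cpp.server version):

# Should return a model list containing llama-2-13b-chat.Q4_K_M.gguf
curl -s localhost:8000/v1/models | jq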
Send a chat completion request; jq pretty-prints the JSON and sed converts the escaped \n sequences into real line breaks:

curl -s -XPOST -H 'Content-Type: application/json' localhost:8000/v1/chat/completions -d '{"messages": [{"role": "user", "content": "Tell me about Hiroshima city, Japan."}]}' | jq | sed -e 's/\\n/\n/g'
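The endpoint also accepts the usual OpenAI-style sampling parameters. A sketch against the same server, limiting the reply length and streaming tokens as server-sent events (max_tokens, temperature and stream follow the OpenAI chat completion schema and are assumptions not shown in the original request):

# -N disables curl's output buffering so streamed chunks appear as they arrive
curl -s -N -XPOST -H 'Content-Type: application/json' localhost:8000/v1/chat/completions -d '{"messages": [{"role": "user", "content": "Tell me about Hiroshima city, Japan."}], "max_tokens": 256, "temperature": 0.7, "stream": true}'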