Text Generation Inference (TGI)

Install

With Anaconda installed, get started by creating a conda environment, cloning the repository, installing protoc, and building from source:
conda create -n text-generation-inference python=3.9
conda activate text-generation-inference
git clone https://github.com/huggingface/text-generation-inference
cd text-generation-inference

# protobuf
PROTOC_ZIP=protoc-21.12-linux-x86_64.zip
curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP
unzip -o $PROTOC_ZIP -d /usr/local bin/protoc
unzip -o $PROTOC_ZIP -d /usr/local 'include/*'
rm -f $PROTOC_ZIP

# DISABLE_CUSTOM_KERNELS=True is needed for GPUs without custom-kernel support
BUILD_EXTENSIONS=True make install
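For GPUs that need the custom kernels disabled (per the note above), one option is to export the variable so it applies to both the build and any later launch in the same shell; a minimal sketch, with text-generation-launcher --help as a sanity check that the install succeeded:

# sketch: export the flag from the note above, then verify the install
export DISABLE_CUSTOM_KERNELS=True
BUILD_EXTENSIONS=True make install
text-generation-launcher --help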
launch
port=8080
model=meta-llama/Llama-2-7b-chat-hf
quantize=gptq  # gptq, bitsandbytes, bitsandbytes-fp4, or bitsandbytes-nf4

text-generation-launcher --model-id $model --quantize $quantize --port $port

# or without quantization
text-generation-launcher --model-id $model --port $port

# Makefile shortcut
make run-falcon-7b-instruct

# for the quantized variant, install the extra dependencies first
pip install tokenizers transformers vllm exllamav2
make run-falcon-7b-instruct-quantize
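The launcher also takes flags for multi-GPU sharding and sequence-length limits; a hedged sketch (flag names as reported by text-generation-launcher --help on recent releases, so verify against your version):

# sketch: shard across 2 GPUs and cap request sizes
text-generation-launcher \
    --model-id $model \
    --port $port \
    --num-shard 2 \
    --max-input-length 1024 \
    --max-total-tokens 2048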
test
curl 127.0.0.1:8080/generate \
    -X POST \
    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20}}' \
    -H 'Content-Type: application/json'
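TGI also serves a streaming variant at /generate_stream that returns tokens as server-sent events; a sketch against the same server, adding common sampling parameters (do_sample, temperature):

# sketch: stream tokens instead of waiting for the full generation
curl 127.0.0.1:8080/generate_stream \
    -X POST \
    -d '{"inputs":"What is Deep Learning?","parameters":{"max_new_tokens":20,"do_sample":true,"temperature":0.7}}' \
    -H 'Content-Type: application/json'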