AQLM Example
Source vllm-project/vllm.
1from vllm import LLM, SamplingParams
2from vllm.utils import FlexibleArgumentParser
3
4
def main():
    """Run a short text generation with an AQLM example model.

    Command-line options:
        --model / -m: model path, as for HF. When given, it takes
            precedence over ``--choice``.
        --choice / -c: index into the built-in list of known good models
            (default 0). Only consulted when ``--model`` is omitted.
        --tensor-parallel-size / -t: forwarded to ``LLM`` as
            ``tensor_parallel_size`` (default 1).

    Generates up to 100 tokens at temperature 0 for the prompt
    "Hello my name is" and prints the text of the first completion.
    An out-of-range ``--choice`` is reported through the parser's
    ``error`` method instead of surfacing as an ``IndexError``.
    """
    models = [
        "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf",
        "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-2x8-hf",
        "ISTA-DASLab/Llama-2-13b-AQLM-2Bit-1x16-hf",
        "ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf",
        "BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf",
    ]

    parser = FlexibleArgumentParser(description='AQLM examples')

    parser.add_argument('--model',
                        '-m',
                        type=str,
                        default=None,
                        help='model path, as for HF')
    parser.add_argument('--choice',
                        '-c',
                        type=int,
                        default=0,
                        help='known good models by index, [0-4]')
    parser.add_argument('--tensor-parallel-size',
                        '-t',
                        type=int,
                        default=1,
                        help='tensor parallel size')

    args = parser.parse_args()

    if args.model is not None:
        model_name = args.model
    else:
        # Validate before indexing: a plain models[args.choice] raises
        # IndexError for large values and silently wraps around for
        # negative ones, picking a model the user did not ask for.
        if not 0 <= args.choice < len(models):
            parser.error(
                f"--choice must be in [0-{len(models) - 1}], "
                f"got {args.choice}")
        model_name = models[args.choice]

    model = LLM(model_name, tensor_parallel_size=args.tensor_parallel_size)

    sampling_params = SamplingParams(max_tokens=100, temperature=0)
    outputs = model.generate("Hello my name is",
                             sampling_params=sampling_params)
    print(outputs[0].outputs[0].text)
42
43
# Run the example only when this file is executed as a script.
if __name__ == "__main__":
    main()