AQLM Example
Source: vllm-project/vllm.
1import argparse
2
3from vllm import LLM, SamplingParams
4
5
def main(argv=None):
    """Run a short text generation with an AQLM model through vLLM.

    Parses the command line, selects a model name (an explicit ``--model``
    takes precedence over the ``--choice`` index into the built-in list),
    constructs an ``LLM`` with the requested tensor parallel size, generates
    a completion for the fixed prompt ``"Hello my name is"`` with
    ``max_tokens=100`` and ``temperature=0``, and prints
    ``outputs[0].outputs[0].text``.

    Args:
        argv: Optional list of argument strings to parse instead of
            ``sys.argv[1:]``. ``None`` (the default) reads the real command
            line, which is what the script entry point does.

    Raises:
        SystemExit: Raised by ``argparse`` on invalid arguments, including a
            ``--choice`` value that is not a valid index into the built-in
            model list.
    """
    # Declared before the parser so that --choice can be validated against
    # the actual number of entries instead of a hard-coded range.
    models = [
        "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf",
        "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-2x8-hf",
        "ISTA-DASLab/Llama-2-13b-AQLM-2Bit-1x16-hf",
        "ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf",
        "BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf",
    ]

    parser = argparse.ArgumentParser(description='AQLM examples')

    parser.add_argument('--model',
                        '-m',
                        type=str,
                        default=None,
                        help='model path, as for HF')
    # `choices` makes argparse reject out-of-range indices with a usage error
    # rather than letting them reach the list lookup, where a too-large value
    # raised IndexError and a negative one silently picked a model from the
    # end of the list.
    parser.add_argument('--choice',
                        '-c',
                        type=int,
                        default=0,
                        choices=range(len(models)),
                        help='known good models by index, '
                        f'[0-{len(models) - 1}]')
    parser.add_argument('--tensor_parallel_size',
                        '-t',
                        type=int,
                        default=1,
                        help='tensor parallel size')

    args = parser.parse_args(argv)

    # An explicit --model always wins; --choice is only a fallback.
    model_name = args.model if args.model is not None else models[args.choice]
    model = LLM(model_name, tensor_parallel_size=args.tensor_parallel_size)

    sampling_params = SamplingParams(max_tokens=100, temperature=0)
    outputs = model.generate("Hello my name is",
                             sampling_params=sampling_params)
    print(outputs[0].outputs[0].text)
43
44
# Script entry point: run the example only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()