Tensorize vLLM Model
Source vllm-project/vllm.
1import argparse
2import dataclasses
3import json
4import os
5import uuid
6
7from vllm import LLM
8from vllm.engine.arg_utils import EngineArgs
9from vllm.model_executor.model_loader.tensorizer import (TensorizerArgs,
10 TensorizerConfig,
11 tensorize_vllm_model)
12
13# yapf conflicts with isort for this docstring
14# yapf: disable
15"""
tensorize_vllm_model.py is a script that can be used to serialize and
deserialize vLLM models. Serialized models can be loaded onto the GPU
extremely quickly with tensorizer, whether from an HTTP/HTTPS endpoint, an
S3 endpoint, or a local path. Tensor encryption and decryption are also
supported, although libsodium must be installed to use them. Install vllm
with tensorizer support using `pip install vllm[tensorizer]`. To learn more
about tensorizer, visit https://github.com/coreweave/tensorizer
23
24To serialize a model, install vLLM from source, then run something
25like this from the root level of this repository:
26
27python -m examples.tensorize_vllm_model \
28 --model facebook/opt-125m \
29 serialize \
30 --serialized-directory s3://my-bucket \
31 --suffix v1
32
33Which downloads the model from HuggingFace, loads it into vLLM, serializes it,
34and saves it to your S3 bucket. A local directory can also be used. This
35assumes your S3 credentials are specified as environment variables
36in the form of `S3_ACCESS_KEY_ID`, `S3_SECRET_ACCESS_KEY`, and
37`S3_ENDPOINT_URL`. To provide S3 credentials directly, you can provide
38`--s3-access-key-id` and `--s3-secret-access-key`, as well as `--s3-endpoint`
39as CLI args to this script.
40
41You can also encrypt the model weights with a randomly-generated key by
42providing a `--keyfile` argument.
43
44To deserialize a model, you can run something like this from the root
45level of this repository:
46
47python -m examples.tensorize_vllm_model \
48 --model EleutherAI/gpt-j-6B \
49 --dtype float16 \
50 deserialize \
51 --path-to-tensors s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors
52
53Which downloads the model tensors from your S3 bucket and deserializes them.
54
55You can also provide a `--keyfile` argument to decrypt the model weights if
56they were serialized with encryption.
57
58To support distributed tensor-parallel models, each model shard will be
59serialized to a separate file. The tensorizer_uri is then specified as a string
60template with a format specifier such as '%03d' that will be rendered with the
61shard's rank. Sharded models serialized with this script will be named as
62model-rank-%03d.tensors
63
64For more information on the available arguments for serializing, run
65`python -m examples.tensorize_vllm_model serialize --help`.
66
67Or for deserializing:
68
69`python -m examples.tensorize_vllm_model deserialize --help`.
70
71Once a model is serialized, tensorizer can be invoked with the `LLM` class
72directly to load models:
73
74 llm = LLM(model="facebook/opt-125m",
75 load_format="tensorizer",
76 model_loader_extra_config=TensorizerConfig(
77 tensorizer_uri = path_to_tensors,
78 num_readers=3,
79 )
80 )
81
82A serialized model can be used during model loading for the vLLM OpenAI
83inference server. `model_loader_extra_config` is exposed as the CLI arg
84`--model-loader-extra-config`, and accepts a JSON string literal of the
85TensorizerConfig arguments desired.
86
87In order to see all of the available arguments usable to configure
88loading with tensorizer that are given to `TensorizerConfig`, run:
89
90`python -m examples.tensorize_vllm_model deserialize --help`
91
92under the `tensorizer options` section. These can also be used for
93deserialization in this example script, although `--tensorizer-uri` and
94`--path-to-tensors` are functionally the same in this case.
95"""
96
97
def parse_args():
    """Build the command-line interface and parse the process arguments.

    The top-level parser carries every option registered by
    ``EngineArgs.add_cli_args`` and exposes two sub-commands whose name is
    stored in the ``command`` attribute of the result:

    * ``serialize`` -- takes ``--serialized-directory`` (required),
      ``--suffix`` and ``--keyfile``.
    * ``deserialize`` -- takes ``--path-to-tensors`` (required),
      ``--keyfile`` and the options registered by
      ``TensorizerArgs.add_cli_args``.

    Returns:
        The namespace produced by ``ArgumentParser.parse_args()``.
    """
    description = ("An example script that can be used to serialize and "
                   "deserialize vLLM models. These models "
                   "can be loaded using tensorizer directly to the GPU "
                   "extremely quickly. Tensor encryption and decryption is "
                   "also supported, although libsodium must be installed to "
                   "use it.")
    root = EngineArgs.add_cli_args(
        argparse.ArgumentParser(description=description))
    commands = root.add_subparsers(dest='command')

    # ---- `serialize` sub-command -------------------------------------
    suffix_help = (
        "The suffix to append to the serialized model directory, which is "
        "used to construct the location of the serialized model tensors, "
        "e.g. if `--serialized-directory` is `s3://my-bucket/` and "
        "`--suffix` is `v1`, the serialized model tensors will be "
        "saved to "
        "`s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors`. "
        "If none is provided, a random UUID will be used.")
    directory_help = (
        "The directory to serialize the model to. "
        "This can be a local directory or S3 URI. The path to where the "
        "tensors are saved is a combination of the supplied `dir` and model "
        "reference ID. For instance, if `dir` is the serialized directory, "
        "and the model HuggingFace ID is `EleutherAI/gpt-j-6B`, tensors will "
        "be saved to `dir/vllm/EleutherAI/gpt-j-6B/suffix/model.tensors`, "
        "where `suffix` is given by `--suffix` or a random UUID if not "
        "provided.")
    encrypt_key_help = (
        "Encrypt the model weights with a randomly-generated binary key,"
        " and save the key at this path")

    serialize_cmd = commands.add_parser(
        'serialize', help="Serialize a model to `--serialized-directory`")
    serialize_cmd.add_argument("--suffix",
                               type=str,
                               required=False,
                               help=suffix_help)
    serialize_cmd.add_argument("--serialized-directory",
                               type=str,
                               required=True,
                               help=directory_help)
    serialize_cmd.add_argument("--keyfile",
                               type=str,
                               required=False,
                               help=encrypt_key_help)

    # ---- `deserialize` sub-command -----------------------------------
    deserialize_cmd_help = ("Deserialize a model from `--path-to-tensors`"
                            " to verify it can be loaded and used.")
    tensors_help = (
        "The local path or S3 URI to the model tensors to deserialize. ")
    decrypt_key_help = (
        "Path to a binary key to use to decrypt the model weights,"
        " if the model was serialized with encryption")

    deserialize_cmd = commands.add_parser('deserialize',
                                          help=deserialize_cmd_help)
    deserialize_cmd.add_argument("--path-to-tensors",
                                 type=str,
                                 required=True,
                                 help=tensors_help)
    deserialize_cmd.add_argument("--keyfile",
                                 type=str,
                                 required=False,
                                 help=decrypt_key_help)

    # Tensorizer-specific options are registered on `deserialize` only.
    TensorizerArgs.add_cli_args(deserialize_cmd)

    return root.parse_args()
165
166
167
def deserialize():
    """Construct an ``LLM`` that loads its weights via the tensorizer format.

    This function takes no parameters: it reads the module-level names
    ``args`` (for ``model`` and ``tensor_parallel_size``) and
    ``tensorizer_config``, so both must be assigned before it is called.

    Returns:
        The constructed ``LLM`` instance.
    """
    llm_kwargs = {
        "model": args.model,
        "load_format": "tensorizer",
        "tensor_parallel_size": args.tensor_parallel_size,
        "model_loader_extra_config": tensorizer_config,
    }
    return LLM(**llm_kwargs)
175
176
def build_serialized_model_path(serialized_directory, model_ref, suffix,
                                tensor_parallel_size):
    """Return the URI/path the serialized tensors are written to.

    Args:
        serialized_directory: Local directory or S3 URI given on the CLI;
            trailing slashes are ignored.
        model_ref: Model reference used as a path component (may or may not
            contain a ``/``).
        suffix: Final directory component (version tag or random UUID).
        tensor_parallel_size: When greater than 1, a per-rank filename
            template containing ``%03d`` is returned instead of a single
            ``model.tensors`` file.

    Returns:
        ``<dir>/vllm/<model_ref>/<suffix>/model.tensors`` or, for sharded
        models, ``<dir>/vllm/<model_ref>/<suffix>/model-rank-%03d.tensors``.
    """
    root_dir = serialized_directory.rstrip('/')
    base_path = f"{root_dir}/vllm/{model_ref}/{suffix}"
    if tensor_parallel_size > 1:
        return f"{base_path}/model-rank-%03d.tensors"
    return f"{base_path}/model.tensors"


def build_deserialize_config_kwargs(path_to_tensors, keyfile, credentials,
                                    extra_config=None):
    """Assemble the keyword arguments for the deserialization config.

    Args:
        path_to_tensors: Value of ``--path-to-tensors``; always used as
            ``tensorizer_uri`` regardless of what ``extra_config`` contains.
        keyfile: Path to the decryption key, or ``None``.
        credentials: Mapping with the ``s3_access_key_id``,
            ``s3_secret_access_key`` and ``s3_endpoint`` entries.
        extra_config: Optional ``--model-loader-extra-config`` value, either
            a JSON object string or an already-decoded mapping. Its entries
            override ``keyfile`` and ``credentials``.

    Returns:
        A new ``dict`` of keyword arguments.
    """
    kwargs = {"encryption_keyfile": keyfile, **credentials}
    if extra_config:
        if isinstance(extra_config, str):
            extra_config = json.loads(extra_config)
        kwargs.update(extra_config)
    # The explicit CLI path always wins over any URI in the extra config.
    kwargs["tensorizer_uri"] = path_to_tensors
    return kwargs


if __name__ == '__main__':
    # NOTE: `args` and `tensorizer_config` are deliberately module-level
    # names; `deserialize()` reads them as globals.
    args = parse_args()

    # Validate the sub-command first: attributes such as `keyfile` only
    # exist on the namespace once a sub-command has been selected.
    if args.command not in ("serialize", "deserialize"):
        raise ValueError("Either serialize or deserialize must be specified.")

    # CLI-supplied S3 settings take precedence over environment variables.
    s3_access_key_id = (getattr(args, 's3_access_key_id', None)
                        or os.environ.get("S3_ACCESS_KEY_ID", None))
    s3_secret_access_key = (getattr(args, 's3_secret_access_key', None)
                            or os.environ.get("S3_SECRET_ACCESS_KEY", None))
    s3_endpoint = (getattr(args, 's3_endpoint', None)
                   or os.environ.get("S3_ENDPOINT_URL", None))

    credentials = {
        "s3_access_key_id": s3_access_key_id,
        "s3_secret_access_key": s3_secret_access_key,
        "s3_endpoint": s3_endpoint
    }

    model_ref = args.model

    keyfile = args.keyfile or None

    if args.command == "serialize":
        # Keep only the attributes that are EngineArgs fields.
        eng_args_dict = {f.name: getattr(args, f.name) for f in
                         dataclasses.fields(EngineArgs)}

        engine_args = EngineArgs.from_cli_args(
            argparse.Namespace(**eng_args_dict)
        )

        suffix = args.suffix if args.suffix else uuid.uuid4().hex
        model_path = build_serialized_model_path(
            args.serialized_directory, model_ref, suffix,
            engine_args.tensor_parallel_size)

        tensorizer_config = TensorizerConfig(
            tensorizer_uri=model_path,
            encryption_keyfile=keyfile,
            **credentials)

        tensorize_vllm_model(engine_args, tensorizer_config)

    else:  # "deserialize"
        # Always build a config here, including when
        # `--model-loader-extra-config` is supplied.
        tensorizer_config = TensorizerConfig(
            **build_deserialize_config_kwargs(
                args.path_to_tensors, keyfile, credentials,
                args.model_loader_extra_config))
        deserialize()