Tensorize vLLM Model

Source vllm-project/vllm.

  1import argparse
  2import dataclasses
  3import json
  4import os
  5import uuid
  6
  7from vllm import LLM
  8from vllm.engine.arg_utils import EngineArgs
  9from vllm.model_executor.model_loader.tensorizer import (TensorizerArgs,
 10                                                         TensorizerConfig,
 11                                                         tensorize_vllm_model)
 12from vllm.utils import FlexibleArgumentParser
 13
 14# yapf conflicts with isort for this docstring
 15# yapf: disable
 16"""
 17tensorize_vllm_model.py is a script that can be used to serialize and 
 18deserialize vLLM models. These models can be loaded using tensorizer 
 19to the GPU extremely quickly over an HTTP/HTTPS endpoint, an S3 endpoint,
 20or locally. Tensor encryption and decryption is also supported, although 
 21libsodium must be installed to use it. Install vllm with tensorizer support 
 22using `pip install vllm[tensorizer]`. To learn more about tensorizer, visit
 23https://github.com/coreweave/tensorizer
 24
 25To serialize a model, install vLLM from source, then run something 
 26like this from the root level of this repository:
 27
 28python -m examples.tensorize_vllm_model \
 29   --model facebook/opt-125m \
 30   serialize \
 31   --serialized-directory s3://my-bucket \
 32   --suffix v1
 33   
 34Which downloads the model from HuggingFace, loads it into vLLM, serializes it,
 35and saves it to your S3 bucket. A local directory can also be used. This
 36assumes your S3 credentials are specified as environment variables
 37in the form of `S3_ACCESS_KEY_ID`, `S3_SECRET_ACCESS_KEY`, and 
 38`S3_ENDPOINT_URL`. To provide S3 credentials directly, you can provide 
 39`--s3-access-key-id` and `--s3-secret-access-key`, as well as `--s3-endpoint` 
 40as CLI args to this script.
 41
 42You can also encrypt the model weights with a randomly-generated key by 
 43providing a `--keyfile` argument.
 44
 45To deserialize a model, you can run something like this from the root 
 46level of this repository:
 47
 48python -m examples.tensorize_vllm_model \
 49   --model EleutherAI/gpt-j-6B \
 50   --dtype float16 \
 51   deserialize \
 52   --path-to-tensors s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors
 53
 54Which downloads the model tensors from your S3 bucket and deserializes them.
 55
 56You can also provide a `--keyfile` argument to decrypt the model weights if 
 57they were serialized with encryption.
 58
 59To support distributed tensor-parallel models, each model shard will be
 60serialized to a separate file. The tensorizer_uri is then specified as a string
 61template with a format specifier such as '%03d' that will be rendered with the
 62shard's rank. Sharded models serialized with this script will be named as
 63model-rank-%03d.tensors
 64
 65For more information on the available arguments for serializing, run 
 66`python -m examples.tensorize_vllm_model serialize --help`.
 67
 68Or for deserializing:
 69
 70`python -m examples.tensorize_vllm_model deserialize --help`.
 71
 72Once a model is serialized, tensorizer can be invoked with the `LLM` class 
 73directly to load models:
 74
 75    llm = LLM(model="facebook/opt-125m",
 76              load_format="tensorizer",
 77              model_loader_extra_config=TensorizerConfig(
 78                    tensorizer_uri = path_to_tensors,
 79                    num_readers=3,
 80                    )
 81              )
 82            
 83A serialized model can be used during model loading for the vLLM OpenAI
 84inference server. `model_loader_extra_config` is exposed as the CLI arg
 85`--model-loader-extra-config`, and accepts a JSON string literal of the
 86TensorizerConfig arguments desired.
 87
 88In order to see all of the available arguments usable to configure 
 89loading with tensorizer that are given to `TensorizerConfig`, run:
 90
 91`python -m examples.tensorize_vllm_model deserialize --help`
 92
 93under the `tensorizer options` section. These can also be used for
 94deserialization in this example script, although `--tensorizer-uri` and
 95`--path-to-tensors` are functionally the same in this case.
 96"""
 97
 98
 99def parse_args():
100    parser = FlexibleArgumentParser(
101        description="An example script that can be used to serialize and "
102        "deserialize vLLM models. These models "
103        "can be loaded using tensorizer directly to the GPU "
104        "extremely quickly. Tensor encryption and decryption is "
105        "also supported, although libsodium must be installed to "
106        "use it.")
107    parser = EngineArgs.add_cli_args(parser)
108    subparsers = parser.add_subparsers(dest='command')
109
110    serialize_parser = subparsers.add_parser(
111        'serialize', help="Serialize a model to `--serialized-directory`")
112
113    serialize_parser.add_argument(
114        "--suffix",
115        type=str,
116        required=False,
117        help=(
118            "The suffix to append to the serialized model directory, which is "
119            "used to construct the location of the serialized model tensors, "
120            "e.g. if `--serialized-directory` is `s3://my-bucket/` and "
121            "`--suffix` is `v1`, the serialized model tensors will be "
122            "saved to "
123            "`s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors`. "
124            "If none is provided, a random UUID will be used."))
125    serialize_parser.add_argument(
126        "--serialized-directory",
127        type=str,
128        required=True,
129        help="The directory to serialize the model to. "
130        "This can be a local directory or S3 URI. The path to where the "
131        "tensors are saved is a combination of the supplied `dir` and model "
132        "reference ID. For instance, if `dir` is the serialized directory, "
133        "and the model HuggingFace ID is `EleutherAI/gpt-j-6B`, tensors will "
134        "be saved to `dir/vllm/EleutherAI/gpt-j-6B/suffix/model.tensors`, "
135        "where `suffix` is given by `--suffix` or a random UUID if not "
136        "provided.")
137
138    serialize_parser.add_argument(
139        "--keyfile",
140        type=str,
141        required=False,
142        help=("Encrypt the model weights with a randomly-generated binary key,"
143              " and save the key at this path"))
144
145    deserialize_parser = subparsers.add_parser(
146        'deserialize',
147        help=("Deserialize a model from `--path-to-tensors`"
148              " to verify it can be loaded and used."))
149
150    deserialize_parser.add_argument(
151        "--path-to-tensors",
152        type=str,
153        required=True,
154        help="The local path or S3 URI to the model tensors to deserialize. ")
155
156    deserialize_parser.add_argument(
157        "--keyfile",
158        type=str,
159        required=False,
160        help=("Path to a binary key to use to decrypt the model weights,"
161              " if the model was serialized with encryption"))
162
163    TensorizerArgs.add_cli_args(deserialize_parser)
164
165    return parser.parse_args()
166
167
168
def deserialize():
    """Construct an ``LLM`` that loads its weights via tensorizer.

    Takes no parameters: it reads the module-level names ``args`` (for
    ``model`` and ``tensor_parallel_size``) and ``tensorizer_config``, both
    of which must already be defined when this function is called.

    Returns:
        The constructed ``LLM`` instance.
    """
    loader_options = {
        "model": args.model,
        "load_format": "tensorizer",
        "tensor_parallel_size": args.tensor_parallel_size,
        "model_loader_extra_config": tensorizer_config,
    }
    return LLM(**loader_options)
176
177
if __name__ == '__main__':
    # Script entry point: parse the CLI, then either serialize the model
    # named by --model into --serialized-directory, or deserialize it from
    # --path-to-tensors to check that it loads.
    #
    # `args` and `tensorizer_config` are intentionally module-level names:
    # deserialize() reads both of them as globals.
    args = parse_args()

    # Reject a missing subcommand first. `keyfile`, `path_to_tensors`, etc.
    # only exist on the namespace once a subcommand has been chosen, so
    # touching them before this check would fail with AttributeError
    # instead of this message.
    if args.command not in ("serialize", "deserialize"):
        raise ValueError("Either serialize or deserialize must be specified.")

    # S3 settings: a CLI value wins when the attribute exists and is set
    # (getattr with a default, because not every subcommand defines these
    # options); otherwise fall back to the environment.
    credentials = {
        "s3_access_key_id": (getattr(args, 's3_access_key_id', None)
                             or os.environ.get("S3_ACCESS_KEY_ID", None)),
        "s3_secret_access_key": (getattr(args, 's3_secret_access_key', None)
                                 or os.environ.get("S3_SECRET_ACCESS_KEY",
                                                   None)),
        "s3_endpoint": (getattr(args, 's3_endpoint', None)
                        or os.environ.get("S3_ENDPOINT_URL", None)),
    }

    model_ref = args.model

    # Normalise an empty/absent --keyfile to None.
    keyfile = args.keyfile or None

    if args.command == "serialize":
        # Rebuild EngineArgs from just the fields it declares, dropping the
        # script-specific options that share the namespace.
        eng_args_dict = {
            f.name: getattr(args, f.name)
            for f in dataclasses.fields(EngineArgs)
        }
        engine_args = EngineArgs.from_cli_args(
            argparse.Namespace(**eng_args_dict))

        input_dir = args.serialized_directory.rstrip('/')
        suffix = args.suffix if args.suffix else uuid.uuid4().hex
        # The full model reference (including any "org/" prefix) is part of
        # the output path, so ids without a slash work as well.
        base_path = f"{input_dir}/vllm/{model_ref}/{suffix}"
        if engine_args.tensor_parallel_size > 1:
            # One file per shard; '%03d' is left in the URI as a template
            # for the shard's rank.
            model_path = f"{base_path}/model-rank-%03d.tensors"
        else:
            model_path = f"{base_path}/model.tensors"

        tensorizer_config = TensorizerConfig(
            tensorizer_uri=model_path,
            encryption_keyfile=keyfile,
            **credentials)

        tensorize_vllm_model(engine_args, tensorizer_config)

    else:  # args.command == "deserialize"
        # Start from the CLI/environment values, let an explicit
        # --model-loader-extra-config JSON object override them, and always
        # point the loader at --path-to-tensors. Building the config in both
        # cases guarantees deserialize() finds `tensorizer_config` defined.
        config_kwargs = {"encryption_keyfile": keyfile, **credentials}
        if args.model_loader_extra_config:
            config_kwargs.update(json.loads(args.model_loader_extra_config))
        config_kwargs["tensorizer_uri"] = args.path_to_tensors

        tensorizer_config = TensorizerConfig(**config_kwargs)
        deserialize()