Covo-Audio-Chat (Offline Inference)¶
Source https://github.com/vllm-project/vllm-omni/tree/main/examples/offline_inference/covo_audio.
Setup¶
Please refer to the stage configuration documentation to configure memory allocation appropriately for your hardware setup.
Note: Covo-Audio code2wav requires `torchdiffeq`. Install it with: `pip install torchdiffeq`
Run examples¶
Get into the example folder: `cd examples/offline_inference/covo_audio`
Audio input chat¶
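Using the default audio asset: `python end2end.py`
Using a custom audio file: `python end2end.py --audio-path /path/to/audio.wav`
Using a local model: `python end2end.py --model-name /path/to/Covo-Audio-Chat`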
Command-line Arguments¶
| Argument | Short | Default | Description |
|---|---|---|---|
| `--model-name` | `-m` | `tencent/Covo-Audio-Chat` | Model path or HuggingFace model ID |
| `--text` | `-t` | `请回答这段音频里的问题。` | Text prompt / question for the audio |
| `--audio-path` | `-a` | default audio asset | Path to local audio file |
| `--sampling-rate` | | `16000` | Sampling rate for audio loading (Hz) |
| `--output-dir` | | `./output_audio` | Output directory for generated files |
| `--num-prompts` | | `1` | Number of prompts to generate |
| `--stage-configs-path` | | (auto) | Path to stage configs YAML file |
| `--log-stats` | | `false` | Enable detailed statistics logging |
| `--stage-init-timeout` | | `300` | Stage initialization timeout (seconds) |
| `--batch-timeout` | | `5` | Batching timeout (seconds) |
| `--init-timeout` | | `300` | Overall initialization timeout (seconds) |
| `--shm-threshold-bytes` | | `65536` | Shared memory threshold (bytes) |
Pipeline¶
Covo-Audio-Chat uses a 2-stage pipeline:
- Stage 0 (fused_thinker_talker): The 7B LLM generates interleaved text and audio tokens in a single autoregressive pass.
- Stage 1 (code2wav): A BigVGAN-based vocoder converts the extracted audio codes into a 24kHz WAV waveform.
Output¶
The script generates two files per request in the output directory:
- `{request_id}.txt` -- prompt and generated text
- `{request_id}.wav` -- generated audio (24kHz WAV)
FAQ¶
If you encounter `ModuleNotFoundError: No module named 'librosa'`, install it with: `pip install librosa`
Environment¶
- GPU: 1x A100 (80 GiB)
- Stage 0 (7B LLM): ~16 GiB VRAM
- Stage 1 (BigVGAN vocoder): ~2 GiB VRAM
Example materials¶
end2end.py
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use vLLM-Omni for running offline inference
with the correct prompt format on Covo-Audio-Chat.
Usage:
python end2end.py --audio-path /path/to/audio.wav
"""
import os
import soundfile as sf
from vllm.assets.audio import AudioAsset
from vllm.multimodal.media.audio import load_audio
from vllm.sampling_params import SamplingParams
from vllm.utils.argparse_utils import FlexibleArgumentParser
from vllm_omni.entrypoints.omni import Omni
from vllm_omni.model_executor.models.covo_audio.prompt_utils import (
COVO_AUDIO_INPUT_PREFIX,
build_covo_audio_chat_prompt,
)
# Fixed sampling seed, passed as `seed` to the SamplingParams of both stages in main().
SEED = 42
def get_audio_query(
    question: str | None = None,
    audio_path: str | None = None,
    sampling_rate: int = 16000,
) -> dict:
    """Assemble a single audio-chat request for Covo-Audio-Chat.

    Args:
        question: Text question accompanying the audio. When ``None`` a
            built-in default question is used.
        audio_path: Path of an audio file to load. When ``None`` the
            ``mary_had_lamb`` audio asset is used instead.
        sampling_rate: Passed as ``sr`` to ``load_audio`` when ``audio_path``
            is given; it is not used for the default asset.

    Returns:
        A dict with the keys ``"prompt"``, ``"multi_modal_data"`` (holding the
        ``(samples, sample_rate)`` pair under ``"audio"``) and ``"modalities"``.
    """
    text = "请回答这段音频里的问题。" if question is None else question
    chat_prompt = build_covo_audio_chat_prompt(COVO_AUDIO_INPUT_PREFIX + text)

    if audio_path is not None:
        import numpy as np

        waveform, loaded_rate = load_audio(audio_path, sr=sampling_rate)
        # The samples are handed over as float32.
        audio = (waveform.astype(np.float32), loaded_rate)
    else:
        audio = AudioAsset("mary_had_lamb").audio_and_sample_rate

    request = {
        "prompt": chat_prompt,
        "multi_modal_data": {"audio": audio},
        "modalities": ["audio"],
    }
    return request
def main(args):
    """Run Covo-Audio-Chat offline inference and save the per-request outputs.

    Builds one audio query from the CLI arguments, repeats it
    ``args.num_prompts`` times, runs the batch through ``Omni`` with one
    ``SamplingParams`` per stage, and writes the results into
    ``args.output_dir``: ``{request_id}.txt`` for text outputs and
    ``{request_id}.wav`` for audio outputs.

    The ``Omni`` instance is always closed, even when generation or output
    writing raises.

    Args:
        args: Parsed command-line namespace (see ``parse_args``).
    """
    query_result = get_audio_query(
        question=args.text,
        audio_path=args.audio_path,
        sampling_rate=args.sampling_rate,
    )

    omni = Omni(
        model=args.model_name,
        stage_configs_path=args.stage_configs_path,
        log_stats=args.log_stats,
        stage_init_timeout=args.stage_init_timeout,
        batch_timeout=args.batch_timeout,
        init_timeout=args.init_timeout,
        shm_threshold_bytes=args.shm_threshold_bytes,
    )

    # Everything after construction runs under try/finally so the engine is
    # released even if generation or saving fails.
    try:
        # Stage 0: fused_thinker_talker
        # stop_token_ids=[151645] (<|im_end|>) and ignore_eos=True are required
        # so the model generates interleaved text+audio tokens before stopping.
        thinker_sampling_params = SamplingParams(
            temperature=0.0,
            top_p=1.0,
            top_k=-1,
            max_tokens=2048,
            seed=SEED,
            detokenize=True,
            repetition_penalty=1.05,
            stop_token_ids=[151645],
            ignore_eos=True,
        )

        # Stage 1: code2wav (audio codes, not real token IDs — skip detokenize)
        code2wav_sampling_params = SamplingParams(
            temperature=0.0,
            top_p=1.0,
            top_k=-1,
            max_tokens=2048,
            seed=SEED,
            detokenize=False,
            repetition_penalty=1.1,
        )

        # One SamplingParams entry per pipeline stage, in stage order.
        sampling_params_list = [
            thinker_sampling_params,
            code2wav_sampling_params,
        ]

        # The same request dict is submitted num_prompts times.
        prompts = [query_result for _ in range(args.num_prompts)]
        omni_outputs = omni.generate(prompts, sampling_params_list)

        output_dir = args.output_dir
        os.makedirs(output_dir, exist_ok=True)

        # Sample rate the generated waveform is written with (24 kHz WAV).
        output_sample_rate = 24000

        for stage_outputs in omni_outputs:
            output = stage_outputs.request_output
            if stage_outputs.final_output_type == "text":
                request_id = output.request_id
                text_output = output.outputs[0].text
                prompt_text = output.prompt
                out_txt = os.path.join(output_dir, f"{request_id}.txt")
                lines = [
                    "Prompt:\n",
                    str(prompt_text) + "\n",
                    "vllm_text_output:\n",
                    str(text_output).strip() + "\n",
                ]
                # Writing the text file is best-effort: warn and continue on
                # failure, and only report success when the write succeeded.
                try:
                    with open(out_txt, "w", encoding="utf-8") as f:
                        f.writelines(lines)
                except (OSError, UnicodeError) as e:
                    print(f"[Warn] Failed writing text file {out_txt}: {e}")
                else:
                    print(f"Request ID: {request_id}, Text saved to {out_txt}")
            elif stage_outputs.final_output_type == "audio":
                request_id = output.request_id
                audio_tensor = output.outputs[0].multimodal_output.get("audio")
                if audio_tensor is None:
                    # No audio was produced for this request; nothing to save.
                    continue
                output_wav = os.path.join(output_dir, f"{request_id}.wav")
                audio_numpy = audio_tensor.float().detach().cpu().numpy()
                # Collapse any extra dimensions into a single 1-D sample array.
                if audio_numpy.ndim > 1:
                    audio_numpy = audio_numpy.flatten()
                sf.write(output_wav, audio_numpy, samplerate=output_sample_rate, format="WAV")
                print(f"Request ID: {request_id}, Audio saved to {output_wav}")
    finally:
        omni.close()
def parse_args():
    """Parse the command-line options of the Covo-Audio-Chat offline demo.

    Returns:
        The namespace returned by ``FlexibleArgumentParser.parse_args()``.
    """
    cli = FlexibleArgumentParser(description="Offline inference demo for Covo-Audio-Chat")

    # Each entry is (flags, add_argument keyword options); options are
    # registered in exactly this order.
    option_specs = (
        (
            ("--model-name", "-m"),
            {
                "type": str,
                "default": "tencent/Covo-Audio-Chat",
                "help": "Model path or HuggingFace model ID.",
            },
        ),
        (
            ("--text", "-t"),
            {
                "type": str,
                "default": None,
                "help": "Text prompt / question for the audio.",
            },
        ),
        (
            ("--audio-path", "-a"),
            {
                "type": str,
                "default": None,
                "help": "Path to local audio file. Uses default asset if not provided.",
            },
        ),
        (
            ("--sampling-rate",),
            {
                "type": int,
                "default": 16000,
                "help": "Sampling rate for audio loading (default: 16000).",
            },
        ),
        (
            ("--stage-configs-path",),
            {
                "type": str,
                "default": None,
                "help": "Path to stage configs YAML file.",
            },
        ),
        (
            ("--log-stats",),
            {
                "action": "store_true",
                "default": False,
                "help": "Enable writing detailed statistics.",
            },
        ),
        (
            ("--stage-init-timeout",),
            {
                "type": int,
                "default": 300,
                "help": "Timeout for initializing a single stage in seconds.",
            },
        ),
        (
            ("--batch-timeout",),
            {
                "type": int,
                "default": 5,
                "help": "Timeout for batching in seconds.",
            },
        ),
        (
            ("--init-timeout",),
            {
                "type": int,
                "default": 300,
                "help": "Timeout for initializing stages in seconds.",
            },
        ),
        (
            ("--shm-threshold-bytes",),
            {
                "type": int,
                "default": 65536,
                "help": "Threshold for using shared memory in bytes.",
            },
        ),
        (
            ("--output-dir",),
            {
                "default": "./output_audio",
                "help": "Output directory for generated files.",
            },
        ),
        (
            ("--num-prompts",),
            {
                "type": int,
                "default": 1,
                "help": "Number of prompts to generate.",
            },
        ),
    )

    for flags, options in option_specs:
        cli.add_argument(*flags, **options)

    return cli.parse_args()
if __name__ == "__main__":
    # Script entry point: parse the CLI options and run the demo with them.
    main(parse_args())