Streaming Video Generation¶
This example uses the custom WebSocket endpoint WS /v1/realtime/video to receive a video byte stream as chunks are produced. It covers text-only video generation. Image/reference input is intentionally not included for now.
Start The Server¶
Start a diffusion video model with streaming output enabled:
The --diffusion-streaming-output CLI flag is forwarded as streaming_output=True in the default diffusion stage engine_args, then loaded by OmniDiffusionConfig.from_kwargs().
WebSocket Protocol¶
| Direction | Message | Format | Description |
|---|---|---|---|
| Client to server | session.start | JSON text: {"type":"session.start","model":"...","prompt":"...","format":"m4s"} | Starts generation. format is optional and accepts m4s (default). Sampling fields such as width, height, fps, num_frames, and extra_params may be included. |
| Server to client | video.start | JSON text: {"type":"video.start","request_id":"...","format":"m4s","config":{...}} | Confirms the session and mirrors the accepted format. |
| Server to client | Video chunk | Binary WebSocket frame | Fragmented MP4 (m4s) video bytes. |
| Client to server | session.stop | JSON text: {"type":"session.stop"} | Requests cancellation of the active session. |
| Client to server | session.ping | JSON text: {"type":"session.ping"} | Optional keepalive; refreshes the server stall clock. |
| Server to client | session.done | JSON text: {"type":"session.done","request_id":"...","chunks":3,"stopped":false} | Ends a completed or stopped session. |
| Server to client | session.pong | JSON text: {"type":"session.pong"} | Reply to session.ping. |
| Server to client | error | JSON text: {"type":"error","message":"..."} | Reports invalid input, unsupported formats, generation failures, control-message errors, or stall timeout. |
During generation the client normally sends only session.start and then receives binary chunks; silence on the client socket is expected. The server closes the session with a stall error only when there is no engine progress and no session.ping for about 60 seconds.
Install Client Dependency¶
Run The Client¶
python streaming_video_client.py \
--host 127.0.0.1 \
--port 8000 \
--model BestWishYsh/Helios-Distilled \
--prompt "A serene lakeside sunrise with mist over the water." \
--width 640 \
--height 384 \
--fps 16 \
--num-frames 99 \
--guidance-scale 1.0 \
--seed 42 \
--output helios_stream.mp4
The client sends one session.start message, prints each received binary video chunk with its byte size and elapsed time, and saves the received bytes to --output after session.done. The client remuxes the collected stream into a regular progressive MP4 file so that local players can determine the video duration.
Run The Gradio Demo¶
The Gradio demo requests fMP4 (m4s) chunks and appends them directly in the browser with a Media Source Extensions player.
Model Choice¶
Helios¶
The example uses the BestWishYsh/Helios-Distilled model by default.
To keep generation fast enough for streaming, set pyramid_num_inference_steps_list as low as [1, 1, 1]. Both example clients use the following Helios-Distilled preset by default:
{
"is_enable_stage2": true,
"pyramid_num_stages": 3,
"pyramid_num_inference_steps_list": [1, 1, 1],
"is_amplify_first_chunk": true
}
Disable it in the CLI example with --no-helios-distilled-preset, or override/extend it with --extra-params:
Example materials¶
gradio_demo.py
Large file omitted from the rendered docs. View it on GitHub: https://github.com/vllm-project/vllm-omni/blob/main/examples/online_serving/streaming_video_generation/gradio_demo.py.
streaming_video_client.py
Large file omitted from the rendered docs. View it on GitHub: https://github.com/vllm-project/vllm-omni/blob/main/examples/online_serving/streaming_video_generation/streaming_video_client.py.
video-stream-view.html
<!-- This file and the corresponding JS file (video-stream-view.js) are needed to display a streamable video view
in the Gradio demo. Gradio's built-in video player does not support the modern M4S streaming format.
The element ids used below (player, status, log) are looked up by the script, so keep both files in sync. -->
<div id="vllm-streaming-video-view" style="display:flex; flex-direction:column; gap:10px;">
<video id="vllm-streaming-video-player" autoplay muted controls playsinline
style="width:100%; min-height:320px; background:#111; border-radius:8px;"></video>
<div style="display:flex; align-items:center; gap:10px;">
<span id="vllm-streaming-video-status" style="font-weight:600;">Ready</span>
</div>
<pre id="vllm-streaming-video-log" style="height:220px; overflow:auto; margin:0; padding:10px; border-radius:8px;
background:#0f172a; color:#dbeafe; font-size:12px; white-space:pre-wrap;">Ready.</pre>
</div>
video-stream-view.js
/**
* This file and the corresponding HTML file are needed to display a streamable video view in the Gradio demo.
* Gradio's built-in video player does not support the modern M4S streaming format.
*/
(function () {
// Ids of the DOM nodes this script looks up (video player, status label, log pane).
const playerId = "vllm-streaming-video-player";
const statusId = "vllm-streaming-video-status";
const logId = "vllm-streaming-video-log";
// WebSocket of the current session; null when no session is active.
let ws = null;
// MediaSource attached to the player and the SourceBuffer chunks are appended to.
let mediaSource = null;
let sourceBuffer = null;
// Received binary chunks waiting to be appended to the SourceBuffer, oldest first.
let queue = [];
// Per-session counters reported in the chunk log lines.
let chunkCount = 0;
let totalBytes = 0;
// Set to true once the session has ended (completed, failed or closed); the stream
// is finalized as soon as the pending appends have drained.
let done = false;
// Object URL created for the MediaSource; revoked when the session is torn down.
let objectUrl = null;
/**
 * Look up a DOM element by its id.
 * @param {string} id - Value of the element's id attribute.
 * @returns {HTMLElement|null} The matching element, or null when none exists.
 */
function el(id) {
  const found = document.getElementById(id);
  return found;
}
/**
 * Show a short status message in the status label.
 * Does nothing when the status element is not present in the page.
 * @param {string} text - Message to display.
 */
function setStatus(text) {
  const statusNode = el(statusId);
  if (!statusNode) {
    return;
  }
  statusNode.textContent = text;
}
/**
 * Append one line to the on-page log pane and scroll it to the bottom.
 * Does nothing when the log element is not present in the page.
 * @param {string} line - Text to append on a new line.
 */
function log(line) {
  const pane = el(logId);
  if (pane) {
    pane.textContent = pane.textContent + "\n" + line;
    // Keep the newest entry visible.
    pane.scrollTop = pane.scrollHeight;
  }
}
/**
 * Enable or disable the start button and change its caption.
 * Prefers a <button> nested inside #streaming-video-start and falls back to
 * the element with that id itself; does nothing when neither exists.
 * @param {boolean} disabled - Whether the button should be disabled.
 * @param {string} label - Caption to show on the button.
 */
function setStartButton(disabled, label) {
  const nested = document.querySelector("#streaming-video-start button");
  const target = nested || document.getElementById("streaming-video-start");
  if (target) {
    target.disabled = disabled;
    target.textContent = label;
  }
}
/**
 * Pick the first MP4 MIME type that Media Source Extensions can play.
 * Candidates are tried from the most specific H.264 profile strings down to
 * a bare "video/mp4".
 * @returns {string|null} A supported MIME type, or null when MSE is missing
 *   or none of the candidates is supported.
 */
function chooseMime() {
  const supported = [
    'video/mp4; codecs="avc1.42E01E"',
    'video/mp4; codecs="avc1.4D401F"',
    'video/mp4; codecs="avc1.64001F"',
    "video/mp4"
  ].find((mime) => window.MediaSource && MediaSource.isTypeSupported(mime));
  return supported === undefined ? null : supported;
}
/**
 * Append the next queued chunk to the SourceBuffer, if it can accept one.
 *
 * Does nothing while there is no SourceBuffer, while an append is still in
 * progress, or when the queue is empty. If the append throws, the session is
 * marked done, the failure is reported, and the stream is finalized.
 */
function pump() {
  if (!sourceBuffer || sourceBuffer.updating || queue.length === 0) return;
  try {
    sourceBuffer.appendBuffer(queue.shift());
  } catch (err) {
    done = true;
    // Drop the chunks that were still waiting: no further "updateend" will
    // drain them, and finishStream() refuses to end the stream while the
    // queue is non-empty, which would leave the MediaSource open forever.
    queue = [];
    setStatus("SourceBuffer error");
    setStartButton(false, "Restart");
    log("SourceBuffer append error: " + err.message);
    finishStream();
  }
}
/**
 * Signal end-of-stream on the MediaSource once everything has been appended.
 *
 * The call is skipped unless the MediaSource is open, the SourceBuffer is not
 * in the middle of an append, and no chunks are left in the queue. Errors
 * raised by endOfStream() are deliberately ignored (best effort).
 */
function finishStream() {
  const readyToEnd =
    mediaSource &&
    mediaSource.readyState === "open" &&
    !(sourceBuffer && sourceBuffer.updating) &&
    queue.length === 0;
  if (!readyToEnd) {
    return;
  }
  try {
    mediaSource.endOfStream();
  } catch (_) {
    // Best effort: the stream may already have been ended or detached.
  }
}
/**
 * Tear down the current session, if any, and reset the shared stream state.
 *
 * The socket's handlers are detached before it is closed so that its late
 * "close"/"error"/"message" events cannot modify the state of a session
 * started afterwards. An open socket is sent a session.stop message first;
 * a socket that is still connecting is closed as well so it does not leak.
 */
function stopCurrent() {
  const socket = ws;
  if (socket) {
    socket.onopen = null;
    socket.onmessage = null;
    socket.onerror = null;
    socket.onclose = null;
    const state = socket.readyState;
    if (state === WebSocket.OPEN) {
      // Best effort: the peer may already be gone.
      try { socket.send(JSON.stringify({ type: "session.stop" })); } catch (_) { }
    }
    if (state === WebSocket.OPEN || state === WebSocket.CONNECTING) {
      try { socket.close(); } catch (_) { }
    }
  }
  ws = null;
  sourceBuffer = null;
  mediaSource = null;
  queue = [];
  if (objectUrl) URL.revokeObjectURL(objectUrl);
  objectUrl = null;
}
/**
 * Start a streaming session and play the received chunks in the video element.
 *
 * Stops any running session, attaches a fresh MediaSource to the player and,
 * once it opens, connects the WebSocket, sends the start payload and feeds
 * every binary frame to the SourceBuffer. Text frames are treated as JSON
 * control messages (video.start, session.done, error).
 *
 * Every callback is bound to the socket / MediaSource it was created for and
 * ignores events once a newer session has replaced them, so a late event from
 * an abandoned session cannot change the state of the current one.
 *
 * @param {string} configJson - JSON text of an object with `url` (WebSocket
 *   endpoint) and `payload` (object sent as the first message).
 * @returns {string} The unchanged `configJson`.
 */
window.vllmStreamingVideoStart = function (configJson) {
  stopCurrent();
  chunkCount = 0;
  totalBytes = 0;
  done = false;
  const logNode = el(logId);
  if (logNode) logNode.textContent = "Starting...";
  setStatus("Starting...");

  // Common UI update for every terminal state: status, re-enabled button, log line.
  const report = (status, line) => {
    setStatus(status);
    setStartButton(false, "Restart");
    log(line);
  };

  let config;
  try {
    config = JSON.parse(configJson);
  } catch (err) {
    report("Invalid request", "Invalid request config: " + err.message);
    return configJson;
  }
  const mime = chooseMime();
  if (!mime) {
    report("MSE unsupported", "This browser does not support MP4 Media Source Extensions playback.");
    return configJson;
  }
  const video = el(playerId);
  if (!video) return configJson;

  const source = new MediaSource();
  mediaSource = source;
  objectUrl = URL.createObjectURL(source);
  video.src = objectUrl;

  source.addEventListener("sourceopen", () => {
    // A newer session replaced this one before the source was attached.
    if (mediaSource !== source) return;
    try {
      sourceBuffer = source.addSourceBuffer(mime);
      sourceBuffer.mode = "segments";
      sourceBuffer.addEventListener("updateend", () => {
        if (mediaSource !== source) return;
        pump();
        if (done) finishStream();
      });
    } catch (err) {
      report("SourceBuffer error", "SourceBuffer error: " + err.message);
      return;
    }

    let socket;
    try {
      // Throws on a malformed URL (or a config without one); report it
      // instead of leaving the UI stuck on "Starting...".
      socket = new WebSocket(config.url);
    } catch (err) {
      done = true;
      report("Error", "ERROR: " + err.message);
      finishStream();
      return;
    }
    ws = socket;
    socket.binaryType = "arraybuffer";
    const isCurrent = () => ws === socket;

    socket.onopen = () => {
      if (!isCurrent()) return;
      setStatus("Streaming...");
      setStartButton(true, "Streaming...");
      log("Connected: " + config.url);
      socket.send(JSON.stringify(config.payload));
      log("Sent session.start: " + JSON.stringify(config.payload));
    };

    socket.onmessage = (event) => {
      if (!isCurrent()) return;
      if (typeof event.data === "string") {
        let msg;
        try {
          msg = JSON.parse(event.data);
        } catch (err) {
          // A malformed control frame must not abort the handler.
          log("Ignoring malformed control message: " + err.message);
          return;
        }
        const type = msg ? msg.type : undefined;
        if (type === "video.start") {
          log("Video session started: request_id=" + (msg.request_id || "") + " format=" + (msg.format || ""));
        } else if (type === "session.done") {
          done = true;
          report("Done", "Session complete: " + JSON.stringify(msg));
          finishStream();
        } else if (type === "error") {
          done = true;
          report("Error", "ERROR: " + (msg.message || JSON.stringify(msg)));
          finishStream();
        } else {
          log("Control message: " + JSON.stringify(msg));
        }
        return;
      }
      const data = event.data;
      queue.push(data);
      chunkCount += 1;
      totalBytes += data.byteLength;
      log(`[chunk ${String(chunkCount).padStart(3, "0")}] bytes=${data.byteLength} total_bytes=${totalBytes}`);
      pump();
    };

    socket.onerror = () => {
      if (!isCurrent()) return;
      done = true;
      report("Error", "ERROR: WebSocket error");
      finishStream();
    };

    socket.onclose = () => {
      if (!isCurrent()) return;
      // Only report a close that happens before the session ended normally.
      if (!done) {
        done = true;
        report("Closed", "WebSocket closed before session.done");
        finishStream();
      }
    };
  }, { once: true });
  return configJson;
};
// Marker printed to the browser console once this script has been evaluated.
console.log("video-stream-view.html loaded");
})();