ltx-2-TURBO

Running on Zero

App Files Files

smartdigitalnetworks committed on Apr 24

Commit

6000543

verified ·

1 Parent(s): 7ad0131

Upload 4 files

Browse files

Files changed (4) hide show

appdistilled.py +318 -0
appfirstlastframe.py +542 -0
appoutpaint.py +1246 -0
appsync.py +1317 -0

appdistilled.py ADDED Viewed

	@@ -0,0 +1,318 @@

+import os
+import subprocess
+import sys
+# Disable torch.compile / dynamo before any torch import
+os.environ["TORCH_COMPILE_DISABLE"] = "1"
+os.environ["TORCHDYNAMO_DISABLE"] = "1"
+# Install xformers for memory-efficient attention
+subprocess.run([sys.executable, "-m", "pip", "install", "xformers==0.0.32.post2", "--no-build-isolation"], check=False)
+# Clone LTX-2 repo and install packages
+LTX_REPO_URL = "https://github.com/Lightricks/LTX-2.git"
+LTX_REPO_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "LTX-2")
+LTX_COMMIT_SHA = "ae855f8538843825f9015a419cf4ba5edaf5eec2"
+if not os.path.exists(LTX_REPO_DIR):
+    print(f"Cloning {LTX_REPO_URL}...")
+    os.makedirs(LTX_REPO_DIR)
+    subprocess.run(["git", "init", LTX_REPO_DIR], check=True)
+    subprocess.run(["git", "remote", "add", "origin", LTX_REPO_URL], cwd=LTX_REPO_DIR, check=True)
+    subprocess.run(["git", "fetch", "--depth", "1", "origin", LTX_COMMIT_SHA], cwd=LTX_REPO_DIR, check=True)
+    subprocess.run(["git", "checkout", LTX_COMMIT_SHA], cwd=LTX_REPO_DIR, check=True)
+print("Installing ltx-core and ltx-pipelines from cloned repo...")
+subprocess.run(
+    [sys.executable, "-m", "pip", "install", "--force-reinstall", "--no-deps", "-e",
+     os.path.join(LTX_REPO_DIR, "packages", "ltx-core"),
+     "-e", os.path.join(LTX_REPO_DIR, "packages", "ltx-pipelines")],
+    check=True,
+)
+sys.path.insert(0, os.path.join(LTX_REPO_DIR, "packages", "ltx-pipelines", "src"))
+sys.path.insert(0, os.path.join(LTX_REPO_DIR, "packages", "ltx-core", "src"))
+import logging
+import random
+import tempfile
+from pathlib import Path
+import torch
+torch._dynamo.config.suppress_errors = True
+torch._dynamo.config.disable = True
+import spaces
+import gradio as gr
+import numpy as np
+from huggingface_hub import hf_hub_download, snapshot_download
+from ltx_core.model.video_vae import TilingConfig, get_video_chunks_number
+from ltx_core.quantization import QuantizationPolicy
+from ltx_pipelines.distilled import DistilledPipeline
+from ltx_pipelines.utils.args import ImageConditioningInput
+from ltx_pipelines.utils.media_io import encode_video
+# Force-patch xformers attention into the LTX attention module.
+# The module attribute `memory_efficient_attention` is overwritten with the
+# xformers implementation; its value is printed before and after so the logs
+# show whether the swap took effect.
+from ltx_core.model.transformer import attention as _attn_mod
+print(f"[ATTN] Before patch: memory_efficient_attention={_attn_mod.memory_efficient_attention}")
+try:
+    from xformers.ops import memory_efficient_attention as _mea
+    _attn_mod.memory_efficient_attention = _mea
+    print(f"[ATTN] After patch: memory_efficient_attention={_attn_mod.memory_efficient_attention}")
+except Exception as e:
+    # Best-effort: any failure (for example xformers not importable) is logged
+    # and startup continues with the attribute left as it was.
+    print(f"[ATTN] xformers patch FAILED: {type(e).__name__}: {e}")
+# Raise the root logger to INFO.
+logging.getLogger().setLevel(logging.INFO)
+# Largest signed 32-bit integer; upper bound for random and user-chosen seeds.
+MAX_SEED = np.iinfo(np.int32).max
+# NOTE(review): DEFAULT_PROMPT is not referenced in the visible part of this
+# file (the prompt Textbox carries its own default text) — confirm it is needed.
+DEFAULT_PROMPT = (
+    "An astronaut hatches from a fragile egg on the surface of the Moon, "
+    "the shell cracking and peeling apart in gentle low-gravity motion. "
+    "Fine lunar dust lifts and drifts outward with each movement, floating "
+    "in slow arcs before settling back onto the ground."
+)
+# Frames per second used for both generation and encoding.
+DEFAULT_FRAME_RATE = 24.0
+# Resolution presets: (width, height)
+# Keyed first by quality tier ("high" / "low"), then by aspect-ratio label.
+RESOLUTIONS = {
+    "high": {"16:9": (1536, 1024), "9:16": (1024, 1536), "1:1": (1024, 1024)},
+    "low": {"16:9": (768, 512), "9:16": (512, 768), "1:1": (768, 768)},
+}
+# Model repos
+LTX_MODEL_REPO = "Lightricks/LTX-2.3"
+GEMMA_REPO = "google/gemma-3-12b-it-qat-q4_0-unquantized"
+# Download model checkpoints
+# These run at import time; each call returns a local path to the downloaded
+# file (or, for snapshot_download, the downloaded repository folder).
+print("=" * 80)
+print("Downloading LTX-2.3 distilled model + Gemma...")
+print("=" * 80)
+checkpoint_path = hf_hub_download(repo_id=LTX_MODEL_REPO, filename="ltx-2.3-22b-distilled-1.1.safetensors")
+spatial_upsampler_path = hf_hub_download(repo_id=LTX_MODEL_REPO, filename="ltx-2.3-spatial-upscaler-x2-1.1.safetensors")
+gemma_root = snapshot_download(repo_id=GEMMA_REPO)
+print(f"Checkpoint: {checkpoint_path}")
+print(f"Spatial upsampler: {spatial_upsampler_path}")
+print(f"Gemma root: {gemma_root}")
+# Initialize pipeline WITH text encoder
+# No LoRAs are applied; quantization uses the fp8_cast policy.
+pipeline = DistilledPipeline(
+    distilled_checkpoint_path=checkpoint_path,
+    spatial_upsampler_path=spatial_upsampler_path,
+    gemma_root=gemma_root,
+    loras=[],
+    quantization=QuantizationPolicy.fp8_cast(),
+)
+# Preload all models for ZeroGPU tensor packing.
+# Each ledger accessor is called once here and the result kept in a
+# module-level variable.
+print("Preloading all models (including Gemma)...")
+ledger = pipeline.model_ledger
+_transformer = ledger.transformer()
+_video_encoder = ledger.video_encoder()
+_video_decoder = ledger.video_decoder()
+_audio_decoder = ledger.audio_decoder()
+_vocoder = ledger.vocoder()
+_spatial_upsampler = ledger.spatial_upsampler()
+_text_encoder = ledger.text_encoder()
+_embeddings_processor = ledger.gemma_embeddings_processor()
+# Replace every accessor with a lambda that returns the instance built above,
+# so later ledger calls hand back the same preloaded object.
+ledger.transformer = lambda: _transformer
+ledger.video_encoder = lambda: _video_encoder
+ledger.video_decoder = lambda: _video_decoder
+ledger.audio_decoder = lambda: _audio_decoder
+ledger.vocoder = lambda: _vocoder
+ledger.spatial_upsampler = lambda: _spatial_upsampler
+ledger.text_encoder = lambda: _text_encoder
+ledger.gemma_embeddings_processor = lambda: _embeddings_processor
+print("All models preloaded (including Gemma text encoder)!")
+print("=" * 80)
+print("Pipeline ready!")
+print("=" * 80)
+def log_memory(tag: str):
+    if torch.cuda.is_available():
+        allocated = torch.cuda.memory_allocated() / 1024**3
+        peak = torch.cuda.max_memory_allocated() / 1024**3
+        free, total = torch.cuda.mem_get_info()
+        print(f"[VRAM {tag}] allocated={allocated:.2f}GB peak={peak:.2f}GB free={free / 1024**3:.2f}GB total={total / 1024**3:.2f}GB")
+def detect_aspect_ratio(image) -> str:
+    """Detect the closest aspect ratio (16:9, 9:16, or 1:1) from an image."""
+    if image is None:
+        return "16:9"
+    if hasattr(image, "size"):
+        w, h = image.size
+    elif hasattr(image, "shape"):
+        h, w = image.shape[:2]
+    else:
+        return "16:9"
+    ratio = w / h
+    candidates = {"16:9": 16 / 9, "9:16": 9 / 16, "1:1": 1.0}
+    return min(candidates, key=lambda k: abs(ratio - candidates[k]))
+def on_image_upload(image, high_res):
+    """Auto-set resolution when image is uploaded."""
+    aspect = detect_aspect_ratio(image)
+    tier = "high" if high_res else "low"
+    w, h = RESOLUTIONS[tier][aspect]
+    return gr.update(value=w), gr.update(value=h)
+def on_highres_toggle(image, high_res):
+    """Update resolution when high-res toggle changes."""
+    aspect = detect_aspect_ratio(image)
+    tier = "high" if high_res else "low"
+    w, h = RESOLUTIONS[tier][aspect]
+    return gr.update(value=w), gr.update(value=h)
+@spaces.GPU(duration=75)
+@torch.inference_mode()
+def generate_video(
+    input_image,
+    prompt: str,
+    duration: float,
+    enhance_prompt: bool = True,
+    seed: int = 42,
+    randomize_seed: bool = True,
+    height: int = 1024,
+    width: int = 1536,
+    progress=gr.Progress(track_tqdm=True),
+):
+    try:
+        torch.cuda.reset_peak_memory_stats()
+        log_memory("start")
+        current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
+        frame_rate = DEFAULT_FRAME_RATE
+        num_frames = int(duration * frame_rate) + 1
+        num_frames = ((num_frames - 1 + 7) // 8) * 8 + 1
+        print(f"Generating: {height}x{width}, {num_frames} frames ({duration}s), seed={current_seed}")
+        images = []
+        if input_image is not None:
+            output_dir = Path("outputs")
+            output_dir.mkdir(exist_ok=True)
+            temp_image_path = output_dir / f"temp_input_{current_seed}.jpg"
+            if hasattr(input_image, "save"):
+                input_image.save(temp_image_path)
+            else:
+                temp_image_path = Path(input_image)
+            images = [ImageConditioningInput(path=str(temp_image_path), frame_idx=0, strength=1.0)]
+        tiling_config = TilingConfig.default()
+        video_chunks_number = get_video_chunks_number(num_frames, tiling_config)
+        log_memory("before pipeline call")
+        video, audio = pipeline(
+            prompt=prompt,
+            seed=current_seed,
+            height=int(height),
+            width=int(width),
+            num_frames=num_frames,
+            frame_rate=frame_rate,
+            images=images,
+            tiling_config=tiling_config,
+            enhance_prompt=enhance_prompt,
+        )
+        log_memory("after pipeline call")
+        output_path = tempfile.mktemp(suffix=".mp4")
+        encode_video(
+            video=video,
+            fps=frame_rate,
+            audio=audio,
+            output_path=output_path,
+            video_chunks_number=video_chunks_number,
+        )
+        log_memory("after encode_video")
+        return str(output_path), current_seed
+    except Exception as e:
+        import traceback
+        log_memory("on error")
+        print(f"Error: {str(e)}\n{traceback.format_exc()}")
+        return None, current_seed
+# Gradio UI: inputs in the left column, generated video in the right column.
+with gr.Blocks(title="LTX-2.3 Distilled") as demo:
+    gr.Markdown("# LTX-2.3 Distilled (22B): Fast Audio-Video Generation")
+    gr.Markdown(
+        "Fast and high quality video + audio generation "
+        "[[model]](https://huggingface.co/Lightricks/LTX-2.3) "
+        "[[code]](https://github.com/Lightricks/LTX-2)"
+    )
+    with gr.Row():
+        with gr.Column():
+            input_image = gr.Image(label="Input Image (Optional)", type="pil")
+            prompt = gr.Textbox(
+                label="Prompt",
+                info="for best results - make it as elaborate as possible",
+                value="Make this image come alive with cinematic motion, smooth animation",
+                lines=3,
+                placeholder="Describe the motion and animation you want...",
+            )
+            with gr.Row():
+                duration = gr.Slider(label="Duration (seconds)", minimum=1.0, maximum=10.0, value=3.0, step=0.1)
+                with gr.Column():
+                    enhance_prompt = gr.Checkbox(label="Enhance Prompt", value=False)
+                    high_res = gr.Checkbox(label="High Resolution", value=True)
+            generate_btn = gr.Button("Generate Video", variant="primary", size="lg")
+            with gr.Accordion("Advanced Settings", open=False):
+                seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, value=10, step=1)
+                randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
+                with gr.Row():
+                    # Defaults match the "high" tier 16:9 preset in RESOLUTIONS.
+                    width = gr.Number(label="Width", value=1536, precision=0)
+                    height = gr.Number(label="Height", value=1024, precision=0)
+        with gr.Column():
+            output_video = gr.Video(label="Generated Video", autoplay=True)
+    # Auto-detect aspect ratio from uploaded image and set resolution
+    input_image.change(
+        fn=on_image_upload,
+        inputs=[input_image, high_res],
+        outputs=[width, height],
+    )
+    # Update resolution when high-res toggle changes
+    high_res.change(
+        fn=on_highres_toggle,
+        inputs=[input_image, high_res],
+        outputs=[width, height],
+    )
+    # Input order follows generate_video's positional parameters (height
+    # before width). The returned seed is written back to the seed slider.
+    generate_btn.click(
+        fn=generate_video,
+        inputs=[
+            input_image, prompt, duration, enhance_prompt,
+            seed, randomize_seed, height, width,
+        ],
+        outputs=[output_video, seed],
+    )
+# Custom CSS handed to launch() below.
+css = """
+.fillable{max-width: 1200px !important}
+.progress-text {color: white}
+"""
+if __name__ == "__main__":
+    # NOTE(review): theme and css are passed to launch() rather than to
+    # gr.Blocks(); confirm the installed Gradio version accepts them here.
+    demo.launch(theme=gr.themes.Citrus(), css=css)

appfirstlastframe.py ADDED Viewed

	@@ -0,0 +1,542 @@

+import os
+import subprocess
+import sys
+# Disable torch.compile / dynamo before any torch import
+os.environ["TORCH_COMPILE_DISABLE"] = "1"
+os.environ["TORCHDYNAMO_DISABLE"] = "1"
+# Install xformers for memory-efficient attention
+# Best-effort (check=False): a failed install does not abort startup.
+subprocess.run([sys.executable, "-m", "pip", "install", "xformers==0.0.32.post2", "--no-build-isolation"], check=False)
+# Clone LTX-2 repo at a pinned compatible commit and install packages
+LTX_REPO_URL = "https://github.com/Lightricks/LTX-2.git"
+LTX_REPO_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "LTX-2")
+LTX_COMMIT = "ae855f8538843825f9015a419cf4ba5edaf5eec2"
+# Any existing checkout is deleted and cloned again on every start.
+# NOTE(review): this shells out to `rm -rf` and does a full (non-shallow)
+# clone each time — consider shutil.rmtree and a depth-1 fetch of the pin.
+if os.path.exists(LTX_REPO_DIR):
+    print(f"Removing existing repo at {LTX_REPO_DIR}...")
+    subprocess.run(["rm", "-rf", LTX_REPO_DIR], check=True)
+print(f"Cloning {LTX_REPO_URL}...")
+subprocess.run(["git", "clone", LTX_REPO_URL, LTX_REPO_DIR], check=True)
+print(f"Checking out commit {LTX_COMMIT}...")
+subprocess.run(["git", "-C", LTX_REPO_DIR, "checkout", LTX_COMMIT], check=True)
+print("Installing ltx-core and ltx-pipelines from pinned repo commit...")
+# Editable installs of both packages; --no-deps leaves the already installed
+# dependencies untouched.
+subprocess.run(
+    [
+        sys.executable, "-m", "pip", "install",
+        "--force-reinstall", "--no-deps",
+        "-e", os.path.join(LTX_REPO_DIR, "packages", "ltx-core"),
+        "-e", os.path.join(LTX_REPO_DIR, "packages", "ltx-pipelines"),
+    ],
+    check=True,
+)
+# Put both package source trees at the front of sys.path so the imports below
+# resolve to the cloned checkout.
+sys.path.insert(0, os.path.join(LTX_REPO_DIR, "packages", "ltx-pipelines", "src"))
+sys.path.insert(0, os.path.join(LTX_REPO_DIR, "packages", "ltx-core", "src"))
+import logging
+import random
+import tempfile
+from pathlib import Path
+import torch
+torch._dynamo.config.suppress_errors = True
+torch._dynamo.config.disable = True
+import spaces
+import gradio as gr
+import numpy as np
+from huggingface_hub import hf_hub_download, snapshot_download
+from ltx_core.components.diffusion_steps import EulerDiffusionStep
+from ltx_core.components.noisers import GaussianNoiser
+from ltx_core.model.audio_vae import encode_audio as vae_encode_audio
+from ltx_core.model.upsampler import upsample_video
+from ltx_core.model.video_vae import TilingConfig, get_video_chunks_number, decode_video as vae_decode_video
+from ltx_core.quantization import QuantizationPolicy
+from ltx_core.types import Audio, AudioLatentShape, VideoPixelShape
+from ltx_pipelines.distilled import DistilledPipeline
+from ltx_pipelines.utils import euler_denoising_loop
+from ltx_pipelines.utils.args import ImageConditioningInput
+from ltx_pipelines.utils.constants import DISTILLED_SIGMA_VALUES, STAGE_2_DISTILLED_SIGMA_VALUES
+from ltx_pipelines.utils.helpers import (
+    cleanup_memory,
+    combined_image_conditionings,
+    denoise_video_only,
+    encode_prompts,
+    simple_denoising_func,
+)
+from ltx_pipelines.utils.media_io import decode_audio_from_file, encode_video
+# Force-patch xformers attention into the LTX attention module.
+# The module attribute `memory_efficient_attention` is overwritten with the
+# xformers implementation; its value is printed before and after so the logs
+# show whether the swap took effect.
+from ltx_core.model.transformer import attention as _attn_mod
+print(f"[ATTN] Before patch: memory_efficient_attention={_attn_mod.memory_efficient_attention}")
+try:
+    from xformers.ops import memory_efficient_attention as _mea
+    _attn_mod.memory_efficient_attention = _mea
+    print(f"[ATTN] After patch: memory_efficient_attention={_attn_mod.memory_efficient_attention}")
+except Exception as e:
+    # Best-effort: any failure (for example xformers not importable) is logged
+    # and startup continues with the attribute left as it was.
+    print(f"[ATTN] xformers patch FAILED: {type(e).__name__}: {e}")
+# Raise the root logger to INFO.
+logging.getLogger().setLevel(logging.INFO)
+# Largest signed 32-bit integer; upper bound for random and user-chosen seeds.
+MAX_SEED = np.iinfo(np.int32).max
+# NOTE(review): DEFAULT_PROMPT is not referenced in the visible part of this
+# file (the prompt Textbox carries its own default text) — confirm it is needed.
+DEFAULT_PROMPT = (
+    "An astronaut hatches from a fragile egg on the surface of the Moon, "
+    "the shell cracking and peeling apart in gentle low-gravity motion. "
+    "Fine lunar dust lifts and drifts outward with each movement, floating "
+    "in slow arcs before settling back onto the ground."
+)
+# Frames per second used for both generation and encoding.
+DEFAULT_FRAME_RATE = 24.0
+# Resolution presets: (width, height)
+# Keyed first by quality tier ("high" / "low"), then by aspect-ratio label.
+RESOLUTIONS = {
+    "high": {"16:9": (1536, 1024), "9:16": (1024, 1536), "1:1": (1024, 1024)},
+    "low": {"16:9": (768, 512), "9:16": (512, 768), "1:1": (768, 768)},
+}
+class LTX23DistilledA2VPipeline(DistilledPipeline):
+    """DistilledPipeline with optional audio conditioning.
+
+    Without an audio file the call is forwarded to ``DistilledPipeline``
+    unchanged. With one, the audio is encoded to a latent that is passed to
+    both denoising stages, and the input audio is returned with the video.
+    """
+    def __call__(
+        self,
+        prompt: str,
+        seed: int,
+        height: int,
+        width: int,
+        num_frames: int,
+        frame_rate: float,
+        images: list[ImageConditioningInput],
+        audio_path: str | None = None,
+        tiling_config: TilingConfig | None = None,
+        enhance_prompt: bool = False,
+    ):
+        """Run the two-stage distilled pipeline, optionally conditioned on audio.
+
+        Args:
+            prompt: Text prompt to encode.
+            seed: Seed for the torch generator that drives the noiser.
+            height: Final output height; stage 1 runs at ``height // 2``.
+            width: Final output width; stage 1 runs at ``width // 2``.
+            num_frames: Number of video frames to generate.
+            frame_rate: Frames per second; also used to derive the audio duration.
+            images: Image conditionings applied in both stages.
+            audio_path: Audio file to condition on, or ``None`` for the
+                parent pipeline's behaviour.
+            tiling_config: Passed through to video decoding.
+            enhance_prompt: Whether prompt enhancement is requested.
+
+        Returns:
+            With ``audio_path``: ``(decoded_video, Audio)`` where the audio is
+            the decoded input waveform. Otherwise whatever the parent returns.
+
+        Raises:
+            ValueError: If no audio stream could be decoded from ``audio_path``.
+        """
+        # Standard path when no audio input is provided.
+        if audio_path is None:
+            return super().__call__(
+                prompt=prompt,
+                seed=seed,
+                height=height,
+                width=width,
+                num_frames=num_frames,
+                frame_rate=frame_rate,
+                images=images,
+                tiling_config=tiling_config,
+                enhance_prompt=enhance_prompt,
+            )
+        generator = torch.Generator(device=self.device).manual_seed(seed)
+        noiser = GaussianNoiser(generator=generator)
+        stepper = EulerDiffusionStep()
+        dtype = torch.bfloat16
+        # The first conditioning image (if any) is offered to prompt enhancement.
+        (ctx_p,) = encode_prompts(
+            [prompt],
+            self.model_ledger,
+            enhance_first_prompt=enhance_prompt,
+            enhance_prompt_image=images[0].path if len(images) > 0 else None,
+        )
+        video_context, audio_context = ctx_p.video_encoding, ctx_p.audio_encoding
+        video_duration = num_frames / frame_rate
+        # presumably the last two arguments are start offset and duration in
+        # seconds — TODO confirm against decode_audio_from_file.
+        decoded_audio = decode_audio_from_file(audio_path, self.device, 0.0, video_duration)
+        if decoded_audio is None:
+            raise ValueError(f"Could not extract audio stream from {audio_path}")
+        encoded_audio_latent = vae_encode_audio(decoded_audio, self.model_ledger.audio_encoder())
+        # NOTE(review): channels=8 and mel_bins=16 are hard-coded; presumably
+        # they must match the audio VAE's latent layout — confirm.
+        audio_shape = AudioLatentShape.from_duration(batch=1, duration=video_duration, channels=8, mel_bins=16)
+        expected_frames = audio_shape.frames
+        actual_frames = encoded_audio_latent.shape[2]
+        # Force dim 2 of the latent to expected_frames: trim when too long,
+        # zero-pad at the end when too short.
+        if actual_frames > expected_frames:
+            encoded_audio_latent = encoded_audio_latent[:, :, :expected_frames, :]
+        elif actual_frames < expected_frames:
+            pad = torch.zeros(
+                encoded_audio_latent.shape[0],
+                encoded_audio_latent.shape[1],
+                expected_frames - actual_frames,
+                encoded_audio_latent.shape[3],
+                device=encoded_audio_latent.device,
+                dtype=encoded_audio_latent.dtype,
+            )
+            encoded_audio_latent = torch.cat([encoded_audio_latent, pad], dim=2)
+        video_encoder = self.model_ledger.video_encoder()
+        transformer = self.model_ledger.transformer()
+        stage_1_sigmas = torch.tensor(DISTILLED_SIGMA_VALUES, device=self.device)
+        # Shared by both stages; closes over the prompt contexts and transformer.
+        def denoising_loop(sigmas, video_state, audio_state, stepper):
+            return euler_denoising_loop(
+                sigmas=sigmas,
+                video_state=video_state,
+                audio_state=audio_state,
+                stepper=stepper,
+                denoise_fn=simple_denoising_func(
+                    video_context=video_context,
+                    audio_context=audio_context,
+                    transformer=transformer,
+                ),
+            )
+        # Stage 1: denoise at half the requested width and height.
+        stage_1_output_shape = VideoPixelShape(
+            batch=1,
+            frames=num_frames,
+            width=width // 2,
+            height=height // 2,
+            fps=frame_rate,
+        )
+        stage_1_conditionings = combined_image_conditionings(
+            images=images,
+            height=stage_1_output_shape.height,
+            width=stage_1_output_shape.width,
+            video_encoder=video_encoder,
+            dtype=dtype,
+            device=self.device,
+        )
+        video_state = denoise_video_only(
+            output_shape=stage_1_output_shape,
+            conditionings=stage_1_conditionings,
+            noiser=noiser,
+            sigmas=stage_1_sigmas,
+            stepper=stepper,
+            denoising_loop_fn=denoising_loop,
+            components=self.pipeline_components,
+            dtype=dtype,
+            device=self.device,
+            initial_audio_latent=encoded_audio_latent,
+        )
+        torch.cuda.synchronize()
+        cleanup_memory()
+        # Upsample the stage-1 latent (first batch entry only) for stage 2.
+        upscaled_video_latent = upsample_video(
+            latent=video_state.latent[:1],
+            video_encoder=video_encoder,
+            upsampler=self.model_ledger.spatial_upsampler(),
+        )
+        # Stage 2: denoise at full resolution, starting from the upsampled latent.
+        stage_2_sigmas = torch.tensor(STAGE_2_DISTILLED_SIGMA_VALUES, device=self.device)
+        stage_2_output_shape = VideoPixelShape(batch=1, frames=num_frames, width=width, height=height, fps=frame_rate)
+        stage_2_conditionings = combined_image_conditionings(
+            images=images,
+            height=stage_2_output_shape.height,
+            width=stage_2_output_shape.width,
+            video_encoder=video_encoder,
+            dtype=dtype,
+            device=self.device,
+        )
+        video_state = denoise_video_only(
+            output_shape=stage_2_output_shape,
+            conditionings=stage_2_conditionings,
+            noiser=noiser,
+            sigmas=stage_2_sigmas,
+            stepper=stepper,
+            denoising_loop_fn=denoising_loop,
+            components=self.pipeline_components,
+            dtype=dtype,
+            device=self.device,
+            noise_scale=stage_2_sigmas[0],
+            initial_video_latent=upscaled_video_latent,
+            initial_audio_latent=encoded_audio_latent,
+        )
+        torch.cuda.synchronize()
+        # Drop the local references before decoding.
+        del transformer
+        del video_encoder
+        cleanup_memory()
+        decoded_video = vae_decode_video(
+            video_state.latent,
+            self.model_ledger.video_decoder(),
+            tiling_config,
+            generator,
+        )
+        # The returned audio is the decoded input waveform, not a generated track.
+        original_audio = Audio(
+            waveform=decoded_audio.waveform.squeeze(0),
+            sampling_rate=decoded_audio.sampling_rate,
+        )
+        return decoded_video, original_audio
+# Model repos
+LTX_MODEL_REPO = "Lightricks/LTX-2.3"
+GEMMA_REPO = "google/gemma-3-12b-it-qat-q4_0-unquantized"
+# Download model checkpoints
+# These run at import time; each call returns a local path to the downloaded
+# file (or, for snapshot_download, the downloaded repository folder).
+print("=" * 80)
+print("Downloading LTX-2.3 distilled model + Gemma...")
+print("=" * 80)
+checkpoint_path = hf_hub_download(repo_id=LTX_MODEL_REPO, filename="ltx-2.3-22b-distilled-1.1.safetensors")
+spatial_upsampler_path = hf_hub_download(repo_id=LTX_MODEL_REPO, filename="ltx-2.3-spatial-upscaler-x2-1.1.safetensors")
+gemma_root = snapshot_download(repo_id=GEMMA_REPO)
+print(f"Checkpoint: {checkpoint_path}")
+print(f"Spatial upsampler: {spatial_upsampler_path}")
+print(f"Gemma root: {gemma_root}")
+# Initialize pipeline WITH text encoder and optional audio support
+# No LoRAs are applied; quantization uses the fp8_cast policy.
+pipeline = LTX23DistilledA2VPipeline(
+    distilled_checkpoint_path=checkpoint_path,
+    spatial_upsampler_path=spatial_upsampler_path,
+    gemma_root=gemma_root,
+    loras=[],
+    quantization=QuantizationPolicy.fp8_cast(),
+)
+# Preload all models for ZeroGPU tensor packing.
+# Each ledger accessor is called once here and the result kept in a
+# module-level variable.
+print("Preloading all models (including Gemma and audio components)...")
+ledger = pipeline.model_ledger
+_transformer = ledger.transformer()
+_video_encoder = ledger.video_encoder()
+_video_decoder = ledger.video_decoder()
+_audio_encoder = ledger.audio_encoder()
+_audio_decoder = ledger.audio_decoder()
+_vocoder = ledger.vocoder()
+_spatial_upsampler = ledger.spatial_upsampler()
+_text_encoder = ledger.text_encoder()
+_embeddings_processor = ledger.gemma_embeddings_processor()
+# Replace every accessor with a lambda that returns the instance built above,
+# so later ledger calls hand back the same preloaded object.
+ledger.transformer = lambda: _transformer
+ledger.video_encoder = lambda: _video_encoder
+ledger.video_decoder = lambda: _video_decoder
+ledger.audio_encoder = lambda: _audio_encoder
+ledger.audio_decoder = lambda: _audio_decoder
+ledger.vocoder = lambda: _vocoder
+ledger.spatial_upsampler = lambda: _spatial_upsampler
+ledger.text_encoder = lambda: _text_encoder
+ledger.gemma_embeddings_processor = lambda: _embeddings_processor
+print("All models preloaded (including Gemma text encoder and audio encoder)!")
+print("=" * 80)
+print("Pipeline ready!")
+print("=" * 80)
+def log_memory(tag: str):
+    if torch.cuda.is_available():
+        allocated = torch.cuda.memory_allocated() / 1024**3
+        peak = torch.cuda.max_memory_allocated() / 1024**3
+        free, total = torch.cuda.mem_get_info()
+        print(f"[VRAM {tag}] allocated={allocated:.2f}GB peak={peak:.2f}GB free={free / 1024**3:.2f}GB total={total / 1024**3:.2f}GB")
+def detect_aspect_ratio(image) -> str:
+    if image is None:
+        return "16:9"
+    if hasattr(image, "size"):
+        w, h = image.size
+    elif hasattr(image, "shape"):
+        h, w = image.shape[:2]
+    else:
+        return "16:9"
+    ratio = w / h
+    candidates = {"16:9": 16 / 9, "9:16": 9 / 16, "1:1": 1.0}
+    return min(candidates, key=lambda k: abs(ratio - candidates[k]))
+def on_image_upload(first_image, last_image, high_res):
+    ref_image = first_image if first_image is not None else last_image
+    aspect = detect_aspect_ratio(ref_image)
+    tier = "high" if high_res else "low"
+    w, h = RESOLUTIONS[tier][aspect]
+    return gr.update(value=w), gr.update(value=h)
+def on_highres_toggle(first_image, last_image, high_res):
+    ref_image = first_image if first_image is not None else last_image
+    aspect = detect_aspect_ratio(ref_image)
+    tier = "high" if high_res else "low"
+    w, h = RESOLUTIONS[tier][aspect]
+    return gr.update(value=w), gr.update(value=h)
+@spaces.GPU(duration=75)
+@torch.inference_mode()
+def generate_video(
+    first_image,
+    last_image,
+    input_audio,
+    prompt: str,
+    duration: float,
+    enhance_prompt: bool = True,
+    seed: int = 42,
+    randomize_seed: bool = True,
+    height: int = 1024,
+    width: int = 1536,
+    progress=gr.Progress(track_tqdm=True),
+):
+    try:
+        torch.cuda.reset_peak_memory_stats()
+        log_memory("start")
+        current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
+        frame_rate = DEFAULT_FRAME_RATE
+        num_frames = int(duration * frame_rate) + 1
+        num_frames = ((num_frames - 1 + 7) // 8) * 8 + 1
+        print(f"Generating: {height}x{width}, {num_frames} frames ({duration}s), seed={current_seed}")
+        images = []
+        output_dir = Path("outputs")
+        output_dir.mkdir(exist_ok=True)
+        if first_image is not None:
+            temp_first_path = output_dir / f"temp_first_{current_seed}.jpg"
+            if hasattr(first_image, "save"):
+                first_image.save(temp_first_path)
+            else:
+                temp_first_path = Path(first_image)
+            images.append(ImageConditioningInput(path=str(temp_first_path), frame_idx=0, strength=1.0))
+        if last_image is not None:
+            temp_last_path = output_dir / f"temp_last_{current_seed}.jpg"
+            if hasattr(last_image, "save"):
+                last_image.save(temp_last_path)
+            else:
+                temp_last_path = Path(last_image)
+            images.append(ImageConditioningInput(path=str(temp_last_path), frame_idx=num_frames - 1, strength=1.0))
+        tiling_config = TilingConfig.default()
+        video_chunks_number = get_video_chunks_number(num_frames, tiling_config)
+        log_memory("before pipeline call")
+        video, audio = pipeline(
+            prompt=prompt,
+            seed=current_seed,
+            height=int(height),
+            width=int(width),
+            num_frames=num_frames,
+            frame_rate=frame_rate,
+            images=images,
+            audio_path=input_audio,
+            tiling_config=tiling_config,
+            enhance_prompt=enhance_prompt,
+        )
+        log_memory("after pipeline call")
+        output_path = tempfile.mktemp(suffix=".mp4")
+        encode_video(
+            video=video,
+            fps=frame_rate,
+            audio=audio,
+            output_path=output_path,
+            video_chunks_number=video_chunks_number,
+        )
+        log_memory("after encode_video")
+        return str(output_path), current_seed
+    except Exception as e:
+        import traceback
+        log_memory("on error")
+        print(f"Error: {str(e)}\n{traceback.format_exc()}")
+        return None, current_seed
+# Gradio UI: inputs in the left column, generated video in the right column.
+with gr.Blocks(title="LTX-2.3 Distilled") as demo:
+    gr.Markdown("# LTX-2.3 F2LF: Fast Audio-Video Generation with Frame Conditioning")
+    gr.Markdown(
+        "Fast and high quality video + audio generation with first and last frame conditioning and optional audio input "
+        "[[model]](https://huggingface.co/Lightricks/LTX-2.3) "
+        "[[code]](https://github.com/Lightricks/LTX-2)"
+    )
+    with gr.Row():
+        with gr.Column():
+            with gr.Row():
+                first_image = gr.Image(label="First Frame (Optional)", type="pil")
+                last_image = gr.Image(label="Last Frame (Optional)", type="pil")
+            input_audio = gr.Audio(label="Audio Input (Optional)", type="filepath")
+            prompt = gr.Textbox(
+                label="Prompt",
+                info="for best results - make it as elaborate as possible",
+                value="Make this image come alive with cinematic motion, smooth animation",
+                lines=3,
+                placeholder="Describe the motion and animation you want...",
+            )
+            duration = gr.Slider(label="Duration (seconds)", minimum=1.0, maximum=10.0, value=3.0, step=0.1)
+            generate_btn = gr.Button("Generate Video", variant="primary", size="lg")
+            with gr.Accordion("Advanced Settings", open=False):
+                seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, value=10, step=1)
+                randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
+                with gr.Row():
+                    # Defaults match the "high" tier 16:9 preset in RESOLUTIONS.
+                    width = gr.Number(label="Width", value=1536, precision=0)
+                    height = gr.Number(label="Height", value=1024, precision=0)
+                with gr.Row():
+                    enhance_prompt = gr.Checkbox(label="Enhance Prompt", value=False)
+                    high_res = gr.Checkbox(label="High Resolution", value=True)
+        with gr.Column():
+            output_video = gr.Video(label="Generated Video", autoplay=True)
+    # One example row; values are listed in the same order as `inputs` below
+    # (no first frame, "pinkknit.jpg" as last frame, no audio, then the prompt).
+    gr.Examples(
+        examples=[
+            [
+                None,
+                "pinkknit.jpg",
+                None,
+                "The camera falls downward through darkness as if dropped into a tunnel. "
+                "As it slows, five friends wearing pink knitted hats and sunglasses lean "
+                "over and look down toward the camera with curious expressions. The lens "
+                "has a strong fisheye effect, creating a circular frame around them. They "
+                "crowd together closely, forming a symmetrical cluster while staring "
+                "directly into the lens.",
+                3.0,
+                False,
+                42,
+                True,
+                1024,
+                1024,
+            ],
+        ],
+        inputs=[
+            first_image, last_image, input_audio, prompt, duration,
+            enhance_prompt, seed, randomize_seed, height, width,
+        ],
+    )
+    # Either frame image changing re-derives the width/height preset.
+    first_image.change(
+        fn=on_image_upload,
+        inputs=[first_image, last_image, high_res],
+        outputs=[width, height],
+    )
+    last_image.change(
+        fn=on_image_upload,
+        inputs=[first_image, last_image, high_res],
+        outputs=[width, height],
+    )
+    # Toggling the high-res checkbox re-derives the preset for the current images.
+    high_res.change(
+        fn=on_highres_toggle,
+        inputs=[first_image, last_image, high_res],
+        outputs=[width, height],
+    )
+    # Input order follows generate_video's positional parameters (height
+    # before width). The returned seed is written back to the seed slider.
+    generate_btn.click(
+        fn=generate_video,
+        inputs=[
+            first_image, last_image, input_audio, prompt, duration, enhance_prompt,
+            seed, randomize_seed, height, width,
+        ],
+        outputs=[output_video, seed],
+    )
+# Custom CSS handed to launch() below.
+css = """
+.fillable{max-width: 1200px !important}
+"""
+if __name__ == "__main__":
+    # NOTE(review): theme and css are passed to launch() rather than to
+    # gr.Blocks(); confirm the installed Gradio version accepts them here.
+    demo.launch(theme=gr.themes.Citrus(), css=css)

appoutpaint.py ADDED Viewed

	@@ -0,0 +1,1246 @@

+import os
+import subprocess
+import sys
+# Disable torch.compile / dynamo before any torch import
+os.environ["TORCH_COMPILE_DISABLE"] = "1"
+os.environ["TORCHDYNAMO_DISABLE"] = "1"
+# Install xformers for memory-efficient attention
+# (check=False: a failed install is tolerated and does not abort startup).
+subprocess.run([sys.executable, "-m", "pip", "install", "xformers==0.0.32.post2", "--no-build-isolation"], check=False)
+# Install video preprocessing dependencies
+# (best-effort as well; num2words is listed here and installed again below).
+subprocess.run([sys.executable, "-m", "pip", "install",
+                "imageio[ffmpeg]", "scikit-image",
+                "opencv-python-headless", "decord", "num2words"], check=False)
+# Ensure num2words is installed (required by SmolVLMProcessor)
+# (check=True: startup aborts with CalledProcessError if this install fails).
+subprocess.run([sys.executable, "-m", "pip", "install", "num2words"], check=True)
+# Reinstall torchaudio to match the torch CUDA version on this space.
+# torch.__version__ is read in a child interpreter, so torch itself is not
+# imported into this process at this point.
+_tv = subprocess.run([sys.executable, "-c", "import torch; print(torch.__version__)"],
+                     capture_output=True, text=True)
+# If the probe failed (non-zero exit), the torchaudio reinstall is skipped.
+if _tv.returncode == 0:
+    _full_ver = _tv.stdout.strip()
+    # The text after "+" in the version string is used as the wheel index
+    # name; "cu124" is the fallback when the version carries no "+" suffix.
+    _cuda_suffix = _full_ver.split("+")[-1] if "+" in _full_ver else "cu124"
+    # The text before "+" is the version pinned for torchaudio.
+    _base_ver = _full_ver.split("+")[0]
+    print(f"Detected torch {_full_ver}, reinstalling matching torchaudio...")
+    # Best-effort (check=False); --no-deps keeps pip from touching torch itself.
+    subprocess.run([
+        sys.executable, "-m", "pip", "install", "--force-reinstall", "--no-deps",
+        f"torchaudio=={_base_ver}",
+        "--index-url", f"https://download.pytorch.org/whl/{_cuda_suffix}",
+    ], check=False)
+# Clone LTX-2 repo at a pinned commit and install packages
+LTX_REPO_URL = "https://github.com/Lightricks/LTX-2.git"
+# The checkout lives in an "LTX-2" directory next to this file.
+LTX_REPO_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "LTX-2")
+LTX_COMMIT = "ae855f8538843825f9015a419cf4ba5edaf5eec2"
+# Any existing checkout is deleted and re-cloned on every start, so the
+# working tree always comes from a fresh clone.
+if os.path.exists(LTX_REPO_DIR):
+    print(f"Removing existing repo at {LTX_REPO_DIR}...")
+    subprocess.run(["rm", "-rf", LTX_REPO_DIR], check=True)
+print(f"Cloning {LTX_REPO_URL}...")
+# Full (non-shallow) clone followed by a checkout of the pinned commit;
+# check=True makes either failure abort startup.
+subprocess.run(["git", "clone", LTX_REPO_URL, LTX_REPO_DIR], check=True)
+print(f"Checking out commit {LTX_COMMIT}...")
+subprocess.run(["git", "-C", LTX_REPO_DIR, "checkout", LTX_COMMIT], check=True)
+print("Installing ltx-core and ltx-pipelines from pinned repo commit...")
+# Editable installs of both packages; --no-deps leaves the already-installed
+# dependency set untouched.
+subprocess.run(
+    [
+        sys.executable, "-m", "pip", "install",
+        "--force-reinstall", "--no-deps",
+        "-e", os.path.join(LTX_REPO_DIR, "packages", "ltx-core"),
+        "-e", os.path.join(LTX_REPO_DIR, "packages", "ltx-pipelines"),
+    ],
+    check=True,
+)
+# Put both package source trees at the front of sys.path so the imports
+# further down resolve to the checkout made above.
+sys.path.insert(0, os.path.join(LTX_REPO_DIR, "packages", "ltx-pipelines", "src"))
+sys.path.insert(0, os.path.join(LTX_REPO_DIR, "packages", "ltx-core", "src"))
+import logging
+import random
+import tempfile
+from pathlib import Path
+import torch
+torch._dynamo.config.suppress_errors = True
+torch._dynamo.config.disable = True
+import spaces
+import gradio as gr
+import numpy as np
+from huggingface_hub import hf_hub_download, snapshot_download
+from safetensors import safe_open
+from ltx_core.components.diffusion_steps import EulerDiffusionStep
+from ltx_core.components.noisers import GaussianNoiser
+from ltx_core.conditioning import (
+    ConditioningItem,
+    ConditioningItemAttentionStrengthWrapper,
+    VideoConditionByReferenceLatent,
+)
+from ltx_core.loader import LoraPathStrengthAndSDOps
+from ltx_core.model.audio_vae import decode_audio as vae_decode_audio
+from ltx_core.model.audio_vae import encode_audio as vae_encode_audio
+from ltx_core.model.upsampler import upsample_video
+from ltx_core.model.video_vae import TilingConfig, VideoEncoder, get_video_chunks_number
+from ltx_core.model.video_vae import decode_video as vae_decode_video
+from ltx_core.quantization import QuantizationPolicy
+from ltx_core.types import Audio, AudioLatentShape, LatentState, VideoLatentShape, VideoPixelShape
+from ltx_pipelines.utils import ModelLedger, euler_denoising_loop
+from ltx_pipelines.utils.args import ImageConditioningInput
+from ltx_pipelines.utils.constants import DISTILLED_SIGMA_VALUES, STAGE_2_DISTILLED_SIGMA_VALUES
+from ltx_pipelines.utils.helpers import (
+    assert_resolution,
+    cleanup_memory,
+    combined_image_conditionings,
+    denoise_audio_video,
+    denoise_video_only,
+    encode_prompts,
+    get_device,
+    simple_denoising_func,
+)
+from ltx_pipelines.utils.media_io import (
+    decode_audio_from_file,
+    encode_video,
+    load_video_conditioning,
+)
+from ltx_pipelines.utils.types import PipelineComponents
+# Force-patch xformers attention into the LTX attention module.
+# The module attribute `memory_efficient_attention` is overwritten with the
+# xformers implementation; the before/after values are printed for debugging.
+from ltx_core.model.transformer import attention as _attn_mod
+print(f"[ATTN] Before patch: memory_efficient_attention={_attn_mod.memory_efficient_attention}")
+try:
+    from xformers.ops import memory_efficient_attention as _mea
+    _attn_mod.memory_efficient_attention = _mea
+    print(f"[ATTN] After patch: memory_efficient_attention={_attn_mod.memory_efficient_attention}")
+except Exception as e:
+    # Best-effort: if xformers cannot be imported (its install above is
+    # allowed to fail), the attribute keeps whatever value it had.
+    print(f"[ATTN] xformers patch FAILED: {type(e).__name__}: {e}")
+# Raise the root logger to INFO so the logging.info() calls below are emitted.
+logging.getLogger().setLevel(logging.INFO)
+# ─────────────────────────────────────────────────────────────────────────────
+# Video Preprocessing: Letterboxing / Outpainting preparation
+# ─────────────────────────────────────────────────────────────────────────────
+import imageio
+import cv2
+from PIL import Image
+def load_video_frames(video_path: str) -> list[np.ndarray]:
+    """Load video frames as list of HWC uint8 numpy arrays."""
+    frames = []
+    with imageio.get_reader(video_path) as reader:
+        for frame in reader:
+            frames.append(frame)
+    return frames
+def write_video_mp4(frames: list[np.ndarray], fps: float, out_path: str) -> str:
+    """Write uint8 HWC frames to mp4."""
+    with imageio.get_writer(out_path, fps=fps, macro_block_size=1) as writer:
+        for fr in frames:
+            writer.append_data(fr)
+    return out_path
+def get_video_fps(video_path: str) -> float:
+    """Get video FPS via ffprobe."""
+    try:
+        result = subprocess.run(
+            ["ffprobe", "-v", "error", "-select_streams", "v:0",
+             "-show_entries", "stream=r_frame_rate", "-of", "default=nw=1:nk=1",
+             str(video_path)],
+            capture_output=True, text=True,
+        )
+        num, den = result.stdout.strip().split("/")
+        return float(num) / float(den)
+    except Exception:
+        return 24.0
+def get_video_dimensions(video_path: str) -> tuple[int, int]:
+    """Return (width, height) of video."""
+    try:
+        result = subprocess.run(
+            ["ffprobe", "-v", "error", "-select_streams", "v:0",
+             "-show_entries", "stream=width,height", "-of", "csv=s=x:p=0",
+             str(video_path)],
+            capture_output=True, text=True,
+        )
+        parts = result.stdout.strip().split("x")
+        return int(parts[0]), int(parts[1])
+    except Exception:
+        return 768, 512
+def apply_gamma(frame: np.ndarray, gamma: float) -> np.ndarray:
+    """Apply gamma correction to a uint8 frame. Returns uint8."""
+    # Normalize to [0,1], apply gamma, back to uint8
+    f = frame.astype(np.float32) / 255.0
+    f = np.power(f, 1.0 / gamma)  # gamma 2.0 => exponent 0.5 => brightens
+    return (np.clip(f, 0.0, 1.0) * 255).astype(np.uint8)
+def apply_inverse_gamma(frame: np.ndarray, gamma: float) -> np.ndarray:
+    """Apply inverse gamma (darken back). gamma=2.0 forward => gamma=0.5 inverse => exponent 2.0"""
+    f = frame.astype(np.float32) / 255.0
+    f = np.power(f, gamma)  # gamma 2.0 => exponent 2.0 => darkens
+    return (np.clip(f, 0.0, 1.0) * 255).astype(np.uint8)
+def compute_letterbox_params(
+    src_w: int, src_h: int, target_w: int, target_h: int
+) -> tuple[int, int, int, int]:
+    """
+    Compute padding to place src in the center of target canvas.
+    Returns (pad_top, pad_bottom, pad_left, pad_right).
+    Source is scaled to fit inside target while maintaining aspect ratio,
+    then centered with black bars.
+    """
+    src_aspect = src_w / src_h
+    target_aspect = target_w / target_h
+    if src_aspect > target_aspect:
+        # Source is wider — fit to width, pad top/bottom
+        new_w = target_w
+        new_h = int(round(target_w / src_aspect))
+    else:
+        # Source is taller — fit to height, pad left/right
+        new_h = target_h
+        new_w = int(round(target_h * src_aspect))
+    pad_top = (target_h - new_h) // 2
+    pad_bottom = target_h - new_h - pad_top
+    pad_left = (target_w - new_w) // 2
+    pad_right = target_w - new_w - pad_left
+    return pad_top, pad_bottom, pad_left, pad_right, new_w, new_h
+def letterbox_frame(frame: np.ndarray, target_w: int, target_h: int) -> np.ndarray:
+    """Resize frame to fit inside target dimensions, pad with black (0,0,0)."""
+    src_h, src_w = frame.shape[:2]
+    pad_top, pad_bottom, pad_left, pad_right, new_w, new_h = compute_letterbox_params(
+        src_w, src_h, target_w, target_h
+    )
+    # Resize source to fit
+    resized = cv2.resize(frame, (new_w, new_h), interpolation=cv2.INTER_AREA)
+    # Create black canvas and paste
+    canvas = np.zeros((target_h, target_w, 3), dtype=np.uint8)
+    canvas[pad_top:pad_top + new_h, pad_left:pad_left + new_w] = resized
+    return canvas
+def letterbox_video(
+    video_path: str,
+    target_w: int,
+    target_h: int,
+    use_gamma: bool = False,
+    num_frames: int | None = None,
+    burnin_frames: int = 0,
+) -> tuple[str, str]:
+    """
+    Letterbox a video to target dimensions with black bars.
+    Optionally applies gamma 2.0 brightening for dark scenes.
+    burnin_frames: extra copies of the first frame prepended to give the
+    model time to fill the black regions before actual content starts.
+    Returns: (letterboxed_video_path, first_frame_preview_path)
+    """
+    frames = load_video_frames(video_path)
+    if not frames:
+        raise ValueError("No frames decoded from video")
+    fps = get_video_fps(video_path)
+    if num_frames is not None:
+        # Reserve space: we need num_frames of actual content + burn-in
+        frames = frames[:num_frames]
+    # Prepend burn-in copies of the first frame
+    if burnin_frames > 0:
+        frames = [frames[0]] * burnin_frames + frames
+    processed = []
+    for frame in frames:
+        lb = letterbox_frame(frame, target_w, target_h)
+        if use_gamma:
+            lb = apply_gamma(lb, gamma=2.0)
+        processed.append(lb)
+    # Save letterboxed video
+    out_path = tempfile.mktemp(suffix=".mp4")
+    write_video_mp4(processed, fps=fps, out_path=out_path)
+    # Preview is the first real content frame (after burn-in)
+    preview_path = tempfile.mktemp(suffix=".png")
+    Image.fromarray(processed[min(burnin_frames, len(processed) - 1)]).save(preview_path)
+    return out_path, preview_path
+def apply_inverse_gamma_to_video(video_path: str) -> str:
+    """Apply inverse gamma 0.5 to all frames of a video (undo the gamma 2.0 brightening)."""
+    frames = load_video_frames(video_path)
+    fps = get_video_fps(video_path)
+    corrected = []
+    for frame in frames:
+        corrected.append(apply_inverse_gamma(frame, gamma=2.0))
+    out_path = tempfile.mktemp(suffix=".mp4")
+    write_video_mp4(corrected, fps=fps, out_path=out_path)
+    return out_path
+def trim_video_start(video_path: str, trim_frames: int, frame_rate: float) -> str:
+    """
+    Trim the first N frames (and matching audio) from the output.
+    Since we prepended silence to the audio matching the burn-in duration,
+    trimming both video and audio by the same amount removes the burn-in
+    video frames AND the silence, leaving everything in sync.
+    """
+    if trim_frames <= 0:
+        return video_path
+    trim_seconds = trim_frames / frame_rate
+    out_path = tempfile.mktemp(suffix=".mp4")
+    subprocess.run(
+        ["ffmpeg", "-y", "-v", "error",
+         "-ss", f"{trim_seconds:.4f}",
+         "-i", video_path,
+         "-c:v", "libx264", "-crf", "18", "-preset", "fast",
+         "-c:a", "aac",
+         out_path],
+        check=True,
+    )
+    return out_path
+# ─────────────────────────────────────────────────────────────────────────────
+# Helper: read reference downscale factor from IC-LoRA metadata
+# ─────────────────────────────────────────────────────────────────────────────
+def _read_lora_reference_downscale_factor(lora_path: str) -> int:
+    try:
+        with safe_open(lora_path, framework="pt") as f:
+            metadata = f.metadata() or {}
+            return int(metadata.get("reference_downscale_factor", 1))
+    except Exception as e:
+        logging.warning(f"Failed to read metadata from LoRA file '{lora_path}': {e}")
+        return 1
+# ─────────────────────────────────────────────────────────────────────────────
+# Unified Pipeline: Distilled + Audio + IC-LoRA Video-to-Video
+# ─────────────────────────────────────────────────────────────────────────────
+class LTX23OutpaintPipeline:
+    """
+    LTX-2.3 pipeline for outpainting using IC-LoRA.
+    The outpaint LoRA is loaded separately (not fused), so:
+      - stage_1_model_ledger: base transformer + outpaint LoRA (Stage 1)
+      - stage_2_model_ledger: base transformer WITHOUT LoRA (Stage 2 upsampling)
+    When no IC-LoRA is passed to the constructor, only one ledger is built and
+    both attributes refer to the same object.
+    """
+    def __init__(
+        self,
+        distilled_checkpoint_path: str,
+        spatial_upsampler_path: str,
+        gemma_root: str,
+        ic_loras: list[LoraPathStrengthAndSDOps] | None = None,
+        device: torch.device | None = None,
+        quantization: QuantizationPolicy | None = None,
+        stage_1_quantization: QuantizationPolicy | None = None,
+        reference_downscale_factor: int | None = None,
+    ):
+        """
+        Build the model ledgers and resolve the reference downscale factor.
+
+        Args:
+            distilled_checkpoint_path: Checkpoint passed to every ledger.
+            spatial_upsampler_path: Spatial upsampler weights passed to every ledger.
+            gemma_root: Gemma root directory passed to every ledger.
+            ic_loras: IC-LoRAs applied to the Stage 1 ledger only; None means none.
+            device: Target device; defaults to get_device().
+            quantization: Policy for the Stage 2 ledger, and for Stage 1 when
+                stage_1_quantization is None.
+            stage_1_quantization: Optional policy override for the Stage 1 ledger.
+            reference_downscale_factor: Explicit factor; when None it is read
+                from the metadata of each LoRA in ic_loras (default 1).
+
+        Raises:
+            ValueError: If two LoRAs declare different non-1 downscale factors.
+        """
+        self.device = device or get_device()
+        self.dtype = torch.bfloat16
+        ic_loras = ic_loras or []
+        self.has_ic_lora = len(ic_loras) > 0
+        # Stage 1 quantization: use stage_1_quantization if provided,
+        # otherwise fall back to the shared quantization policy.
+        # On ZeroGPU, fp8_cast LoRA fusion requires CUDA at init time,
+        # so we typically pass None for Stage 1 (with LoRA) to avoid the issue.
+        # NOTE: because of the fallback below, leaving stage_1_quantization as
+        # None does NOT disable quantization for Stage 1 when `quantization`
+        # is set; Stage 1 then uses the shared policy.
+        s1_quant = stage_1_quantization if stage_1_quantization is not None else quantization
+        # Stage 1: transformer with IC-LoRA (outpaint). Its quantization is
+        # whatever s1_quant resolved to above.
+        self.stage_1_model_ledger = ModelLedger(
+            dtype=self.dtype,
+            device=self.device,
+            checkpoint_path=distilled_checkpoint_path,
+            spatial_upsampler_path=spatial_upsampler_path,
+            gemma_root_path=gemma_root,
+            loras=ic_loras,
+            quantization=s1_quant,
+        )
+        if self.has_ic_lora:
+            # Stage 2 needs a separate transformer WITHOUT IC-LoRA
+            # Can safely use fp8_cast here since no LoRA fusion is involved
+            self.stage_2_model_ledger = ModelLedger(
+                dtype=self.dtype,
+                device=self.device,
+                checkpoint_path=distilled_checkpoint_path,
+                spatial_upsampler_path=spatial_upsampler_path,
+                gemma_root_path=gemma_root,
+                loras=[],
+                quantization=quantization,
+            )
+        else:
+            # No LoRA: reuse the Stage 1 ledger object for Stage 2.
+            self.stage_2_model_ledger = self.stage_1_model_ledger
+        self.pipeline_components = PipelineComponents(
+            dtype=self.dtype,
+            device=self.device,
+        )
+        # Reference downscale factor
+        if reference_downscale_factor is not None:
+            self.reference_downscale_factor = reference_downscale_factor
+        else:
+            # Derive it from LoRA metadata: every LoRA declaring a non-1
+            # factor must agree with the others, otherwise it is an error.
+            self.reference_downscale_factor = 1
+            for lora in ic_loras:
+                scale = _read_lora_reference_downscale_factor(lora.path)
+                if scale != 1:
+                    if self.reference_downscale_factor not in (1, scale):
+                        raise ValueError(
+                            f"Conflicting reference_downscale_factor: "
+                            f"already {self.reference_downscale_factor}, got {scale}"
+                        )
+                    self.reference_downscale_factor = scale
+        logging.info(f"[Pipeline] reference_downscale_factor={self.reference_downscale_factor}")
+    # ── Video reference conditioning (IC-LoRA) ─────────────────────────────
+    def _create_ic_conditionings(
+        self,
+        video_conditioning: list[tuple[str, float]],
+        height: int,
+        width: int,
+        num_frames: int,
+        video_encoder: VideoEncoder,
+        conditioning_strength: float = 1.0,
+    ) -> list[ConditioningItem]:
+        """
+        Create IC-LoRA video reference conditioning items.
+
+        Args:
+            video_conditioning: (video_path, strength) pairs, one per reference video.
+            height: Generation height; the reference is loaded at height // scale.
+            width: Generation width; the reference is loaded at width // scale.
+            num_frames: Passed to the loader as its frame cap.
+            video_encoder: Encoder applied to each loaded reference video.
+            conditioning_strength: When below 1.0, each item is wrapped in a
+                ConditioningItemAttentionStrengthWrapper with this value as
+                its attention_mask.
+
+        Returns:
+            One conditioning item per entry in video_conditioning.
+        """
+        conditionings: list[ConditioningItem] = []
+        scale = self.reference_downscale_factor
+        # Reference videos are loaded at the (integer-divided) downscaled size.
+        ref_height = height // scale
+        ref_width = width // scale
+        for video_path, strength in video_conditioning:
+            video = load_video_conditioning(
+                video_path=video_path,
+                height=ref_height,
+                width=ref_width,
+                frame_cap=num_frames,
+                dtype=self.dtype,
+                device=self.device,
+            )
+            encoded_video = video_encoder(video)
+            cond = VideoConditionByReferenceLatent(
+                latent=encoded_video,
+                downscale_factor=scale,
+                strength=strength,
+            )
+            if conditioning_strength < 1.0:
+                cond = ConditioningItemAttentionStrengthWrapper(
+                    cond, attention_mask=conditioning_strength
+                )
+            conditionings.append(cond)
+        if conditionings:
+            logging.info(f"[IC-LoRA] Added {len(conditionings)} video conditioning(s)")
+        return conditionings
+    # ── Main generation entry point ──────────────────────────────────────
+    def __call__(
+        self,
+        prompt: str,
+        seed: int,
+        height: int,
+        width: int,
+        num_frames: int,
+        frame_rate: float,
+        images: list[ImageConditioningInput],
+        audio_path: str | None = None,
+        video_conditioning: list[tuple[str, float]] | None = None,
+        tiling_config: TilingConfig | None = None,
+        enhance_prompt: bool = False,
+        conditioning_strength: float = 1.0,
+    ):
+        """
+        Generate outpainted video.
+        The video_conditioning should contain the letterboxed video (with black bars).
+
+        Generation runs in two stages: Stage 1 denoises at half the requested
+        width/height using the Stage 1 ledger's transformer; Stage 2 upsamples
+        that latent and refines it at full size using the Stage 2 ledger's
+        transformer.
+
+        Args:
+            prompt: Text prompt to encode.
+            seed: Seed for the torch.Generator driving the noiser and decoder.
+            height: Output height, validated by assert_resolution.
+            width: Output width, validated by assert_resolution.
+            num_frames: Number of frames to generate.
+            frame_rate: Frames per second of the output.
+            images: Image conditionings; the first one's path is also handed to
+                prompt enhancement.
+            audio_path: Optional external audio. When given, only video is
+                denoised and the decoded input audio is returned as-is.
+            video_conditioning: Optional (video_path, strength) reference videos,
+                applied in Stage 1 only.
+            tiling_config: Passed through to the video decoder.
+            enhance_prompt: Passed to encode_prompts as enhance_first_prompt.
+            conditioning_strength: Forwarded to _create_ic_conditionings.
+
+        Returns:
+            (decoded_video, output_audio).
+
+        Raises:
+            ValueError: If audio_path is given but no audio could be decoded.
+        """
+        assert_resolution(height=height, width=width, is_two_stage=True)
+        has_audio = audio_path is not None
+        has_video_cond = bool(video_conditioning)
+        generator = torch.Generator(device=self.device).manual_seed(seed)
+        noiser = GaussianNoiser(generator=generator)
+        stepper = EulerDiffusionStep()
+        dtype = torch.bfloat16
+        # ── Encode text prompt ───────────────────────────────────────────
+        (ctx_p,) = encode_prompts(
+            [prompt],
+            self.stage_1_model_ledger,
+            enhance_first_prompt=enhance_prompt,
+            enhance_prompt_image=images[0].path if len(images) > 0 else None,
+        )
+        video_context, audio_context = ctx_p.video_encoding, ctx_p.audio_encoding
+        # ── Encode external audio (if provided) ─────────────────────────
+        encoded_audio_latent = None
+        decoded_audio_for_output = None
+        if has_audio:
+            # Only the first num_frames / frame_rate seconds of audio are read.
+            video_duration = num_frames / frame_rate
+            decoded_audio = decode_audio_from_file(audio_path, self.device, 0.0, video_duration)
+            if decoded_audio is None:
+                raise ValueError(f"Could not extract audio stream from {audio_path}")
+            encoded_audio_latent = vae_encode_audio(
+                decoded_audio, self.stage_1_model_ledger.audio_encoder()
+            )
+            audio_shape = AudioLatentShape.from_duration(
+                batch=1, duration=video_duration, channels=8, mel_bins=16
+            )
+            # Force the latent to the expected length along dim 2: crop when
+            # it is too long, zero-pad at the end when it is too short.
+            expected_frames = audio_shape.frames
+            actual_frames = encoded_audio_latent.shape[2]
+            if actual_frames > expected_frames:
+                encoded_audio_latent = encoded_audio_latent[:, :, :expected_frames, :]
+            elif actual_frames < expected_frames:
+                pad = torch.zeros(
+                    encoded_audio_latent.shape[0], encoded_audio_latent.shape[1],
+                    expected_frames - actual_frames, encoded_audio_latent.shape[3],
+                    device=encoded_audio_latent.device, dtype=encoded_audio_latent.dtype,
+                )
+                encoded_audio_latent = torch.cat([encoded_audio_latent, pad], dim=2)
+            # Kept so the original waveform can be returned unchanged at the end.
+            decoded_audio_for_output = Audio(
+                waveform=decoded_audio.waveform.squeeze(0),
+                sampling_rate=decoded_audio.sampling_rate,
+            )
+        # ── Build conditionings for Stage 1 ──────────────────────────────
+        video_encoder = self.stage_1_model_ledger.video_encoder()
+        # Stage 1 works at half the requested width and height.
+        stage_1_output_shape = VideoPixelShape(
+            batch=1, frames=num_frames,
+            width=width // 2, height=height // 2, fps=frame_rate,
+        )
+        # Image conditionings (first frame of letterboxed video)
+        stage_1_conditionings = combined_image_conditionings(
+            images=images,
+            height=stage_1_output_shape.height,
+            width=stage_1_output_shape.width,
+            video_encoder=video_encoder,
+            dtype=dtype,
+            device=self.device,
+        )
+        # IC-LoRA video reference conditionings (the letterboxed video)
+        if has_video_cond:
+            ic_conds = self._create_ic_conditionings(
+                video_conditioning=video_conditioning,
+                height=stage_1_output_shape.height,
+                width=stage_1_output_shape.width,
+                num_frames=num_frames,
+                video_encoder=video_encoder,
+                conditioning_strength=conditioning_strength,
+            )
+            stage_1_conditionings.extend(ic_conds)
+        # ── Stage 1: Low-res generation ──────────────────────────────────
+        transformer = self.stage_1_model_ledger.transformer()
+        stage_1_sigmas = torch.Tensor(DISTILLED_SIGMA_VALUES).to(self.device)
+        # Closure binding the Stage 1 transformer and the prompt contexts.
+        def denoising_loop(sigmas, video_state, audio_state, stepper):
+            return euler_denoising_loop(
+                sigmas=sigmas,
+                video_state=video_state,
+                audio_state=audio_state,
+                stepper=stepper,
+                denoise_fn=simple_denoising_func(
+                    video_context=video_context,
+                    audio_context=audio_context,
+                    transformer=transformer,
+                ),
+            )
+        if has_audio:
+            # External audio: denoise video only, with the encoded audio
+            # supplied as the initial audio latent.
+            video_state = denoise_video_only(
+                output_shape=stage_1_output_shape,
+                conditionings=stage_1_conditionings,
+                noiser=noiser,
+                sigmas=stage_1_sigmas,
+                stepper=stepper,
+                denoising_loop_fn=denoising_loop,
+                components=self.pipeline_components,
+                dtype=dtype,
+                device=self.device,
+                initial_audio_latent=encoded_audio_latent,
+            )
+            audio_state = None
+        else:
+            # No external audio: denoise audio and video jointly.
+            video_state, audio_state = denoise_audio_video(
+                output_shape=stage_1_output_shape,
+                conditionings=stage_1_conditionings,
+                noiser=noiser,
+                sigmas=stage_1_sigmas,
+                stepper=stepper,
+                denoising_loop_fn=denoising_loop,
+                components=self.pipeline_components,
+                dtype=dtype,
+                device=self.device,
+            )
+        torch.cuda.synchronize()
+        cleanup_memory()
+        # ── Stage 2: Upsample + Refine ──────────────────────────────────
+        # Only the first entry along dim 0 of the Stage 1 latent is upsampled
+        # (presumably the batch dimension — TODO confirm).
+        upscaled_video_latent = upsample_video(
+            latent=video_state.latent[:1],
+            video_encoder=video_encoder,
+            upsampler=self.stage_2_model_ledger.spatial_upsampler(),
+        )
+        torch.cuda.synchronize()
+        cleanup_memory()
+        # Stage 2 uses the transformer WITHOUT IC-LoRA
+        transformer_s2 = self.stage_2_model_ledger.transformer()
+        stage_2_sigmas = torch.Tensor(STAGE_2_DISTILLED_SIGMA_VALUES).to(self.device)
+        # Same closure as Stage 1, but bound to the Stage 2 transformer.
+        def denoising_loop_s2(sigmas, video_state, audio_state, stepper):
+            return euler_denoising_loop(
+                sigmas=sigmas,
+                video_state=video_state,
+                audio_state=audio_state,
+                stepper=stepper,
+                denoise_fn=simple_denoising_func(
+                    video_context=video_context,
+                    audio_context=audio_context,
+                    transformer=transformer_s2,
+                ),
+            )
+        stage_2_output_shape = VideoPixelShape(
+            batch=1, frames=num_frames,
+            width=width, height=height, fps=frame_rate,
+        )
+        # Stage 2 rebuilds only the image conditionings at full size; the
+        # IC-LoRA video reference conditionings are not added here.
+        stage_2_conditionings = combined_image_conditionings(
+            images=images,
+            height=stage_2_output_shape.height,
+            width=stage_2_output_shape.width,
+            video_encoder=video_encoder,
+            dtype=dtype,
+            device=self.device,
+        )
+        if has_audio:
+            # Refinement starts from the upsampled latent, re-noised with the
+            # first Stage 2 sigma as the noise scale.
+            video_state = denoise_video_only(
+                output_shape=stage_2_output_shape,
+                conditionings=stage_2_conditionings,
+                noiser=noiser,
+                sigmas=stage_2_sigmas,
+                stepper=stepper,
+                denoising_loop_fn=denoising_loop_s2,
+                components=self.pipeline_components,
+                dtype=dtype,
+                device=self.device,
+                noise_scale=stage_2_sigmas[0],
+                initial_video_latent=upscaled_video_latent,
+                initial_audio_latent=encoded_audio_latent,
+            )
+            audio_state = None
+        else:
+            # The Stage 1 audio latent seeds the Stage 2 joint denoise.
+            video_state, audio_state = denoise_audio_video(
+                output_shape=stage_2_output_shape,
+                conditionings=stage_2_conditionings,
+                noiser=noiser,
+                sigmas=stage_2_sigmas,
+                stepper=stepper,
+                denoising_loop_fn=denoising_loop_s2,
+                components=self.pipeline_components,
+                dtype=dtype,
+                device=self.device,
+                noise_scale=stage_2_sigmas[0],
+                initial_video_latent=upscaled_video_latent,
+                initial_audio_latent=audio_state.latent,
+            )
+        torch.cuda.synchronize()
+        # Drop the local references before decoding, then run cleanup.
+        del transformer, transformer_s2, video_encoder
+        cleanup_memory()
+        # ── Decode ───────────────────────────────────────────────────────
+        decoded_video = vae_decode_video(
+            video_state.latent,
+            self.stage_2_model_ledger.video_decoder(),
+            tiling_config,
+            generator,
+        )
+        if has_audio:
+            # External audio is passed through rather than decoded from a latent.
+            output_audio = decoded_audio_for_output
+        else:
+            output_audio = vae_decode_audio(
+                audio_state.latent,
+                self.stage_2_model_ledger.audio_decoder(),
+                self.stage_2_model_ledger.vocoder(),
+            )
+        return decoded_video, output_audio
+# ─────────────────────────────────────────────────────────────────────────────
+# Constants
+# ─────────────────────────────────────────────────────────────────────────────
+# Largest value of a signed 32-bit integer.
+MAX_SEED = np.iinfo(np.int32).max
+DEFAULT_FRAME_RATE = 24.0
+# Output resolutions for outpainting (the expanded canvas)
+# Keyed by quality tier ("high"/"low"), then by aspect-ratio label. Each value
+# is a pair of pixel sizes, presumably (width, height) — TODO confirm against
+# get_target_resolution.
+RESOLUTIONS = {
+    "high": {"16:9": (1536, 1024), "9:16": (1024, 1536), "1:1": (1024, 1024),
+             "4:3": (1536, 1152), "3:4": (1152, 1536), "21:9": (1536, 768)},
+    "low":  {"16:9": (768, 512), "9:16": (512, 768), "1:1": (768, 768),
+             "4:3": (768, 576), "3:4": (576, 768), "21:9": (768, 384)},
+}
+# Outpaint fused checkpoint (base + LoRA pre-merged)
+# Hugging Face repo id and file name, used for the checkpoint download below.
+FUSED_CHECKPOINT_REPO = "linoyts/ltx-2.3-22b-fused-outpaint"
+FUSED_CHECKPOINT_FILENAME = "ltx-2.3-22b-fused-outpaint.safetensors"
+# ─────────────────────────────────────────────────────────────────────────────
+# Download Models
+# ─────────────────────────────────────────────────────────────────────────────
+# Hugging Face repo ids for the spatial upsampler and the Gemma snapshot.
+LTX_MODEL_REPO = "Lightricks/LTX-2.3"
+GEMMA_REPO = "google/gemma-3-12b-it-qat-q4_0-unquantized"
+print("=" * 80)
+print("Downloading LTX-2.3 fused outpaint model + Gemma...")
+print("=" * 80)
+# Fused checkpoint: base distilled + outpaint LoRA already merged
+checkpoint_path = hf_hub_download(
+    repo_id=FUSED_CHECKPOINT_REPO, filename=FUSED_CHECKPOINT_FILENAME
+)
+# Single-file download of the x2 spatial upscaler weights.
+spatial_upsampler_path = hf_hub_download(
+    repo_id=LTX_MODEL_REPO, filename="ltx-2.3-spatial-upscaler-x2-1.1.safetensors"
+)
+# Whole-repo snapshot; the returned directory is used as the Gemma root.
+gemma_root = snapshot_download(repo_id=GEMMA_REPO)
+print(f"Checkpoint (fused): {checkpoint_path}")
+print(f"Spatial upsampler: {spatial_upsampler_path}")
+print(f"Gemma root: {gemma_root}")
+# ─────────────────────────────────────────────────────────────────────────────
+# Initialize Pipeline
+# ─────────────────────────────────────────────────────────────────────────────
+# Built once at import time. No ic_loras are passed, so the pipeline keeps a
+# single model ledger that serves both stages.
+pipeline = LTX23OutpaintPipeline(
+    distilled_checkpoint_path=checkpoint_path,
+    spatial_upsampler_path=spatial_upsampler_path,
+    gemma_root=gemma_root,
+    # ic_loras=[]  — LoRA already fused into checkpoint
+    quantization=QuantizationPolicy.fp8_cast(),
+    # With the LoRA fused there is no LoRA metadata to read the reference
+    # downscale factor from, so it is set explicitly. The outpaint IC-LoRA
+    # used 1.
+    reference_downscale_factor=1,
+)
+# Preload all models for ZeroGPU tensor packing.
+# Each ledger accessor is called once here and then replaced, on the ledger
+# instance, by a lambda returning the already-built object, so every later
+# call hands back the same instance instead of building a new one.
+print("Preloading all models...")
+_ledger_1 = pipeline.stage_1_model_ledger
+_ledger_2 = pipeline.stage_2_model_ledger
+# True when the pipeline was built without IC-LoRA and both stages share one ledger.
+_shared = _ledger_1 is _ledger_2
+# Stage 1 models (with outpaint LoRA)
+_s1_transformer = _ledger_1.transformer()
+_s1_video_encoder = _ledger_1.video_encoder()
+_s1_text_encoder = _ledger_1.text_encoder()
+_s1_embeddings = _ledger_1.gemma_embeddings_processor()
+_s1_audio_encoder = _ledger_1.audio_encoder()
+_ledger_1.transformer = lambda: _s1_transformer
+_ledger_1.video_encoder = lambda: _s1_video_encoder
+_ledger_1.text_encoder = lambda: _s1_text_encoder
+_ledger_1.gemma_embeddings_processor = lambda: _s1_embeddings
+_ledger_1.audio_encoder = lambda: _s1_audio_encoder
+if _shared:
+    # Single ledger: the decoders, vocoder and upsampler that Stage 2 asks
+    # for are cached on the same ledger object.
+    _video_decoder = _ledger_1.video_decoder()
+    _audio_decoder = _ledger_1.audio_decoder()
+    _vocoder = _ledger_1.vocoder()
+    _spatial_upsampler = _ledger_1.spatial_upsampler()
+    _ledger_1.video_decoder = lambda: _video_decoder
+    _ledger_1.audio_decoder = lambda: _audio_decoder
+    _ledger_1.vocoder = lambda: _vocoder
+    _ledger_1.spatial_upsampler = lambda: _spatial_upsampler
+    print("  (single shared ledger — no IC-LoRA)")
+else:
+    # Stage 2 models (separate transformer without IC-LoRA)
+    _s2_transformer = _ledger_2.transformer()
+    _s2_video_encoder = _ledger_2.video_encoder()
+    _s2_video_decoder = _ledger_2.video_decoder()
+    _s2_audio_decoder = _ledger_2.audio_decoder()
+    _s2_vocoder = _ledger_2.vocoder()
+    _s2_spatial_upsampler = _ledger_2.spatial_upsampler()
+    _s2_text_encoder = _ledger_2.text_encoder()
+    _s2_embeddings = _ledger_2.gemma_embeddings_processor()
+    _s2_audio_encoder = _ledger_2.audio_encoder()
+    _ledger_2.transformer = lambda: _s2_transformer
+    _ledger_2.video_encoder = lambda: _s2_video_encoder
+    _ledger_2.video_decoder = lambda: _s2_video_decoder
+    _ledger_2.audio_decoder = lambda: _s2_audio_decoder
+    _ledger_2.vocoder = lambda: _s2_vocoder
+    _ledger_2.spatial_upsampler = lambda: _s2_spatial_upsampler
+    _ledger_2.text_encoder = lambda: _s2_text_encoder
+    _ledger_2.gemma_embeddings_processor = lambda: _s2_embeddings
+    _ledger_2.audio_encoder = lambda: _s2_audio_encoder
+    print("  (two separate ledgers — IC-LoRA active)")
+print("All models preloaded!")
+print("=" * 80)
+# ─────────────────────────────────────────────────────────────────────────────
+# UI Helpers
+# ─────────────────────────────────────────────────────────────────────────────
+def detect_aspect_ratio(media_path) -> str:
+    """Detect the closest aspect ratio from a video."""
+    if media_path is None:
+        return "16:9"
+    try:
+        w, h = get_video_dimensions(str(media_path))
+    except Exception:
+        return "16:9"
+    ratio = w / h
+    candidates = {
+        "16:9": 16 / 9, "9:16": 9 / 16, "1:1": 1.0,
+        "4:3": 4 / 3, "3:4": 3 / 4, "21:9": 21 / 9,
+    }
+    return min(candidates, key=lambda k: abs(ratio - candidates[k]))
+def _get_video_duration(video_path) -> float | None:
+    """Get video duration in seconds via ffprobe."""
+    if video_path is None:
+        return None
+    try:
+        result = subprocess.run(
+            ["ffprobe", "-v", "error", "-select_streams", "v:0",
+             "-show_entries", "format=duration", "-of", "default=nw=1:nk=1",
+             str(video_path)],
+            capture_output=True, text=True,
+        )
+        return float(result.stdout.strip())
+    except Exception:
+        return None
+def on_video_upload(video, high_res):
+    """Auto-set duration when video is uploaded."""
+    vid_dur = _get_video_duration(video)
+    if vid_dur is not None:
+        dur = round(min(vid_dur, 15.0), 1)
+    else:
+        dur = 3.0
+    return gr.update(value=dur)
+def get_target_resolution(target_aspect: str, high_res: bool) -> tuple[int, int]:
+    """Get the target output resolution for the selected aspect ratio."""
+    tier = "high" if high_res else "low"
+    return RESOLUTIONS[tier].get(target_aspect, RESOLUTIONS[tier]["16:9"])
+def preview_letterbox(video, target_aspect, high_res, use_gamma):
+    """Generate a preview of the letterboxed first frame."""
+    if video is None:
+        return None, gr.update(), gr.update()
+    target_w, target_h = get_target_resolution(target_aspect, high_res)
+    # Load first frame only for preview
+    frames = load_video_frames(str(video))
+    if not frames:
+        return None, gr.update(value=target_w), gr.update(value=target_h)
+    frame = letterbox_frame(frames[0], target_w, target_h)
+    if use_gamma:
+        frame = apply_gamma(frame, gamma=2.0)
+    preview_path = tempfile.mktemp(suffix=".png")
+    Image.fromarray(frame).save(preview_path)
+    return preview_path, gr.update(value=target_w), gr.update(value=target_h)
+# ─────────────────────────────────────────────────────────────────────────────
+# Audio extraction
+# ─────────────────────────────────────────────────────────────────────────────
+def _extract_audio_from_video(video_path: str) -> str | None:
+    """Extract audio from video as a temp WAV file. Returns None if no audio."""
+    out_path = tempfile.mktemp(suffix=".wav")
+    try:
+        probe = subprocess.run(
+            ["ffprobe", "-v", "error", "-select_streams", "a:0",
+             "-show_entries", "stream=codec_type", "-of", "default=nw=1:nk=1",
+             video_path],
+            capture_output=True, text=True,
+        )
+        if not probe.stdout.strip():
+            return None
+        subprocess.run(
+            ["ffmpeg", "-y", "-v", "error", "-i", video_path,
+             "-vn", "-ac", "2", "-ar", "48000", "-c:a", "pcm_s16le", out_path],
+            check=True,
+        )
+        return out_path
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        return None
+def _prepend_silence_to_audio(audio_path: str, silence_duration: float) -> str:
+    """Prepend silence to an audio file so it starts later in the timeline.
+    This aligns audio with the real content when burn-in frames are prepended to video."""
+    if silence_duration <= 0:
+        return audio_path
+    out_path = tempfile.mktemp(suffix=".wav")
+    # Generate silence then concatenate with original audio
+    subprocess.run(
+        ["ffmpeg", "-y", "-v", "error",
+         "-f", "lavfi", "-i", f"anullsrc=r=48000:cl=stereo:d={silence_duration:.4f}",
+         "-i", audio_path,
+         "-filter_complex", "[0:a][1:a]concat=n=2:v=0:a=1[out]",
+         "-map", "[out]",
+         "-ac", "2", "-ar", "48000", "-c:a", "pcm_s16le",
+         out_path],
+        check=True,
+    )
+    return out_path
+def _mux_audio_to_video(video_path: str, audio_path: str) -> str:
+    """Mux an external audio track into a video, trimming to the shorter of the two."""
+    out_path = tempfile.mktemp(suffix=".mp4")
+    subprocess.run(
+        ["ffmpeg", "-y", "-v", "error",
+         "-i", video_path,
+         "-i", audio_path,
+         "-c:v", "copy",
+         "-c:a", "aac",
+         "-map", "0:v:0", "-map", "1:a:0",
+         "-shortest",
+         out_path],
+        check=True,
+    )
+    return out_path
+# ─────────────────────────────────────────────────────────────────────────────
+# Generation
+# ─────────────────────────────────────────────────────────────────────────────
+@spaces.GPU(duration=120)
+@torch.inference_mode()
+def generate_video(
+    input_video,
+    prompt: str = "",
+    duration: float = 3,
+    target_aspect: str = "16:9",
+    conditioning_strength: float = 1.0,
+    enhance_prompt: bool = True,
+    use_gamma: bool = False,
+    use_video_audio: bool = True,
+    seed: int = 42,
+    randomize_seed: bool = True,
+    high_res: bool = False,
+    input_audio=None,
+    progress=gr.Progress(track_tqdm=True),
+):
+    try:
+        torch.cuda.reset_peak_memory_stats()
+        current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
+        if input_video is None:
+            raise ValueError("Please upload a source video to outpaint.")
+        video_path = str(input_video)
+        frame_rate = DEFAULT_FRAME_RATE
+        # Burn-in: prepend extra frames of the first frame so the model
+        # has time to fill the black regions before actual content starts.
+        # These will be trimmed from the final output.
+        BURNIN_FRAMES = 24  # ~1 second at 24fps
+        # Total frames to generate includes burn-in
+        content_frames = int(duration * frame_rate) + 1
+        content_frames = ((content_frames - 1 + 7) // 8) * 8 + 1
+        total_frames = content_frames + BURNIN_FRAMES
+        # Re-align to multiple of 8 + 1
+        total_frames = ((total_frames - 1 + 7) // 8) * 8 + 1
+        # Actual burn-in count after alignment (may differ slightly)
+        actual_burnin = total_frames - content_frames
+        # Get target resolution
+        target_w, target_h = get_target_resolution(target_aspect, high_res)
+        print(f"[Outpaint] Generating: {target_h}x{target_w}, {total_frames} frames "
+              f"(content={content_frames}, burnin={actual_burnin}), "
+              f"seed={current_seed}, gamma={use_gamma}, "
+              f"target_aspect={target_aspect}")
+        # Step 1: Letterbox the input video with black bars + burn-in frames
+        letterboxed_path, first_frame_path = letterbox_video(
+            video_path=video_path,
+            target_w=target_w,
+            target_h=target_h,
+            use_gamma=use_gamma,
+            num_frames=content_frames,
+            burnin_frames=actual_burnin,
+        )
+        print(f"[Outpaint] Letterboxed video saved to {letterboxed_path}")
+        # Build image conditioning from letterboxed first frame
+        images = [ImageConditioningInput(path=first_frame_path, frame_idx=0, strength=1.0)]
+        # Build video conditioning — the letterboxed video IS the conditioning
+        video_conditioning = [(letterboxed_path, 1.0)]
+        # Extract original audio — we'll mux it back at the end untouched,
+        # NOT through the pipeline's audio VAE which would introduce artifacts.
+        original_audio_path = None
+        if input_audio is not None:
+            original_audio_path = str(input_audio)
+        elif use_video_audio:
+            original_audio_path = _extract_audio_from_video(video_path)
+            if original_audio_path:
+                print(f"[Outpaint] Extracted audio from input video (will mux at end)")
+        tiling_config = TilingConfig.default()
+        video_chunks_number = get_video_chunks_number(total_frames, tiling_config)
+        # Generate video WITHOUT audio — audio will be muxed in post
+        video, audio = pipeline(
+            prompt=prompt,
+            seed=current_seed,
+            height=int(target_h),
+            width=int(target_w),
+            num_frames=total_frames,
+            frame_rate=frame_rate,
+            images=images,
+            audio_path=None,  # no audio through pipeline
+            video_conditioning=video_conditioning,
+            tiling_config=tiling_config,
+            enhance_prompt=enhance_prompt,
+            conditioning_strength=conditioning_strength,
+        )
+        output_path = tempfile.mktemp(suffix=".mp4")
+        encode_video(
+            video=video,
+            fps=frame_rate,
+            audio=audio,
+            output_path=output_path,
+            video_chunks_number=video_chunks_number,
+        )
+        # Step 2: If gamma was used, apply inverse gamma to the final output
+        if use_gamma:
+            print("[Outpaint] Applying inverse gamma correction to output...")
+            output_path = apply_inverse_gamma_to_video(output_path)
+        # Step 3: Trim burn-in frames from the start (video-only at this point)
+        if actual_burnin > 0:
+            print(f"[Outpaint] Trimming {actual_burnin} burn-in frames from output...")
+            output_path = trim_video_start(output_path, actual_burnin, frame_rate)
+        # Step 4: Mux the original untouched audio back in
+        if original_audio_path is not None:
+            print("[Outpaint] Muxing original audio into output...")
+            output_path = _mux_audio_to_video(output_path, original_audio_path)
+        return str(output_path), current_seed
+    except Exception as e:
+        import traceback
+        print(f"Error: {str(e)}\n{traceback.format_exc()}")
+        return None, current_seed
+# ─────────────────────────────────────────────────────────────────────────────
+# Gradio UI — LTX 2.3 Outpaint
+# ─────────────────────────────────────────────────────────────────────────────
+# Custom CSS handed to gr.Blocks(css=...) below.
+css = """
+.main-title { text-align: center; margin-bottom: 0.5em; }
+.generate-btn { min-height: 52px !important; font-size: 1.1em !important; }
+footer { display: none !important; }
+video { object-fit: contain !important; }
+.preview-frame img { max-height: 300px !important; object-fit: contain !important; }
+"""
+# Citrus theme with purple primary/secondary hues and a gray neutral hue.
+purple_citrus = gr.themes.Citrus(
+    primary_hue=gr.themes.colors.purple,
+    secondary_hue=gr.themes.colors.purple,
+    neutral_hue=gr.themes.colors.gray,
+)
+# Build the UI (inputs in the left column, result in the right one), then wire events.
+with gr.Blocks(title="LTX 2.3 Outpaint", css=css, theme=purple_citrus) as demo:
+    gr.Markdown("""
+# LTX 2.3 Outpaint: Extend Your Video to Any Aspect Ratio 🖼️
+Expand video beyond its original frame with visually and temporally consistent content using [LTX-2.3](https://huggingface.co/Lightricks/LTX-2.3) + [Outpaint IC-LoRA](https://huggingface.co/oumoumad/LTX-2.3-22b-IC-LoRA-Outpaint) by [@oumoumad](https://huggingface.co/oumoumad) ✨
+**Tip:** For dark/night scenes, enable **Gamma Correction** (Advanced Settings) so the model can distinguish dark content from the black sentinel bars.
+    """)
+    with gr.Row():
+        # ── Left column: inputs ──────────────────────────────────────
+        with gr.Column(scale=1):
+            input_video = gr.Video(label="Source Video")
+            with gr.Row():
+                target_aspect = gr.Dropdown(
+                    label="Expand to Aspect Ratio",
+                    choices=["16:9", "9:16", "1:1", "4:3", "3:4", "21:9"],
+                    value="16:9",
+                    info="The target canvas shape — black bars will fill the new area",
+                )
+                duration = gr.Slider(
+                    label="Duration (s)", minimum=1.0, maximum=15.0, value=3.0, step=0.5,
+                )
+            prompt = gr.Textbox(
+                label="Prompt (optional)",
+                info="Describe the video + what should appear in the expanded regions",
+                lines=2,
+                placeholder="a wide landscape with mountains and a clear sky",
+            )
+            with gr.Row():
+                preview_btn = gr.Button("Preview Letterbox", variant="secondary")
+                generate_btn = gr.Button(
+                    "Generate Outpaint", variant="primary", size="lg",
+                    elem_classes=["generate-btn"],
+                )
+            with gr.Accordion("Letterbox Preview", open=True):
+                preview_image = gr.Image(
+                    label="Letterboxed first frame (black = regions to generate)",
+                    type="filepath",
+                    elem_classes=["preview-frame"],
+                    interactive=False,
+                )
+            with gr.Accordion("Advanced Settings", open=False):
+                enhance_prompt = gr.Checkbox(label="Enhance Prompt", value=True)
+                conditioning_strength = gr.Slider(
+                    label="Conditioning Strength",
+                    info="How strongly the original video content influences generation",
+                    minimum=0.0, maximum=1.0, value=1.0, step=0.05,
+                )
+                use_gamma = gr.Checkbox(
+                    label="Gamma Correction (for dark scenes)",
+                    value=False,
+                    info="Apply gamma 2.0 brightening before generation and inverse after — "
+                         "recommended for dark/night footage where black bars may be confused "
+                         "with dark scene content",
+                )
+                high_res = gr.Checkbox(label="High Resolution (2×)", value=False)
+                use_video_audio = gr.Checkbox(
+                    label="Preserve Audio from Source Video", value=True,
+                    info="Extract and keep the audio track from the source video",
+                )
+                input_audio = gr.Audio(
+                    label="Override Audio (optional — replaces video audio)",
+                    type="filepath",
+                )
+                seed = gr.Slider(
+                    label="Seed", minimum=0, maximum=MAX_SEED, value=42, step=1,
+                )
+                randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
+                # Read-only fields filled in by the preview handler
+                with gr.Row():
+                    width_display = gr.Number(label="Output Width", interactive=False)
+                    height_display = gr.Number(label="Output Height", interactive=False)
+        # ── Right column: output ─────────────────────────────────────
+        with gr.Column(scale=1):
+            output_video = gr.Video(label="Outpainted Result", autoplay=True, height=480)
+    # ── Event handlers ───────────────────────────────────────────────────
+    # Pre-fill the duration slider from the uploaded clip
+    input_video.change(
+        fn=on_video_upload,
+        inputs=[input_video, high_res],
+        outputs=[duration],
+    )
+    # Manual preview: the button renders the letterboxed first frame on demand
+    preview_btn.click(
+        fn=preview_letterbox,
+        inputs=[input_video, target_aspect, high_res, use_gamma],
+        outputs=[preview_image, width_display, height_display],
+    )
+    # Also auto-preview when aspect ratio, gamma or the resolution tier changes
+    target_aspect.change(
+        fn=preview_letterbox,
+        inputs=[input_video, target_aspect, high_res, use_gamma],
+        outputs=[preview_image, width_display, height_display],
+    )
+    use_gamma.change(
+        fn=preview_letterbox,
+        inputs=[input_video, target_aspect, high_res, use_gamma],
+        outputs=[preview_image, width_display, height_display],
+    )
+    high_res.change(
+        fn=preview_letterbox,
+        inputs=[input_video, target_aspect, high_res, use_gamma],
+        outputs=[preview_image, width_display, height_display],
+    )
+    # Auto-preview on video upload too
+    input_video.change(
+        fn=preview_letterbox,
+        inputs=[input_video, target_aspect, high_res, use_gamma],
+        outputs=[preview_image, width_display, height_display],
+    )
+    # Run generation; the seed slider receives the seed returned by generate_video
+    generate_btn.click(
+        fn=generate_video,
+        inputs=[
+            input_video, prompt, duration, target_aspect,
+            conditioning_strength, enhance_prompt, use_gamma,
+            use_video_audio, seed, randomize_seed, high_res, input_audio,
+        ],
+        outputs=[output_video, seed],
+    )
+if __name__ == "__main__":
+    demo.launch()

appsync.py ADDED Viewed

	@@ -0,0 +1,1317 @@

+import os
+import subprocess
+import sys
+# Disable torch.compile / dynamo before any torch import
+os.environ["TORCH_COMPILE_DISABLE"] = "1"
+os.environ["TORCHDYNAMO_DISABLE"] = "1"
+# Install xformers for memory-efficient attention
+# (check=False: a failed install does not stop startup)
+subprocess.run([sys.executable, "-m", "pip", "install", "xformers==0.0.32.post2", "--no-build-isolation"], check=False)
+# Install video preprocessing dependencies
+# (check=False: a failed install does not stop startup)
+subprocess.run([sys.executable, "-m", "pip", "install",
+                "dwpose", "onnxruntime-gpu", "imageio[ffmpeg]", "scikit-image",
+                "opencv-python-headless", "decord", "num2words"], check=False)
+# Ensure num2words is installed (required by SmolVLMProcessor)
+# NOTE(review): num2words is already in the best-effort list above; this second
+# run uses check=True, so startup aborts if it cannot be installed.
+subprocess.run([sys.executable, "-m", "pip", "install", "num2words"], check=True)
+# Reinstall torchaudio to match the torch CUDA version on this space.
+# controlnet_aux or other deps can pull in a CPU-only torchaudio that conflicts
+# with the pre-installed CUDA torch, causing "undefined symbol" errors.
+# The torch version is read in a child interpreter, so torch is not imported here.
+_tv = subprocess.run([sys.executable, "-c", "import torch; print(torch.__version__)"],
+                     capture_output=True, text=True)
+if _tv.returncode == 0:
+    _full_ver = _tv.stdout.strip()
+    # Extract CUDA suffix if present (e.g. "2.7.0+cu124" -> "cu124")
+    # Falls back to "cu124" when the version string has no "+" suffix.
+    _cuda_suffix = _full_ver.split("+")[-1] if "+" in _full_ver else "cu124"
+    _base_ver = _full_ver.split("+")[0]
+    print(f"Detected torch {_full_ver}, reinstalling matching torchaudio...")
+    subprocess.run([
+        sys.executable, "-m", "pip", "install", "--force-reinstall", "--no-deps",
+        f"torchaudio=={_base_ver}",
+        "--index-url", f"https://download.pytorch.org/whl/{_cuda_suffix}",
+    ], check=False)
+# Clone LTX-2 repo at a pinned commit and install packages
+LTX_REPO_URL = "https://github.com/Lightricks/LTX-2.git"
+LTX_REPO_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "LTX-2")
+LTX_COMMIT = "ae855f8538843825f9015a419cf4ba5edaf5eec2"
+# Re-clone cleanly to avoid keeping an incompatible previous checkout
+if os.path.exists(LTX_REPO_DIR):
+    print(f"Removing existing repo at {LTX_REPO_DIR}...")
+    subprocess.run(["rm", "-rf", LTX_REPO_DIR], check=True)
+print(f"Cloning {LTX_REPO_URL}...")
+# NOTE(review): this is a full clone (no --depth) followed by a checkout of the pin.
+subprocess.run(["git", "clone", LTX_REPO_URL, LTX_REPO_DIR], check=True)
+print(f"Checking out commit {LTX_COMMIT}...")
+subprocess.run(["git", "-C", LTX_REPO_DIR, "checkout", LTX_COMMIT], check=True)
+print("Installing ltx-core and ltx-pipelines from pinned repo commit...")
+subprocess.run(
+    [
+        sys.executable, "-m", "pip", "install",
+        "--force-reinstall", "--no-deps",
+        "-e", os.path.join(LTX_REPO_DIR, "packages", "ltx-core"),
+        "-e", os.path.join(LTX_REPO_DIR, "packages", "ltx-pipelines"),
+    ],
+    check=True,
+)
+# Put both package source trees at the front of the import path
+sys.path.insert(0, os.path.join(LTX_REPO_DIR, "packages", "ltx-pipelines", "src"))
+sys.path.insert(0, os.path.join(LTX_REPO_DIR, "packages", "ltx-core", "src"))
+import logging
+import random
+import tempfile
+from pathlib import Path
+import torch
+torch._dynamo.config.suppress_errors = True
+torch._dynamo.config.disable = True
+import spaces
+import gradio as gr
+import numpy as np
+from huggingface_hub import hf_hub_download, snapshot_download
+from safetensors import safe_open
+from ltx_core.components.diffusion_steps import EulerDiffusionStep
+from ltx_core.components.noisers import GaussianNoiser
+from ltx_core.conditioning import (
+    ConditioningItem,
+    ConditioningItemAttentionStrengthWrapper,
+    VideoConditionByReferenceLatent,
+)
+from ltx_core.loader import LoraPathStrengthAndSDOps, LTXV_LORA_COMFY_RENAMING_MAP
+from ltx_core.model.audio_vae import decode_audio as vae_decode_audio
+from ltx_core.model.audio_vae import encode_audio as vae_encode_audio
+from ltx_core.model.upsampler import upsample_video
+from ltx_core.model.video_vae import TilingConfig, VideoEncoder, get_video_chunks_number
+from ltx_core.model.video_vae import decode_video as vae_decode_video
+from ltx_core.quantization import QuantizationPolicy
+from ltx_core.types import Audio, AudioLatentShape, LatentState, VideoLatentShape, VideoPixelShape
+from ltx_pipelines.utils import ModelLedger, euler_denoising_loop
+from ltx_pipelines.utils.args import ImageConditioningInput
+from ltx_pipelines.utils.constants import DISTILLED_SIGMA_VALUES, STAGE_2_DISTILLED_SIGMA_VALUES
+from ltx_pipelines.utils.helpers import (
+    assert_resolution,
+    cleanup_memory,
+    combined_image_conditionings,
+    denoise_audio_video,
+    denoise_video_only,
+    encode_prompts,
+    get_device,
+    simple_denoising_func,
+)
+from ltx_pipelines.utils.media_io import (
+    decode_audio_from_file,
+    encode_video,
+    load_video_conditioning,
+)
+from ltx_pipelines.utils.types import PipelineComponents
+# Force-patch xformers attention into the LTX attention module.
+# The module attribute memory_efficient_attention is overwritten with the xformers
+# implementation; its value is printed before and after for diagnostics.
+from ltx_core.model.transformer import attention as _attn_mod
+print(f"[ATTN] Before patch: memory_efficient_attention={_attn_mod.memory_efficient_attention}")
+try:
+    from xformers.ops import memory_efficient_attention as _mea
+    _attn_mod.memory_efficient_attention = _mea
+    print(f"[ATTN] After patch: memory_efficient_attention={_attn_mod.memory_efficient_attention}")
+except Exception as e:
+    # Best effort: on any failure the module keeps whatever it had before
+    print(f"[ATTN] xformers patch FAILED: {type(e).__name__}: {e}")
+# Root logger at INFO
+logging.getLogger().setLevel(logging.INFO)
+# ─────────────────────────────────────────────────────────────────────────────
+# Video Preprocessing: Strip appearance, keep structure
+# ─────────────────────────────────────────────────────────────────────────────
+import imageio
+import cv2
+from PIL import Image
+from dwpose import DwposeDetector
+# Lazily-initialised module-level singletons; None until the matching getter runs.
+_pose_processor = None
+_depth_processor = None
+def _get_pose_processor():
+    global _pose_processor
+    if _pose_processor is None:
+        _pose_processor = DwposeDetector.from_pretrained_default()
+        print("[Preprocess] DWPose processor loaded")
+    return _pose_processor
+def _get_depth_processor():
+    """Placeholder — uses simple Laplacian edge-based depth approximation via OpenCV."""
+    global _depth_processor
+    if _depth_processor is None:
+        _depth_processor = "cv2"  # sentinel — we use cv2 directly
+        print("[Preprocess] CV2-based depth processor loaded")
+    return _depth_processor
+def load_video_frames(video_path: str) -> list[np.ndarray]:
+    """Load video frames as list of HWC uint8 numpy arrays."""
+    frames = []
+    with imageio.get_reader(video_path) as reader:
+        for frame in reader:
+            frames.append(frame)
+    return frames
+def write_video_mp4(frames_float_01: list[np.ndarray], fps: float, out_path: str) -> str:
+    """Write float [0,1] frames to mp4."""
+    frames_uint8 = [(f * 255).astype(np.uint8) for f in frames_float_01]
+    with imageio.get_writer(out_path, fps=fps, macro_block_size=1) as writer:
+        for fr in frames_uint8:
+            writer.append_data(fr)
+    return out_path
+def extract_first_frame(video_path: str) -> str:
+    """Extract first frame as a temp PNG file, return path."""
+    frames = load_video_frames(video_path)
+    if not frames:
+        raise ValueError("No frames in video")
+    out_path = tempfile.mktemp(suffix=".png")
+    Image.fromarray(frames[0]).save(out_path)
+    return out_path
+def preprocess_video_pose(frames: list[np.ndarray], width: int, height: int) -> list[np.ndarray]:
+    """Extract DWPose skeletons from each frame. Returns float [0,1] frames."""
+    processor = _get_pose_processor()
+    result = []
+    for frame in frames:
+        pil = Image.fromarray(frame.astype(np.uint8)).convert("RGB")
+        pose_img = processor(pil, include_body=True, include_hand=True, include_face=True)
+        if not isinstance(pose_img, Image.Image):
+            pose_img = Image.fromarray(np.array(pose_img).astype(np.uint8))
+        pose_img = pose_img.convert("RGB").resize((width, height), Image.BILINEAR)
+        result.append(np.array(pose_img).astype(np.float32) / 255.0)
+    return result
+def preprocess_video_canny(frames: list[np.ndarray], width: int, height: int,
+                           low_threshold: int = 50, high_threshold: int = 100) -> list[np.ndarray]:
+    """Extract Canny edges from each frame. Returns float [0,1] frames."""
+    result = []
+    for frame in frames:
+        # Resize first
+        resized = cv2.resize(frame, (width, height), interpolation=cv2.INTER_AREA)
+        gray = cv2.cvtColor(resized, cv2.COLOR_RGB2GRAY)
+        edges = cv2.Canny(gray, low_threshold, high_threshold)
+        # Convert single-channel to 3-channel
+        edges_3ch = np.stack([edges, edges, edges], axis=-1)
+        result.append(edges_3ch.astype(np.float32) / 255.0)
+    return result
+def preprocess_video_depth(frames: list[np.ndarray], width: int, height: int) -> list[np.ndarray]:
+    """Estimate depth-like maps from each frame using Laplacian gradient magnitude.
+    This is a fast approximation — for true depth, use MiDaS externally."""
+    result = []
+    for frame in frames:
+        resized = cv2.resize(frame, (width, height), interpolation=cv2.INTER_AREA)
+        gray = cv2.cvtColor(resized, cv2.COLOR_RGB2GRAY).astype(np.float32)
+        # Laplacian gives edge/gradient info that approximates depth discontinuities
+        lap = np.abs(cv2.Laplacian(gray, cv2.CV_32F, ksize=5))
+        # Normalize to [0, 1]
+        lap = lap / (lap.max() + 1e-8)
+        depth_3ch = np.stack([lap, lap, lap], axis=-1)
+        result.append(depth_3ch)
+    return result
+def preprocess_conditioning_video(
+    video_path: str,
+    mode: str,
+    width: int,
+    height: int,
+    num_frames: int,
+    fps: float,
+) -> tuple[str, str]:
+    """
+    Preprocess a video for conditioning. Strips appearance, keeps structure.
+    Returns:
+        (conditioning_mp4_path, first_frame_png_path)
+    """
+    frames = load_video_frames(video_path)
+    if not frames:
+        raise ValueError("No frames decoded from video")
+    # Trim to num_frames
+    frames = frames[:num_frames]
+    # Save first frame (original appearance) for image conditioning
+    first_png = tempfile.mktemp(suffix=".png")
+    Image.fromarray(frames[0]).save(first_png)
+    # Process based on mode
+    if mode == "Pose (DWPose)":
+        processed = preprocess_video_pose(frames, width, height)
+    elif mode == "Canny Edge":
+        processed = preprocess_video_canny(frames, width, height)
+    elif mode == "Depth (Laplacian)":
+        processed = preprocess_video_depth(frames, width, height)
+    else:
+        # "Raw" mode — no preprocessing
+        processed = [f.astype(np.float32) / 255.0 for f in frames]
+    cond_mp4 = tempfile.mktemp(suffix=".mp4")
+    write_video_mp4(processed, fps=fps, out_path=cond_mp4)
+    return cond_mp4, first_png
+# ─────────────────────────────────────────────────────────────────────────────
+# Helper: read reference downscale factor from IC-LoRA metadata
+# ─────────────────────────────────────────────────────────────────────────────
+def _read_lora_reference_downscale_factor(lora_path: str) -> int:
+    try:
+        with safe_open(lora_path, framework="pt") as f:
+            metadata = f.metadata() or {}
+            return int(metadata.get("reference_downscale_factor", 1))
+    except Exception as e:
+        logging.warning(f"Failed to read metadata from LoRA file '{lora_path}': {e}")
+        return 1
+# ─────────────────────────────────────────────────────────────────────────────
+# Unified Pipeline: Distilled + Audio + IC-LoRA Video-to-Video
+# ─────────────────────────────────────────────────────────────────────────────
+class LTX23UnifiedPipeline:
+    """
+    Unified LTX-2.3 pipeline supporting all generation modes:
+      • Text-to-Video
+      • Image-to-Video (first-frame conditioning)
+      • Audio-to-Video (lip-sync / BGM conditioning with external audio)
+      • Video-to-Video (IC-LoRA reference video conditioning)
+      • Any combination of the above
+    Architecture:
+      - stage_1_model_ledger: transformer WITH IC-LoRA fused (used for Stage 1)
+      - stage_2_model_ledger: transformer WITHOUT IC-LoRA (used for Stage 2 upsampling)
+      - When no IC-LoRA is provided, both stages use the same base model.
+    """
+    def __init__(
+        self,
+        distilled_checkpoint_path: str,
+        spatial_upsampler_path: str,
+        gemma_root: str,
+        ic_loras: list[LoraPathStrengthAndSDOps] | None = None,
+        device: torch.device | None = None,
+        quantization: QuantizationPolicy | None = None,
+        reference_downscale_factor: int | None = None,
+    ):
+        self.device = device or get_device()
+        self.dtype = torch.bfloat16
+        ic_loras = ic_loras or []
+        self.has_ic_lora = len(ic_loras) > 0
+        # Stage 1: transformer with IC-LoRA (if provided)
+        self.stage_1_model_ledger = ModelLedger(
+            dtype=self.dtype,
+            device=self.device,
+            checkpoint_path=distilled_checkpoint_path,
+            spatial_upsampler_path=spatial_upsampler_path,
+            gemma_root_path=gemma_root,
+            loras=ic_loras,
+            quantization=quantization,
+        )
+        if self.has_ic_lora:
+            # Stage 2 needs a separate transformer WITHOUT IC-LoRA
+            self.stage_2_model_ledger = ModelLedger(
+                dtype=self.dtype,
+                device=self.device,
+                checkpoint_path=distilled_checkpoint_path,
+                spatial_upsampler_path=spatial_upsampler_path,
+                gemma_root_path=gemma_root,
+                loras=[],
+                quantization=quantization,
+            )
+        else:
+            # No IC-LoRA: share a single ledger for both stages (saves ~half VRAM)
+            self.stage_2_model_ledger = self.stage_1_model_ledger
+        self.pipeline_components = PipelineComponents(
+            dtype=self.dtype,
+            device=self.device,
+        )
+        # Reference downscale factor: explicit value takes priority,
+        # otherwise read from IC-LoRA metadata, otherwise default to 1.
+        if reference_downscale_factor is not None:
+            self.reference_downscale_factor = reference_downscale_factor
+        else:
+            self.reference_downscale_factor = 1
+            for lora in ic_loras:
+                scale = _read_lora_reference_downscale_factor(lora.path)
+                if scale != 1:
+                    if self.reference_downscale_factor not in (1, scale):
+                        raise ValueError(
+                            f"Conflicting reference_downscale_factor: "
+                            f"already {self.reference_downscale_factor}, got {scale}"
+                        )
+                    self.reference_downscale_factor = scale
+        logging.info(f"[Pipeline] reference_downscale_factor={self.reference_downscale_factor}")
+    # ── Video reference conditioning (from ICLoraPipeline) ───────────────
+    def _create_ic_conditionings(
+        self,
+        video_conditioning: list[tuple[str, float]],
+        height: int,
+        width: int,
+        num_frames: int,
+        video_encoder: VideoEncoder,
+        conditioning_strength: float = 1.0,
+    ) -> list[ConditioningItem]:
+        """Create IC-LoRA video reference conditioning items."""
+        conditionings: list[ConditioningItem] = []
+        scale = self.reference_downscale_factor
+        ref_height = height // scale
+        ref_width = width // scale
+        for video_path, strength in video_conditioning:
+            video = load_video_conditioning(
+                video_path=video_path,
+                height=ref_height,
+                width=ref_width,
+                frame_cap=num_frames,
+                dtype=self.dtype,
+                device=self.device,
+            )
+            encoded_video = video_encoder(video)
+            cond = VideoConditionByReferenceLatent(
+                latent=encoded_video,
+                downscale_factor=scale,
+                strength=strength,
+            )
+            if conditioning_strength < 1.0:
+                cond = ConditioningItemAttentionStrengthWrapper(
+                    cond, attention_mask=conditioning_strength
+                )
+            conditionings.append(cond)
+        if conditionings:
+            logging.info(f"[IC-LoRA] Added {len(conditionings)} video conditioning(s)")
+        return conditionings
+    # ── Main generation entry point ──────────────────────────────────────
+    def __call__(
+        self,
+        prompt: str,
+        seed: int,
+        height: int,
+        width: int,
+        num_frames: int,
+        frame_rate: float,
+        images: list[ImageConditioningInput],
+        audio_path: str | None = None,
+        video_conditioning: list[tuple[str, float]] | None = None,
+        tiling_config: TilingConfig | None = None,
+        enhance_prompt: bool = False,
+        conditioning_strength: float = 1.0,
+    ):
+        """
+        Generate video with any combination of conditioning.
+        The run has two stages: stage 1 denoises at half the requested
+        width/height, then the stage-1 video latent is spatially upsampled and
+        refined at the full requested resolution in stage 2.
+        Args:
+            prompt: Text prompt; " synchronized lipsync" is always appended.
+            seed: Seed for the torch.Generator that drives the noiser.
+            height: Final (stage 2) output height.
+            width: Final (stage 2) output width.
+            num_frames: Number of frames for both stages.
+            frame_rate: Output fps; also used to derive the audio duration.
+            images: Image conditioning inputs, applied in both stages. The first
+                image's path is also handed to prompt enhancement.
+            audio_path: Path to external audio file for lipsync/BGM conditioning.
+                When given, only the video is denoised and the decoded input
+                waveform is returned as the output audio.
+            video_conditioning: List of (path, strength) tuples for IC-LoRA V2V.
+                Only applied in stage 1.
+            tiling_config: Forwarded to the video VAE decode call.
+            enhance_prompt: Forwarded to prompt encoding as enhance_first_prompt.
+            conditioning_strength: Scale for IC-LoRA attention influence [0, 1].
+        Returns:
+            Tuple of (decoded_video_iterator, Audio).
+        Raises:
+            ValueError: If audio_path is given but no audio could be decoded from it.
+        """
+        assert_resolution(height=height, width=width, is_two_stage=True)
+        # The lipsync hint is appended unconditionally, even without audio input.
+        prompt += " synchronized lipsync"
+        has_audio = audio_path is not None
+        has_video_cond = bool(video_conditioning)
+        generator = torch.Generator(device=self.device).manual_seed(seed)
+        noiser = GaussianNoiser(generator=generator)
+        stepper = EulerDiffusionStep()
+        # NOTE(review): hard-coded bfloat16 here, independent of self.dtype — confirm intended.
+        dtype = torch.bfloat16
+        # ── Encode text prompt ───────────────────────────────────────────
+        # Use stage_1 ledger for prompt encoding (has text encoder)
+        (ctx_p,) = encode_prompts(
+            [prompt],
+            self.stage_1_model_ledger,
+            enhance_first_prompt=enhance_prompt,
+            enhance_prompt_image=images[0].path if len(images) > 0 else None,
+        )
+        video_context, audio_context = ctx_p.video_encoding, ctx_p.audio_encoding
+        # ── Encode external audio (if provided) ─────────────────────────
+        encoded_audio_latent = None
+        decoded_audio_for_output = None
+        if has_audio:
+            video_duration = num_frames / frame_rate
+            decoded_audio = decode_audio_from_file(audio_path, self.device, 0.0, video_duration)
+            if decoded_audio is None:
+                raise ValueError(f"Could not extract audio stream from {audio_path}")
+            encoded_audio_latent = vae_encode_audio(
+                decoded_audio, self.stage_1_model_ledger.audio_encoder()
+            )
+            audio_shape = AudioLatentShape.from_duration(
+                batch=1, duration=video_duration, channels=8, mel_bins=16
+            )
+            # Force the latent's dim-2 length to match the expected frame count:
+            # crop when too long, zero-pad at the end when too short.
+            expected_frames = audio_shape.frames
+            actual_frames = encoded_audio_latent.shape[2]
+            if actual_frames > expected_frames:
+                encoded_audio_latent = encoded_audio_latent[:, :, :expected_frames, :]
+            elif actual_frames < expected_frames:
+                pad = torch.zeros(
+                    encoded_audio_latent.shape[0], encoded_audio_latent.shape[1],
+                    expected_frames - actual_frames, encoded_audio_latent.shape[3],
+                    device=encoded_audio_latent.device, dtype=encoded_audio_latent.dtype,
+                )
+                encoded_audio_latent = torch.cat([encoded_audio_latent, pad], dim=2)
+            # Keep the decoded input waveform; it is returned as-is instead of
+            # decoding audio from a generated latent.
+            decoded_audio_for_output = Audio(
+                waveform=decoded_audio.waveform.squeeze(0),
+                sampling_rate=decoded_audio.sampling_rate,
+            )
+        # ── Build conditionings for Stage 1 ──────────────────────────────
+        # Use stage_1 video encoder (has IC-LoRA context)
+        video_encoder = self.stage_1_model_ledger.video_encoder()
+        # Stage 1 runs at half the requested width and height.
+        stage_1_output_shape = VideoPixelShape(
+            batch=1, frames=num_frames,
+            width=width // 2, height=height // 2, fps=frame_rate,
+        )
+        # Image conditionings
+        stage_1_conditionings = combined_image_conditionings(
+            images=images,
+            height=stage_1_output_shape.height,
+            width=stage_1_output_shape.width,
+            video_encoder=video_encoder,
+            dtype=dtype,
+            device=self.device,
+        )
+        # IC-LoRA video reference conditionings
+        if has_video_cond:
+            ic_conds = self._create_ic_conditionings(
+                video_conditioning=video_conditioning,
+                height=stage_1_output_shape.height,
+                width=stage_1_output_shape.width,
+                num_frames=num_frames,
+                video_encoder=video_encoder,
+                conditioning_strength=conditioning_strength,
+            )
+            stage_1_conditionings.extend(ic_conds)
+        # ── Stage 1: Low-res generation ──────────────────────────────────
+        transformer = self.stage_1_model_ledger.transformer()
+        stage_1_sigmas = torch.Tensor(DISTILLED_SIGMA_VALUES).to(self.device)
+        # Closure binding the stage-1 transformer and the prompt contexts.
+        def denoising_loop(sigmas, video_state, audio_state, stepper):
+            return euler_denoising_loop(
+                sigmas=sigmas,
+                video_state=video_state,
+                audio_state=audio_state,
+                stepper=stepper,
+                denoise_fn=simple_denoising_func(
+                    video_context=video_context,
+                    audio_context=audio_context,
+                    transformer=transformer,
+                ),
+            )
+        if has_audio:
+            # Audio mode: denoise video only, use external audio latent
+            video_state = denoise_video_only(
+                output_shape=stage_1_output_shape,
+                conditionings=stage_1_conditionings,
+                noiser=noiser,
+                sigmas=stage_1_sigmas,
+                stepper=stepper,
+                denoising_loop_fn=denoising_loop,
+                components=self.pipeline_components,
+                dtype=dtype,
+                device=self.device,
+                initial_audio_latent=encoded_audio_latent,
+            )
+            audio_state = None  # we'll use the original audio for output
+        else:
+            # Standard / IC-only mode: denoise both audio and video
+            video_state, audio_state = denoise_audio_video(
+                output_shape=stage_1_output_shape,
+                conditionings=stage_1_conditionings,
+                noiser=noiser,
+                sigmas=stage_1_sigmas,
+                stepper=stepper,
+                denoising_loop_fn=denoising_loop,
+                components=self.pipeline_components,
+                dtype=dtype,
+                device=self.device,
+            )
+        torch.cuda.synchronize()
+        cleanup_memory()
+        # ── Stage 2: Upsample + Refine ──────────────────────────────────
+        # Only the first batch entry of the stage-1 latent is upsampled.
+        upscaled_video_latent = upsample_video(
+            latent=video_state.latent[:1],
+            video_encoder=video_encoder,
+            upsampler=self.stage_2_model_ledger.spatial_upsampler(),
+        )
+        torch.cuda.synchronize()
+        cleanup_memory()
+        # Stage 2 uses the transformer WITHOUT IC-LoRA
+        # (the stage-2 ledger is the stage-1 ledger itself when no IC-LoRA is loaded).
+        transformer_s2 = self.stage_2_model_ledger.transformer()
+        stage_2_sigmas = torch.Tensor(STAGE_2_DISTILLED_SIGMA_VALUES).to(self.device)
+        # Same loop as stage 1, but bound to the stage-2 transformer.
+        def denoising_loop_s2(sigmas, video_state, audio_state, stepper):
+            return euler_denoising_loop(
+                sigmas=sigmas,
+                video_state=video_state,
+                audio_state=audio_state,
+                stepper=stepper,
+                denoise_fn=simple_denoising_func(
+                    video_context=video_context,
+                    audio_context=audio_context,
+                    transformer=transformer_s2,
+                ),
+            )
+        stage_2_output_shape = VideoPixelShape(
+            batch=1, frames=num_frames,
+            width=width, height=height, fps=frame_rate,
+        )
+        # Stage 2 gets the image conditionings only (re-encoded at full
+        # resolution); the IC-LoRA video references are not added here.
+        stage_2_conditionings = combined_image_conditionings(
+            images=images,
+            height=stage_2_output_shape.height,
+            width=stage_2_output_shape.width,
+            video_encoder=video_encoder,
+            dtype=dtype,
+            device=self.device,
+        )
+        if has_audio:
+            video_state = denoise_video_only(
+                output_shape=stage_2_output_shape,
+                conditionings=stage_2_conditionings,
+                noiser=noiser,
+                sigmas=stage_2_sigmas,
+                stepper=stepper,
+                denoising_loop_fn=denoising_loop_s2,
+                components=self.pipeline_components,
+                dtype=dtype,
+                device=self.device,
+                noise_scale=stage_2_sigmas[0],
+                initial_video_latent=upscaled_video_latent,
+                initial_audio_latent=encoded_audio_latent,
+            )
+            audio_state = None
+        else:
+            # Refinement starts from the upsampled video latent and the
+            # stage-1 audio latent.
+            video_state, audio_state = denoise_audio_video(
+                output_shape=stage_2_output_shape,
+                conditionings=stage_2_conditionings,
+                noiser=noiser,
+                sigmas=stage_2_sigmas,
+                stepper=stepper,
+                denoising_loop_fn=denoising_loop_s2,
+                components=self.pipeline_components,
+                dtype=dtype,
+                device=self.device,
+                noise_scale=stage_2_sigmas[0],
+                initial_video_latent=upscaled_video_latent,
+                initial_audio_latent=audio_state.latent,
+            )
+        torch.cuda.synchronize()
+        # Drop the local references before decoding, then run memory cleanup.
+        del transformer, transformer_s2, video_encoder
+        cleanup_memory()
+        # ── Decode ───────────────────────────────────────────────────────
+        decoded_video = vae_decode_video(
+            video_state.latent,
+            self.stage_2_model_ledger.video_decoder(),
+            tiling_config,
+            generator,
+        )
+        if has_audio:
+            output_audio = decoded_audio_for_output
+        else:
+            output_audio = vae_decode_audio(
+                audio_state.latent,
+                self.stage_2_model_ledger.audio_decoder(),
+                self.stage_2_model_ledger.vocoder(),
+            )
+        return decoded_video, output_audio
+# ─────────────────────────────────────────────────────────────────────────────
+# Constants
+# ─────────────────────────────────────────────────────────────────────────────
+# Largest int32 value; upper bound for random seeds and for the seed slider.
+MAX_SEED = np.iinfo(np.int32).max
+# Default prompt text.
+DEFAULT_PROMPT = (
+    "An astronaut hatches from a fragile egg on the surface of the Moon, "
+    "the shell cracking and peeling apart in gentle low-gravity motion."
+)
+# Frame rate (fps) used for every generation.
+DEFAULT_FRAME_RATE = 24.0
+# Resolution presets: tier -> aspect ratio -> (width, height).
+RESOLUTIONS = {
+    "high": {"16:9": (1536, 1024), "9:16": (1024, 1536), "1:1": (1024, 1024)},
+    "low": {"16:9": (768, 512), "9:16": (512, 768), "1:1": (768, 768)},
+}
+# Available IC-LoRA models
+# Display name -> Hugging Face repo id and weight filename inside that repo.
+IC_LORA_OPTIONS = {
+    "Union Control (Depth + Edge)": {
+        "repo": "Lightricks/LTX-2.3-22b-IC-LoRA-Union-Control",
+        "filename": "ltx-2.3-22b-ic-lora-union-control-ref0.5.safetensors",
+    },
+    "Motion Track Control": {
+        "repo": "Lightricks/LTX-2.3-22b-IC-LoRA-Motion-Track-Control",
+        "filename": "ltx-2.3-22b-ic-lora-motion-track-control-ref0.5.safetensors",
+    },
+}
+# Key into IC_LORA_OPTIONS for the LoRA downloaded at startup.
+DEFAULT_IC_LORA = "Union Control (Depth + Edge)"
+# ─────────────────────────────────────────────────────────────────────────────
+# Download Models
+# ─────────────────────────────────────────────────────────────────────────────
+# Hub repo that provides the spatial upscaler weights downloaded below.
+LTX_MODEL_REPO = "Lightricks/LTX-2.3"
+# NOTE: despite its name this is a Hub repo id, not a filesystem path; the local
+# file path is the lowercase `checkpoint_path` assigned below.
+CHECKPOINT_PATH = "linoyts/ltx-2.3-22b-distilled-1.1-fused-union-control" # LTX 2.3 with the Union Control LoRA fused in, because it breaks on quantization otherwise
+# Hub repo for the Gemma weights (downloaded as a full snapshot).
+GEMMA_REPO = "google/gemma-3-12b-it-qat-q4_0-unquantized"
+print("=" * 80)
+print("Downloading LTX-2.3 distilled model + Gemma + IC-LoRA...")
+print("=" * 80)
+# Distilled transformer checkpoint (fused variant); the commented-out line is
+# the plain distilled checkpoint from LTX_MODEL_REPO.
+checkpoint_path = hf_hub_download(
+    # repo_id=LTX_MODEL_REPO, filename="ltx-2.3-22b-distilled.safetensors"
+    repo_id=CHECKPOINT_PATH, filename="ltx-2.3-22b-distilled-1.1-fused-union-control.safetensors"
+)
+# x2 spatial upscaler weights.
+spatial_upsampler_path = hf_hub_download(
+    repo_id=LTX_MODEL_REPO, filename="ltx-2.3-spatial-upscaler-x2-1.1.safetensors"
+)
+# Local directory of the downloaded Gemma snapshot.
+gemma_root = snapshot_download(repo_id=GEMMA_REPO)
+# Download default IC-LoRA
+default_lora_info = IC_LORA_OPTIONS[DEFAULT_IC_LORA]
+default_ic_lora_path = hf_hub_download(
+    repo_id=default_lora_info["repo"], filename=default_lora_info["filename"]
+)
+print(f"Checkpoint: {checkpoint_path}")
+print(f"Spatial upsampler: {spatial_upsampler_path}")
+print(f"Gemma root: {gemma_root}")
+print(f"IC-LoRA: {default_ic_lora_path}")
+# ─────────────────────────────────────────────────────────────────────────────
+# Initialize Pipeline
+# ─────────────────────────────────────────────────────────────────────────────
+# LoRA descriptor for the default IC-LoRA at strength 1.0.
+# NOTE(review): this list is built but not passed to the pipeline below (the
+# `ic_loras=` argument is commented out) — confirm whether it is still needed.
+ic_loras = [
+    LoraPathStrengthAndSDOps(default_ic_lora_path, 1.0, LTXV_LORA_COMFY_RENAMING_MAP)
+]
+# Module-level pipeline instance used by generate_video().
+pipeline = LTX23UnifiedPipeline(
+    distilled_checkpoint_path=checkpoint_path,
+    spatial_upsampler_path=spatial_upsampler_path,
+    gemma_root=gemma_root,
+    # ic_loras=ic_loras,  # LoRA already fused into checkpoint
+    quantization=QuantizationPolicy.fp8_cast(),
+    # Union Control IC-LoRA was trained with reference videos at half resolution.
+    # Set explicitly so it works both with separate LoRA and fused checkpoints.
+    reference_downscale_factor=2,
+)
+# Preload all models for ZeroGPU tensor packing.
+# Each ledger accessor is called once here and then replaced on the ledger
+# instance by a lambda returning that same object, so later accessor calls
+# reuse the preloaded model.
+print("Preloading all models (including Gemma, Audio encoders)...")
+# Shared ledger: preload once. Separate ledgers (IC-LoRA): preload both.
+_ledger_1 = pipeline.stage_1_model_ledger
+_ledger_2 = pipeline.stage_2_model_ledger
+# True when the pipeline assigned the same ledger object to both stages.
+_shared = _ledger_1 is _ledger_2
+# Stage 1 models (with IC-LoRA if loaded)
+_s1_transformer = _ledger_1.transformer()
+_s1_video_encoder = _ledger_1.video_encoder()
+_s1_text_encoder = _ledger_1.text_encoder()
+_s1_embeddings = _ledger_1.gemma_embeddings_processor()
+_s1_audio_encoder = _ledger_1.audio_encoder()
+# Pin the stage-1 accessors to the objects loaded above.
+_ledger_1.transformer = lambda: _s1_transformer
+_ledger_1.video_encoder = lambda: _s1_video_encoder
+_ledger_1.text_encoder = lambda: _s1_text_encoder
+_ledger_1.gemma_embeddings_processor = lambda: _s1_embeddings
+_ledger_1.audio_encoder = lambda: _s1_audio_encoder
+if _shared:
+    # Single ledger — also preload decoder/upsampler/vocoder on the same object
+    _video_decoder = _ledger_1.video_decoder()
+    _audio_decoder = _ledger_1.audio_decoder()
+    _vocoder = _ledger_1.vocoder()
+    _spatial_upsampler = _ledger_1.spatial_upsampler()
+    _ledger_1.video_decoder = lambda: _video_decoder
+    _ledger_1.audio_decoder = lambda: _audio_decoder
+    _ledger_1.vocoder = lambda: _vocoder
+    _ledger_1.spatial_upsampler = lambda: _spatial_upsampler
+    print("  (single shared ledger — no IC-LoRA)")
+else:
+    # Stage 2 models (separate transformer without IC-LoRA)
+    # Every accessor of the second ledger is preloaded and pinned; the
+    # decoder/vocoder/upsampler accessors of the first ledger are left as-is.
+    _s2_transformer = _ledger_2.transformer()
+    _s2_video_encoder = _ledger_2.video_encoder()
+    _s2_video_decoder = _ledger_2.video_decoder()
+    _s2_audio_decoder = _ledger_2.audio_decoder()
+    _s2_vocoder = _ledger_2.vocoder()
+    _s2_spatial_upsampler = _ledger_2.spatial_upsampler()
+    _s2_text_encoder = _ledger_2.text_encoder()
+    _s2_embeddings = _ledger_2.gemma_embeddings_processor()
+    _s2_audio_encoder = _ledger_2.audio_encoder()
+    _ledger_2.transformer = lambda: _s2_transformer
+    _ledger_2.video_encoder = lambda: _s2_video_encoder
+    _ledger_2.video_decoder = lambda: _s2_video_decoder
+    _ledger_2.audio_decoder = lambda: _s2_audio_decoder
+    _ledger_2.vocoder = lambda: _s2_vocoder
+    _ledger_2.spatial_upsampler = lambda: _s2_spatial_upsampler
+    _ledger_2.text_encoder = lambda: _s2_text_encoder
+    _ledger_2.gemma_embeddings_processor = lambda: _s2_embeddings
+    _ledger_2.audio_encoder = lambda: _s2_audio_encoder
+    print("  (two separate ledgers — IC-LoRA active)")
+print("All models preloaded!")
+print("=" * 80)
+# ─────────────────────────────────────────────────────────────────────────────
+# UI Helpers
+# ─────────────────────────────────────────────────────────────────────────────
+def detect_aspect_ratio(media_path) -> str:
+    """Detect the closest aspect ratio from an image or video."""
+    if media_path is None:
+        return "16:9"
+    ext = str(media_path).lower().rsplit(".", 1)[-1] if "." in str(media_path) else ""
+    # Try as image first
+    if ext in ("jpg", "jpeg", "png", "bmp", "webp", "gif", "tiff"):
+        import PIL.Image
+        try:
+            with PIL.Image.open(media_path) as img:
+                w, h = img.size
+        except Exception:
+            return "16:9"
+    else:
+        # Try as video
+        try:
+            import av
+            with av.open(str(media_path)) as container:
+                stream = container.streams.video[0]
+                w, h = stream.codec_context.width, stream.codec_context.height
+        except Exception:
+            # Fallback: try as image anyway
+            import PIL.Image
+            try:
+                with PIL.Image.open(media_path) as img:
+                    w, h = img.size
+            except Exception:
+                return "16:9"
+    ratio = w / h
+    candidates = {"16:9": 16 / 9, "9:16": 9 / 16, "1:1": 1.0}
+    return min(candidates, key=lambda k: abs(ratio - candidates[k]))
+def on_image_upload(image, video, high_res):
+    """Auto-set resolution when image is uploaded."""
+    media = image if image is not None else video
+    aspect = detect_aspect_ratio(media)
+    tier = "high" if high_res else "low"
+    w, h = RESOLUTIONS[tier][aspect]
+    return gr.update(value=w), gr.update(value=h)
+def _get_video_duration(video_path) -> float | None:
+    """Get video duration in seconds via ffprobe."""
+    if video_path is None:
+        return None
+    try:
+        result = subprocess.run(
+            ["ffprobe", "-v", "error", "-select_streams", "v:0",
+             "-show_entries", "format=duration", "-of", "default=nw=1:nk=1",
+             str(video_path)],
+            capture_output=True, text=True,
+        )
+        return float(result.stdout.strip())
+    except Exception:
+        return None
+def on_video_upload(video, image, high_res):
+    """Auto-set resolution and duration when video is uploaded."""
+    media = video if video is not None else image
+    aspect = detect_aspect_ratio(media)
+    tier = "high" if high_res else "low"
+    w, h = RESOLUTIONS[tier][aspect]
+    # Auto-adjust duration to min(video_length, 10)
+    vid_dur = _get_video_duration(video)
+    if vid_dur is not None:
+        dur = round(min(vid_dur, 15.0), 1)
+    else:
+        dur = 3.0
+    return gr.update(value=w), gr.update(value=h), gr.update(value=dur)
+def on_highres_toggle(image, video, high_res):
+    """Update resolution when high-res toggle changes."""
+    media = image if image is not None else video
+    aspect = detect_aspect_ratio(media)
+    tier = "high" if high_res else "low"
+    w, h = RESOLUTIONS[tier][aspect]
+    return gr.update(value=w), gr.update(value=h)
+# ─────────────────────────────────────────────────────────────────────────────
+# Generation
+# ─────────────────────────────────────────────────────────────────────────────
+def _extract_audio_from_video(video_path: str) -> str | None:
+    """Extract audio from video as a temp WAV file. Returns None if no audio."""
+    out_path = tempfile.mktemp(suffix=".wav")
+    try:
+        # Check if video has an audio stream
+        probe = subprocess.run(
+            ["ffprobe", "-v", "error", "-select_streams", "a:0",
+             "-show_entries", "stream=codec_type", "-of", "default=nw=1:nk=1",
+             video_path],
+            capture_output=True, text=True,
+        )
+        if not probe.stdout.strip():
+            return None
+        # Extract audio
+        subprocess.run(
+            ["ffmpeg", "-y", "-v", "error", "-i", video_path,
+             "-vn", "-ac", "2", "-ar", "48000", "-c:a", "pcm_s16le", out_path],
+            check=True,
+        )
+        return out_path
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        return None
+@spaces.GPU(duration=100)
+@torch.inference_mode()
+def generate_video(
+    input_image,
+    input_video,
+    prompt: str = "",
+    duration: float = 3,
+    conditioning_strength: float = 0.85,
+    enhance_prompt: bool = True,
+    use_video_audio: bool = True,
+    seed: int = 42,
+    randomize_seed: bool = True,
+    height: int = 512,
+    width: int = 768,
+    input_audio = None,
+    progress=gr.Progress(track_tqdm=True),
+):
+    video_preprocess="Pose (DWPose)"
+    try:
+        torch.cuda.reset_peak_memory_stats()
+        current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
+        frame_rate = DEFAULT_FRAME_RATE
+        num_frames = int(duration * frame_rate) + 1
+        num_frames = ((num_frames - 1 + 7) // 8) * 8 + 1
+        mode_parts = []
+        if input_image is not None:
+            mode_parts.append("Image")
+        if input_video is not None:
+            mode_parts.append(f"Video({video_preprocess})")
+        if input_audio is not None:
+            mode_parts.append("Audio")
+        if not mode_parts:
+            mode_parts.append("Text")
+        mode_str = " + ".join(mode_parts)
+        print(f"[{mode_str}] Generating: {height}x{width}, {num_frames} frames "
+              f"({duration}s), seed={current_seed}")
+        # Build image conditionings
+        images = []
+        if input_image is not None:
+            images = [ImageConditioningInput(path=str(input_image), frame_idx=0, strength=1.0)]
+        # Build video conditionings — preprocess to strip appearance
+        video_conditioning = None
+        if input_video is not None:
+            video_path = str(input_video)
+            if video_preprocess != "Raw (no preprocessing)":
+                print(f"[Preprocess] Running {video_preprocess} on input video...")
+                cond_mp4, first_frame_png = preprocess_conditioning_video(
+                    video_path=video_path,
+                    mode=video_preprocess,
+                    width=int(width) // 2,   # Stage 1 operates at half res
+                    height=int(height) // 2,
+                    num_frames=num_frames,
+                    fps=frame_rate,
+                )
+                video_conditioning = [(cond_mp4, 1.0)]
+                # If no image was provided, use the video's first frame
+                # (original appearance) as the image conditioning
+                if input_image is None:
+                    images = [ImageConditioningInput(
+                        path=first_frame_png, frame_idx=0, strength=1.0,
+                    )]
+                    print(f"[Preprocess] Using video first frame as image conditioning")
+            else:
+                # Raw mode — pass video as-is
+                video_conditioning = [(video_path, 1.0)]
+            # If no audio was provided, optionally extract audio from the video
+            if input_audio is None and use_video_audio:
+                extracted_audio = _extract_audio_from_video(video_path)
+                if extracted_audio is not None:
+                    input_audio = extracted_audio
+                    print(f"[Preprocess] Extracted audio from input video")
+        tiling_config = TilingConfig.default()
+        video_chunks_number = get_video_chunks_number(num_frames, tiling_config)
+        video, audio = pipeline(
+            prompt=prompt,
+            seed=current_seed,
+            height=int(height),
+            width=int(width),
+            num_frames=num_frames,
+            frame_rate=frame_rate,
+            images=images,
+            audio_path=input_audio,
+            video_conditioning=video_conditioning,
+            tiling_config=tiling_config,
+            enhance_prompt=enhance_prompt,
+            conditioning_strength=conditioning_strength,
+        )
+        output_path = tempfile.mktemp(suffix=".mp4")
+        encode_video(
+            video=video,
+            fps=frame_rate,
+            audio=audio,
+            output_path=output_path,
+            video_chunks_number=video_chunks_number,
+        )
+        return str(output_path), current_seed
+    except Exception as e:
+        import traceback
+        print(f"Error: {str(e)}\n{traceback.format_exc()}")
+        return None, current_seed
+# ─────────────────────────────────────────────────────────────────────────────
+# SmolVLM2 — Auto-describe motion from reference video
+# ─────────────────────────────────────────────────────────────────────────────
+# Hub id of the vision-language model used to describe the reference video.
+SMOLVLM_MODEL_ID = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
+# Lazily populated by _load_vlm() on first use.
+_vlm_model = None
+_vlm_processor = None
+# Instruction sent with the video: describe motion and expression only,
+# not appearance, setting, colors or props.
+MOTION_PROMPT = """\
+Watch this video carefully. Describe ONLY the following:
+1. The body movements and gestures (walking, dancing, waving, turning, etc.)
+2. Facial expressions and head movements (smiling, nodding, looking around, etc.)
+3. The rhythm, speed, and energy of the motion (slow, fast, smooth, jerky, etc.)
+4. The overall mood and tone conveyed by the movement
+Do NOT describe:
+- What the person/subject looks like (clothing, hair, skin, age, gender)
+- The background, setting, or environment
+- Colors, lighting, or visual style
+- Any objects or props
+Write a concise, single-paragraph description focused purely on motion and expression.\
+"""
+def _load_vlm():
+    global _vlm_model, _vlm_processor
+    if _vlm_model is None:
+        from transformers import AutoProcessor, AutoModelForImageTextToText
+        print(f"[SmolVLM] Loading {SMOLVLM_MODEL_ID}...")
+        _vlm_processor = AutoProcessor.from_pretrained(
+            SMOLVLM_MODEL_ID, trust_remote_code=True
+        )
+        try:
+            _vlm_model = AutoModelForImageTextToText.from_pretrained(
+                SMOLVLM_MODEL_ID,
+                torch_dtype=torch.bfloat16,
+                trust_remote_code=True,
+                _attn_implementation="flash_attention_2",
+            ).to("cuda")
+        except Exception:
+            _vlm_model = AutoModelForImageTextToText.from_pretrained(
+                SMOLVLM_MODEL_ID,
+                torch_dtype=torch.bfloat16,
+                trust_remote_code=True,
+            ).to("cuda")
+        print("[SmolVLM] Model loaded!")
+    return _vlm_model, _vlm_processor
+@spaces.GPU(duration=60)
+@torch.inference_mode()
+def describe_video_motion(video_path, auto_describe=True):
+    """Use SmolVLM2 to generate a motion-only description of a video."""
+    if video_path is None or not auto_describe:
+        return gr.update()
+    try:
+        model, processor = _load_vlm()
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "video", "path": str(video_path)},
+                    {"type": "text", "text": MOTION_PROMPT},
+                ],
+            },
+        ]
+        inputs = processor.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        ).to(model.device, dtype=torch.bfloat16)
+        generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=200)
+        generated_text = processor.batch_decode(
+            generated_ids, skip_special_tokens=True
+        )[0]
+        # Extract only the assistant's response (after the prompt)
+        if "Assistant:" in generated_text:
+            motion_desc = generated_text.split("Assistant:")[-1].strip()
+        else:
+            motion_desc = generated_text.strip()
+        # Clean up any leftover prompt fragments
+        for marker in [MOTION_PROMPT[:40], "Watch this video", "Do NOT describe"]:
+            if marker in motion_desc:
+                motion_desc = motion_desc.split(marker)[0].strip()
+        if motion_desc:
+            print(f"[SmolVLM] Motion description: {motion_desc[:100]}...")
+            return gr.update(value=motion_desc)
+        else:
+            return gr.update()
+    except Exception as e:
+        print(f"[SmolVLM] Error: {e}")
+        return gr.update()
+# ─────────────────────────────────────────────────────────────────────────────
+# Gradio UI — LTX 2.3 Sync
+# ─────────────────────────────────────────────────────────────────────────────
+# Custom CSS passed to gr.Blocks: centers the title, enlarges the generate
+# button, hides the footer and keeps videos letterboxed.
+css = """
+.main-title { text-align: center; margin-bottom: 0.5em; }
+.generate-btn { min-height: 52px !important; font-size: 1.1em !important; }
+footer { display: none !important; }
+video { object-fit: contain !important; }
+"""
+# Citrus theme with purple primary/secondary hues and a gray neutral hue.
+purple_citrus = gr.themes.Citrus(
+    primary_hue=gr.themes.colors.purple,
+    secondary_hue=gr.themes.colors.purple,
+    neutral_hue=gr.themes.colors.gray,
+)
+with gr.Blocks(title="LTX 2.3 Sync", css=css, theme=purple_citrus) as demo:
+    gr.Markdown("""
+# LTX 2.3 Sync: Fast Character Animation🕺
+**Fast Character Animation with LTX 2.3 Distilled**, using [Lightricks/LTX-2.3-22b-IC-LoRA-Union-Control](https://huggingface.co/Lightricks/LTX-2.3-22b-IC-LoRA-Union-Control) with pose estimation & custom audio inputs for precise lipsync and body movement replication ✨
+    """)
+    # Hidden state — preprocessing is always Pose
+    video_preprocess = gr.State("Pose (DWPose)")
+    with gr.Row():
+        # ── Left column: inputs ──────────────────────────────────────
+        with gr.Column(scale=1):
+            with gr.Row():
+                input_image = gr.Image(
+                    label="Character reference",
+                    type="filepath",
+                )
+                input_video = gr.Video(
+                    label="Motion & audio reference",
+                )
+            with gr.Row():
+                with gr.Column(min_width=160):
+                    prompt = gr.Textbox(
+                    label="Prompt (optional)",
+                    info="tip: describe the motion, body posture, facial expressions of the ref video",
+                    lines=2,
+                    placeholder="the person talks to the camera, making hand gestures",
+                )
+                    duration = gr.Slider(
+                        label="Duration (s)", minimum=1.0, maximum=15.0, value=3.0, step=0.5,
+                    )
+                    auto_describe = gr.Checkbox(
+                        label="Auto-describe motion", value=False, visible=False,
+                        info="Use AI to describe the video's motion as a prompt",
+                    )
+            generate_btn = gr.Button(
+                "Generate", variant="primary", size="lg", elem_classes=["generate-btn"],
+            )
+            with gr.Accordion("Advanced Settings", open=False):
+                enhance_prompt = gr.Checkbox(label="Enhance Prompt", value=True)
+                conditioning_strength = gr.Slider(
+                    label="V2V Conditioning Strength",
+                    info="How closely to follow the reference video's structure",
+                    minimum=0.0, maximum=1.0, value=0.85, step=0.05,
+                )
+                high_res = gr.Checkbox(label="High Resolution (2×)", value=False)
+                use_video_audio = gr.Checkbox(
+                    label="Use Audio from Video", value=True,
+                    info="Extract the audio track from the motion source video",
+                )
+                input_audio = gr.Audio(
+                    label="Override Audio (optional — replaces video audio)",
+                    type="filepath",
+                )
+                seed = gr.Slider(
+                    label="Seed", minimum=0, maximum=MAX_SEED, value=42, step=1,
+                )
+                randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
+                with gr.Row():
+                    width = gr.Number(label="Width", value=768, precision=0)
+                    height = gr.Number(label="Height", value=512, precision=0)
+        # ── Right column: output ─────────────────────────────────────
+        with gr.Column(scale=1):
+            output_video = gr.Video(label="Result", autoplay=True, height=480)
+    gr.Examples(
+        examples=[
+            [
+                "britney-spears-toxic-2004.jpg",
+                "example_2.mp4",
+                "",
+                3.4,
+                0.85,
+                False,
+                True,
+                1824535108,
+                False,
+                512,
+                768,
+            ],
+            [
+                "1 1.jpeg",
+                "1 (2).mp4",
+                "a man speaking while making hand gestures",
+                3.5,
+                0.9,
+                False,
+                True,
+                1723325627,
+                False,
+                512,
+                768,
+            ],
+            [
+                "2 (1).jpeg",
+                "video-5.mp4",
+                "",
+                6.8,
+                0.9,
+                False,
+                True,
+                42,
+                True,
+                512,
+                768,
+            ],
+        ],
+        inputs=[
+            input_image,
+            input_video,
+            prompt,
+            duration,
+            conditioning_strength,
+            enhance_prompt,
+            use_video_audio,
+            seed,
+            randomize_seed,
+            height,
+            width,
+        ],
+        fn = generate_video,
+        cache_examples=True,
+        cache_mode="lazy",
+        outputs=[output_video, seed],
+    )
+    # ── Event handlers ───────────────────────────────────────────────────
+    input_image.change(
+        fn=on_image_upload,
+        inputs=[input_image, input_video, high_res],
+        outputs=[width, height],
+    )
+    input_video.change(
+        fn=on_video_upload,
+        inputs=[input_video, input_image, high_res],
+        outputs=[width, height, duration],
+    )
+    high_res.change(
+        fn=on_highres_toggle,
+        inputs=[input_image, input_video, high_res],
+        outputs=[width, height],
+    )
+    generate_btn.click(
+        fn=generate_video,
+        inputs=[
+            input_image, input_video, prompt, duration,
+            conditioning_strength, enhance_prompt,
+            use_video_audio, seed, randomize_seed, height, width,input_audio
+        ],
+        outputs=[output_video, seed],
+    )
+if __name__ == "__main__":  # entry point: runs only when executed as a script, not on import
+    demo.launch()  # start the Gradio app with default launch settings