import os
import time
import gradio as gr
import torch
from diffusers import AutoencoderKLCogVideoX, CogVideoXDDIMScheduler
from diffusers.utils import export_to_video
from huggingface_hub import login, snapshot_download
from PIL import Image
from transformers import T5EncoderModel, T5Tokenizer
from cogvideo_transformer import CustomCogVideoXTransformer3DModel
from EF_Net import EF_Net
from Sci_Fi_inbetweening_pipeline import CogVideoXEFNetInbetweeningPipeline
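# Note: cogvideo_transformer, EF_Net, and Sci_Fi_inbetweening_pipeline are
# local modules that must sit on the import path next to app.py; they are
# not pip-installable packages.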
# Authenticate with Hugging Face
try:
token = os.environ.get("HF_TOKEN")
if token:
login(token=token)
print("Successfully authenticated with Hugging Face")
else:
print("Warning: HF_TOKEN not found")
except Exception as e:
print(f"Warning: Could not authenticate with HF: {e}")
device = "cuda" if torch.cuda.is_available() else "cpu"
def load_pipeline(dtype_str="bfloat16"):
"""Load the Sci-Fi pipeline at startup"""
print("Loading Sci-Fi pipeline...")
dtype = torch.float16 if dtype_str == "float16" else torch.bfloat16
# Download the entire model repository
print("Downloading model repository from Hugging Face...")
repo_path = snapshot_download(
repo_id="LiuhanChen/Sci-Fi",
local_dir="./Sci-Fi-models",
token=os.environ.get("HF_TOKEN"),
ignore_patterns=["*.md", "*.txt", ".gitattributes"], # Skip unnecessary files
)
print(f"Models downloaded to: {repo_path}")
# Set paths
model_base_path = repo_path
cogvideo_path = os.path.join(model_base_path, "CogVideoX-5b-I2V")
    ef_net_path = os.path.join(model_base_path, "EF_Net", "EF_Net.pt")
print(f"CogVideo path: {cogvideo_path}")
print(f"EF-Net path: {ef_net_path}")
# Verify the EF_Net file exists
if not os.path.exists(ef_net_path):
# Try to list files in the EF_Net directory to debug
ef_net_dir = os.path.join(model_base_path, "EF_Net")
if os.path.exists(ef_net_dir):
print(f"Files in EF_Net directory: {os.listdir(ef_net_dir)}")
raise FileNotFoundError(f"EF-Net weights not found at {ef_net_path}")
# Load models
print("Loading tokenizer and text encoder...")
tokenizer = T5Tokenizer.from_pretrained(os.path.join(cogvideo_path, "tokenizer"))
text_encoder = T5EncoderModel.from_pretrained(
os.path.join(cogvideo_path, "text_encoder")
)
print("Loading transformer...")
transformer = CustomCogVideoXTransformer3DModel.from_pretrained(
os.path.join(cogvideo_path, "transformer")
)
print("Loading VAE...")
vae = AutoencoderKLCogVideoX.from_pretrained(os.path.join(cogvideo_path, "vae"))
print("Loading scheduler...")
scheduler = CogVideoXDDIMScheduler.from_pretrained(
os.path.join(cogvideo_path, "scheduler")
)
# Load EF-Net
print(f"Loading EF-Net from {ef_net_path}...")
EF_Net_model = (
EF_Net(num_layers=4, downscale_coef=8, in_channels=2, num_attention_heads=48)
.requires_grad_(False)
.eval()
)
ckpt = torch.load(ef_net_path, map_location="cpu", weights_only=False)
    EF_Net_state_dict = dict(ckpt["state_dict"])
    missing, unexpected = EF_Net_model.load_state_dict(EF_Net_state_dict, strict=False)
    print(f"[EF-Net loaded] Missing: {len(missing)} | Unexpected: {len(unexpected)}")
# Create pipeline
print("Creating pipeline...")
pipeline = CogVideoXEFNetInbetweeningPipeline(
tokenizer=tokenizer,
text_encoder=text_encoder,
transformer=transformer,
vae=vae,
EF_Net_model=EF_Net_model,
scheduler=scheduler,
)
pipeline.scheduler = CogVideoXDDIMScheduler.from_config(
pipeline.scheduler.config, timestep_spacing="trailing"
)
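    # "trailing" timestep spacing matches the CogVideoX reference inference
    # setup and tends to improve sample quality at low step counts.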
print(f"Moving pipeline to {device}...")
pipeline.to(device)
pipeline = pipeline.to(dtype=dtype)
pipeline.vae.enable_slicing()
pipeline.vae.enable_tiling()
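    # VAE slicing decodes one video at a time and tiling decodes in spatial
    # patches; both cut peak VRAM during decoding at a small speed cost.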
print("Pipeline loaded successfully!")
return pipeline
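
# A lower-VRAM variant (assumption: the custom pipeline inherits diffusers'
# DiffusionPipeline offload support) would replace the .to(device) call above
# with
#
#     pipeline.enable_sequential_cpu_offload()
#
# which streams weights to the GPU per-module at a large speed cost.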
# Load pipeline at startup
print("Initializing Sci-Fi pipeline at startup...")
pipe = load_pipeline()
def generate_inbetweening(
first_image: Image.Image,
last_image: Image.Image,
prompt: str,
num_frames: int = 49,
guidance_scale: float = 6.0,
ef_net_weights: float = 1.0,
ef_net_guidance_start: float = 0.0,
ef_net_guidance_end: float = 1.0,
seed: int = 42,
progress=gr.Progress(),
):
"""Generate frame inbetweening video"""
if first_image is None or last_image is None:
return None, "Please upload both start and end frames!"
if not prompt.strip():
return None, "Please provide a text prompt!"
try:
progress(0.2, desc="Starting generation...")
start_time = time.time()
# Generate video
progress(0.4, desc="Processing frames...")
video_frames = pipe(
first_image=first_image,
last_image=last_image,
prompt=prompt,
num_frames=num_frames,
use_dynamic_cfg=False,
guidance_scale=guidance_scale,
generator=torch.Generator(device=device).manual_seed(seed),
EF_Net_weights=ef_net_weights,
EF_Net_guidance_start=ef_net_guidance_start,
EF_Net_guidance_end=ef_net_guidance_end,
).frames[0]
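        # .frames holds a batch of clips; [0] selects the single generated
        # video as a list of PIL images, the format export_to_video expects.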
progress(0.9, desc="Exporting video...")
# Export video
output_path = f"output_{int(time.time())}.mp4"
export_to_video(video_frames, output_path, fps=7)
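        # 49 frames at 7 fps yields a clip of roughly 7 seconds.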
elapsed_time = time.time() - start_time
status_msg = f"Video generated successfully in {elapsed_time:.2f}s"
progress(1.0, desc="Done!")
return output_path, status_msg
except Exception as e:
return None, f"Error: {str(e)}"
# Create Gradio interface
with gr.Blocks(title="Sci-Fi: Frame Inbetweening") as demo:
gr.Markdown(
"""
# Sci-Fi: Symmetric Constraint for Frame Inbetweening
        Upload a start frame and an end frame to generate a smooth inbetweening video.
**Model is pre-loaded and ready to use!**
"""
)
with gr.Tab("Generate"):
with gr.Row():
with gr.Column():
first_image = gr.Image(label="Start Frame", type="pil")
last_image = gr.Image(label="End Frame", type="pil")
with gr.Column():
prompt = gr.Textbox(
label="Prompt",
placeholder="Describe the motion or content...",
lines=3,
)
with gr.Accordion("Advanced Settings", open=False):
num_frames = gr.Slider(
minimum=13,
maximum=49,
value=49,
step=12,
label="Number of Frames",
)
guidance_scale = gr.Slider(
minimum=1.0,
maximum=15.0,
value=6.0,
step=0.5,
label="Guidance Scale",
)
ef_net_weights = gr.Slider(
minimum=0.0,
maximum=2.0,
value=1.0,
step=0.1,
label="EF-Net Weights",
)
ef_net_guidance_start = gr.Slider(
minimum=0.0,
maximum=1.0,
value=0.0,
step=0.1,
label="EF-Net Guidance Start",
)
ef_net_guidance_end = gr.Slider(
minimum=0.0,
maximum=1.0,
value=1.0,
step=0.1,
label="EF-Net Guidance End",
)
seed = gr.Number(label="Seed", value=42, precision=0)
generate_btn = gr.Button("Generate Video", variant="primary", size="lg")
with gr.Row():
output_video = gr.Video(label="Generated Video")
status_text = gr.Textbox(label="Status", lines=2)
generate_btn.click(
fn=generate_inbetweening,
inputs=[
first_image,
last_image,
prompt,
num_frames,
guidance_scale,
ef_net_weights,
ef_net_guidance_start,
ef_net_guidance_end,
seed,
],
outputs=[output_video, status_text],
)
with gr.Tab("Examples"):
gr.Markdown(
"""
## Example Inputs
Try these example frame pairs from the `example_input_pairs/` folder.
"""
)
gr.Examples(
examples=[
[
"example_input_pairs/input_pair1/start.jpg",
"example_input_pairs/input_pair1/end.jpg",
"A smooth transition between frames",
],
[
"example_input_pairs/input_pair2/start.jpg",
"example_input_pairs/input_pair2/end.jpg",
"Natural motion interpolation",
],
],
inputs=[first_image, last_image, prompt],
)
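        # The example paths above are relative to the app's working directory;
        # gr.Examples only pre-fills the inputs, so generation still requires
        # clicking "Generate Video".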
if __name__ == "__main__":
print("App ready - pipeline is loaded and ready for inference!")
demo.launch()