import os
import time

import gradio as gr
import torch
from diffusers import AutoencoderKLCogVideoX, CogVideoXDDIMScheduler
from diffusers.utils import export_to_video
from huggingface_hub import login, snapshot_download
from PIL import Image
from transformers import T5EncoderModel, T5Tokenizer

from cogvideo_transformer import CustomCogVideoXTransformer3DModel
from EF_Net import EF_Net
from Sci_Fi_inbetweening_pipeline import CogVideoXEFNetInbetweeningPipeline

# Authenticate with Hugging Face
try:
    token = os.environ.get("HF_TOKEN")
    if token:
        login(token=token)
        print("Successfully authenticated with Hugging Face")
    else:
        print("Warning: HF_TOKEN not found")
except Exception as e:
    print(f"Warning: Could not authenticate with HF: {e}")

device = "cuda" if torch.cuda.is_available() else "cpu"


def load_pipeline(dtype_str="bfloat16"):
    """Load the Sci-Fi pipeline at startup"""
    print("Loading Sci-Fi pipeline...")

    dtype = torch.float16 if dtype_str == "float16" else torch.bfloat16

    # Download the entire model repository
    print("Downloading model repository from Hugging Face...")
    repo_path = snapshot_download(
        repo_id="LiuhanChen/Sci-Fi",
        local_dir="./Sci-Fi-models",
        token=os.environ.get("HF_TOKEN"),
        ignore_patterns=["*.md", "*.txt", ".gitattributes"],  # Skip unnecessary files
    )
    print(f"Models downloaded to: {repo_path}")

    # Set paths
    model_base_path = repo_path
    cogvideo_path = os.path.join(model_base_path, "CogVideoX-5b-I2V")
    ef_net_path = os.path.join(
        model_base_path, "EF_Net", "EF_Net.pt"
    )  # Changed from .pth to .pt

    print(f"CogVideo path: {cogvideo_path}")
    print(f"EF-Net path: {ef_net_path}")

    # Verify the EF_Net file exists
    if not os.path.exists(ef_net_path):
        # Try to list files in the EF_Net directory to debug
        ef_net_dir = os.path.join(model_base_path, "EF_Net")
        if os.path.exists(ef_net_dir):
            print(f"Files in EF_Net directory: {os.listdir(ef_net_dir)}")
        raise FileNotFoundError(f"EF-Net weights not found at {ef_net_path}")

    # Load models
    print("Loading tokenizer and text encoder...")
    tokenizer = T5Tokenizer.from_pretrained(os.path.join(cogvideo_path, "tokenizer"))
    text_encoder = T5EncoderModel.from_pretrained(
        os.path.join(cogvideo_path, "text_encoder")
    )

    print("Loading transformer...")
    transformer = CustomCogVideoXTransformer3DModel.from_pretrained(
        os.path.join(cogvideo_path, "transformer")
    )

    print("Loading VAE...")
    vae = AutoencoderKLCogVideoX.from_pretrained(os.path.join(cogvideo_path, "vae"))

    print("Loading scheduler...")
    scheduler = CogVideoXDDIMScheduler.from_pretrained(
        os.path.join(cogvideo_path, "scheduler")
    )

    # Load EF-Net
    print(f"Loading EF-Net from {ef_net_path}...")
    EF_Net_model = (
        EF_Net(num_layers=4, downscale_coef=8, in_channels=2, num_attention_heads=48)
        .requires_grad_(False)
        .eval()
    )

    ckpt = torch.load(ef_net_path, map_location="cpu", weights_only=False)
    EF_Net_state_dict = {name: params for name, params in ckpt["state_dict"].items()}
    m, u = EF_Net_model.load_state_dict(EF_Net_state_dict, strict=False)
    print(f"[EF-Net loaded] Missing: {len(m)} | Unexpected: {len(u)}")

    # Create pipeline
    print("Creating pipeline...")
    pipeline = CogVideoXEFNetInbetweeningPipeline(
        tokenizer=tokenizer,
        text_encoder=text_encoder,
        transformer=transformer,
        vae=vae,
        EF_Net_model=EF_Net_model,
        scheduler=scheduler,
    )
    pipeline.scheduler = CogVideoXDDIMScheduler.from_config(
        pipeline.scheduler.config, timestep_spacing="trailing"
    )

    print(f"Moving pipeline to {device}...")
    pipeline.to(device)
    pipeline = pipeline.to(dtype=dtype)

    pipeline.vae.enable_slicing()
    pipeline.vae.enable_tiling()

    print("Pipeline loaded successfully!")
    return pipeline


# Load pipeline at startup
print("Initializing Sci-Fi pipeline at startup...")
pipe = load_pipeline()


def generate_inbetweening(
    first_image: Image.Image,
    last_image: Image.Image,
    prompt: str,
    num_frames: int = 49,
    guidance_scale: float = 6.0,
    ef_net_weights: float = 1.0,
    ef_net_guidance_start: float = 0.0,
    ef_net_guidance_end: float = 1.0,
    seed: int = 42,
    progress=gr.Progress(),
):
    """Generate frame inbetweening video"""
    if first_image is None or last_image is None:
        return None, "Please upload both start and end frames!"

    if not prompt.strip():
        return None, "Please provide a text prompt!"

    try:
        progress(0.2, desc="Starting generation...")
        start_time = time.time()

        # Generate video
        progress(0.4, desc="Processing frames...")
        video_frames = pipe(
            first_image=first_image,
            last_image=last_image,
            prompt=prompt,
            num_frames=num_frames,
            use_dynamic_cfg=False,
            guidance_scale=guidance_scale,
            generator=torch.Generator(device=device).manual_seed(seed),
            EF_Net_weights=ef_net_weights,
            EF_Net_guidance_start=ef_net_guidance_start,
            EF_Net_guidance_end=ef_net_guidance_end,
        ).frames[0]

        progress(0.9, desc="Exporting video...")

        # Export video
        output_path = f"output_{int(time.time())}.mp4"
        export_to_video(video_frames, output_path, fps=7)

        elapsed_time = time.time() - start_time
        status_msg = f"Video generated successfully in {elapsed_time:.2f}s"

        progress(1.0, desc="Done!")
        return output_path, status_msg

    except Exception as e:
        return None, f"Error: {str(e)}"


# Create Gradio interface
with gr.Blocks(title="Sci-Fi: Frame Inbetweening") as demo:
    gr.Markdown(
        """
    # Sci-Fi: Symmetric Constraint for Frame Inbetweening

    Upload start and end frames to generate smooth inbetweening video.

    **Model is pre-loaded and ready to use!**
    """
    )

    with gr.Tab("Generate"):
        with gr.Row():
            with gr.Column():
                first_image = gr.Image(label="Start Frame", type="pil")
                last_image = gr.Image(label="End Frame", type="pil")

            with gr.Column():
                prompt = gr.Textbox(
                    label="Prompt",
                    placeholder="Describe the motion or content...",
                    lines=3,
                )

                with gr.Accordion("Advanced Settings", open=False):
                    num_frames = gr.Slider(
                        minimum=13,
                        maximum=49,
                        value=49,
                        step=12,
                        label="Number of Frames",
                    )
                    guidance_scale = gr.Slider(
                        minimum=1.0,
                        maximum=15.0,
                        value=6.0,
                        step=0.5,
                        label="Guidance Scale",
                    )
                    ef_net_weights = gr.Slider(
                        minimum=0.0,
                        maximum=2.0,
                        value=1.0,
                        step=0.1,
                        label="EF-Net Weights",
                    )
                    ef_net_guidance_start = gr.Slider(
                        minimum=0.0,
                        maximum=1.0,
                        value=0.0,
                        step=0.1,
                        label="EF-Net Guidance Start",
                    )
                    ef_net_guidance_end = gr.Slider(
                        minimum=0.0,
                        maximum=1.0,
                        value=1.0,
                        step=0.1,
                        label="EF-Net Guidance End",
                    )
                    seed = gr.Number(label="Seed", value=42, precision=0)

                generate_btn = gr.Button("Generate Video", variant="primary", size="lg")

        with gr.Row():
            output_video = gr.Video(label="Generated Video")
            status_text = gr.Textbox(label="Status", lines=2)

        generate_btn.click(
            fn=generate_inbetweening,
            inputs=[
                first_image,
                last_image,
                prompt,
                num_frames,
                guidance_scale,
                ef_net_weights,
                ef_net_guidance_start,
                ef_net_guidance_end,
                seed,
            ],
            outputs=[output_video, status_text],
        )

    with gr.Tab("Examples"):
        gr.Markdown(
            """
        ## Example Inputs

        Try these example frame pairs from the `example_input_pairs/` folder.
        """
        )

        gr.Examples(
            examples=[
                [
                    "example_input_pairs/input_pair1/start.jpg",
                    "example_input_pairs/input_pair1/end.jpg",
                    "A smooth transition between frames",
                ],
                [
                    "example_input_pairs/input_pair2/start.jpg",
                    "example_input_pairs/input_pair2/end.jpg",
                    "Natural motion interpolation",
                ],
            ],
            inputs=[first_image, last_image, prompt],
        )

if __name__ == "__main__":
    print("App ready - pipeline is loaded and ready for inference!")
    demo.launch()