Keeby-smilyai committed on
Commit 3da6811 · verified · 1 Parent(s): 0feb44a

Update app.py

Files changed (1)
  1. app.py +603 -695
app.py CHANGED
@@ -1,31 +1,76 @@
  import gradio as gr
  import tensorflow as tf
  import keras
  from huggingface_hub import hf_hub_download
  import json
- import os
- from tokenizers import Tokenizer
  import numpy as np
  import time

- # ============================================================================
- # 🎊 FESTIVE MODE TOGGLE 🎊
- # ============================================================================
- FESTIVE = True  # Set to False for production-only mode
-
- # ============================================================================
- # Configuration & Model Loading
- # ============================================================================
-
- print("🚀 Loading SAM-Z-1 Model...")
-
- MODEL_REPO = "Smilyai-labs/Sam-Z-1-tensorflow"
- CACHE_DIR = "./model_cache"
-
- # ============================================================================
- # Model Architecture Definitions (FIXED for model loading)
- # ============================================================================

  @keras.saving.register_keras_serializable()
  class RotaryEmbedding(keras.layers.Layer):
      def __init__(self, dim, max_len=2048, theta=10000, **kwargs):
@@ -36,18 +81,14 @@ class RotaryEmbedding(keras.layers.Layer):
          self.built_cache = False

      def build(self, input_shape):
-         # Use the ORIGINAL training code - compute cache on first call, not in build
          super().build(input_shape)

      def _build_cache(self):
-         """Build RoPE cache on first forward pass"""
          if not self.built_cache:
              inv_freq = 1.0 / (self.theta ** (tf.range(0, self.dim, 2, dtype=tf.float32) / self.dim))
              t = tf.range(self.max_len, dtype=tf.float32)
              freqs = tf.einsum("i,j->ij", t, inv_freq)
              emb = tf.concat([freqs, freqs], axis=-1)
-
-             # Store as numpy arrays to avoid graph issues
              self.cos_cached = tf.constant(np.cos(emb.numpy()), dtype=tf.float32)
              self.sin_cached = tf.constant(np.sin(emb.numpy()), dtype=tf.float32)
              self.built_cache = True
@@ -57,17 +98,13 @@ class RotaryEmbedding(keras.layers.Layer):
          return tf.concat([-x2, x1], axis=-1)

      def call(self, q, k):
-         # Build cache on first call (avoids build-time issues)
          self._build_cache()
-
          seq_len = tf.shape(q)[2]
          dtype = q.dtype
          cos = tf.cast(self.cos_cached[:seq_len, :], dtype)[None, None, :, :]
          sin = tf.cast(self.sin_cached[:seq_len, :], dtype)[None, None, :, :]
-
          q_rotated = (q * cos) + (self.rotate_half(q) * sin)
          k_rotated = (k * cos) + (self.rotate_half(k) * sin)
-
          return q_rotated, k_rotated

      def get_config(self):
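The `call` in this hunk applies the standard RoPE rotation: channels are split in half, rotated as pairs, and recombined with the cached cos/sin tables. A minimal NumPy sketch of the same math, handy for sanity-checking the layer (function and variable names here are illustrative, not part of app.py):

```python
import numpy as np

def rope_rotate(x, pos, dim, theta=10000.0):
    # x: (dim,) slice of one query/key head at sequence position `pos`
    inv_freq = 1.0 / (theta ** (np.arange(0, dim, 2) / dim))
    angles = pos * inv_freq                      # (dim/2,)
    cos = np.concatenate([np.cos(angles)] * 2)   # mirrors tf.concat([freqs, freqs], -1)
    sin = np.concatenate([np.sin(angles)] * 2)
    x1, x2 = x[:dim // 2], x[dim // 2:]
    rotated = np.concatenate([-x2, x1])          # mirrors rotate_half
    return x * cos + rotated * sin

q = np.random.randn(64)
print(rope_rotate(q, pos=5, dim=64)[:4])         # rotated head slice
```

Rotating q and k this way makes their dot product depend only on the relative offset between positions, which is why no additive positional embedding appears anywhere else in the model.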
@@ -75,7 +112,6 @@ class RotaryEmbedding(keras.layers.Layer):
          config.update({"dim": self.dim, "max_len": self.max_len, "theta": self.theta})
          return config

-
  @keras.saving.register_keras_serializable()
  class RMSNorm(keras.layers.Layer):
      def __init__(self, epsilon=1e-5, **kwargs):
@@ -94,7 +130,6 @@ class RMSNorm(keras.layers.Layer):
          config.update({"epsilon": self.epsilon})
          return config

-
  @keras.saving.register_keras_serializable()
  class TransformerBlock(keras.layers.Layer):
      def __init__(self, d_model, n_heads, ff_dim, dropout, max_len, rope_theta, layer_idx=0, **kwargs):
@@ -110,25 +145,20 @@ class TransformerBlock(keras.layers.Layer):

          self.pre_attn_norm = RMSNorm()
          self.pre_ffn_norm = RMSNorm()
-
          self.q_proj = keras.layers.Dense(d_model, use_bias=False, name="q_proj")
          self.k_proj = keras.layers.Dense(d_model, use_bias=False, name="k_proj")
          self.v_proj = keras.layers.Dense(d_model, use_bias=False, name="v_proj")
          self.out_proj = keras.layers.Dense(d_model, use_bias=False, name="o_proj")
-
          self.rope = RotaryEmbedding(self.head_dim, max_len=max_len, theta=rope_theta)
-
          self.gate_proj = keras.layers.Dense(ff_dim, use_bias=False, name="gate_proj")
          self.up_proj = keras.layers.Dense(ff_dim, use_bias=False, name="up_proj")
          self.down_proj = keras.layers.Dense(d_model, use_bias=False, name="down_proj")
-
          self.dropout = keras.layers.Dropout(dropout)

      def call(self, x, training=None):
          B, T, D = tf.shape(x)[0], tf.shape(x)[1], self.d_model
          dtype = x.dtype

-         # Attention
          res = x
          y = self.pre_attn_norm(x)
@@ -139,19 +169,14 @@ class TransformerBlock(keras.layers.Layer):
          q, k = self.rope(q, k)

          scores = tf.matmul(q, k, transpose_b=True) / tf.sqrt(tf.cast(self.head_dim, dtype))
-
-         mask = tf.where(
-             tf.linalg.band_part(tf.ones([T, T], dtype=dtype), -1, 0) == 0,
-             tf.constant(-1e9, dtype=dtype),
-             tf.constant(0.0, dtype=dtype)
-         )
          scores += mask
          attn = tf.matmul(tf.nn.softmax(scores, axis=-1), v)

          attn = tf.reshape(tf.transpose(attn, [0, 2, 1, 3]), [B, T, D])
          x = res + self.dropout(self.out_proj(attn), training=training)

-         # FFN (SwiGLU)
          res = x
          y = self.pre_ffn_norm(x)
          ffn = self.down_proj(keras.activations.silu(self.gate_proj(y)) * self.up_proj(y))
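`tf.linalg.band_part(ones, -1, 0)` keeps the lower triangle, so the mask (removed here and re-inlined more compactly in the new version) adds -1e9 to every score above the diagonal before the softmax. Evaluated standalone for T = 4:

```python
import tensorflow as tf

T = 4
tril = tf.linalg.band_part(tf.ones([T, T]), -1, 0)  # lower-triangular ones
mask = tf.where(tril == 0, -1e9, 0.0)
print(mask.numpy())
# row i can attend to columns 0..i only:
# [[ 0. -1e9 -1e9 -1e9]
#  [ 0.   0. -1e9 -1e9]
#  [ 0.   0.   0. -1e9]
#  [ 0.   0.   0.   0.]]
```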
@@ -161,17 +186,12 @@ class TransformerBlock(keras.layers.Layer):
      def get_config(self):
          config = super().get_config()
          config.update({
-             "d_model": self.d_model,
-             "n_heads": self.n_heads,
-             "ff_dim": self.ff_dim,
-             "dropout": self.dropout_rate,
-             "max_len": self.max_len,
-             "rope_theta": self.rope_theta,
-             "layer_idx": self.layer_idx
          })
          return config

-
  @keras.saving.register_keras_serializable()
  class SAM1Model(keras.Model):
      def __init__(self, **kwargs):
@@ -184,31 +204,22 @@ class SAM1Model(keras.Model):
          self.cfg = kwargs.get('cfg', kwargs)

          self.embed = keras.layers.Embedding(self.cfg['vocab_size'], self.cfg['d_model'], name="embed_tokens")
-
          ff_dim = int(self.cfg['d_model'] * self.cfg['ff_mult'])
          block_args = {
-             'd_model': self.cfg['d_model'],
-             'n_heads': self.cfg['n_heads'],
-             'ff_dim': ff_dim,
-             'dropout': self.cfg['dropout'],
-             'max_len': self.cfg['max_len'],
-             'rope_theta': self.cfg['rope_theta']
          }

-         self.blocks = []
-         for i in range(self.cfg['n_layers']):
-             block = TransformerBlock(name=f"block_{i}", layer_idx=i, **block_args)
-             self.blocks.append(block)
-
          self.norm = RMSNorm(name="final_norm")
          self.lm_head = keras.layers.Dense(self.cfg['vocab_size'], use_bias=False, name="lm_head")

      def call(self, input_ids, training=None):
          x = self.embed(input_ids)
-
          for block in self.blocks:
              x = block(x, training=training)
-
          return self.lm_head(self.norm(x))

      def get_config(self):
@@ -216,704 +227,601 @@ class SAM1Model(keras.Model):
          base_config['config'] = self.cfg
          return base_config

- print("✅ Model architecture registered")
-
- # Download model files
- config_path = hf_hub_download(MODEL_REPO, "config.json", cache_dir=CACHE_DIR)
-
- # Try to download checkpoint weights first (more reliable)
- try:
-     weights_path = hf_hub_download(MODEL_REPO, "ckpt.weights.h5", cache_dir=CACHE_DIR)
-     print("✅ Found checkpoint weights (ckpt.weights.h5)")
-     use_checkpoint = True
- except Exception as e:
-     print(f"⚠️ Checkpoint not found, falling back to model.keras: {e}")
-     model_path = hf_hub_download(MODEL_REPO, "model.keras", cache_dir=CACHE_DIR)
-     use_checkpoint = False
-
- # Load config
- with open(config_path, 'r') as f:
-     config = json.load(f)
-
- # Create tokenizer from scratch
- print("📦 Creating tokenizer from GPT-2 base...")
- from transformers import AutoTokenizer
-
- hf_tokenizer = AutoTokenizer.from_pretrained("gpt2")
-
- # Add custom tokens to match model's vocab size
- custom_tokens = ["<|im_start|>", "<|im_end|>", "<think>", "<think/>"]
- hf_tokenizer.add_special_tokens({"additional_special_tokens": custom_tokens})
-
- # Save and reload as tokenizers format
- os.makedirs("./temp_tokenizer", exist_ok=True)
- hf_tokenizer.save_pretrained("./temp_tokenizer")
- tokenizer = Tokenizer.from_file("./temp_tokenizer/tokenizer.json")
-
- print(f"✅ Tokenizer created with vocab size: {tokenizer.get_vocab_size()}")
- print(f"   Custom tokens added: {custom_tokens}")
- print(f"   Model vocab size: {config.get('vocab_size', 'unknown')}")
-
- # Verify vocab sizes match
- if tokenizer.get_vocab_size() != config.get('vocab_size'):
-     print(f"⚠️ WARNING: Tokenizer vocab ({tokenizer.get_vocab_size()}) != Model vocab ({config.get('vocab_size')})")
-     print(f"   Model was trained with these tokens, but SAM-Z-1 doesn't use <think> tags in generation")
-
- eos_token_id = config.get('eos_token_id', 50256)
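For reference, the removed tokenizer setup extends GPT-2's 50,257-token vocabulary with four special tokens, so the model's `vocab_size` is expected to land at 50,261. A minimal standalone version of that consistency check (assuming the `transformers` package, as the removed code does):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
assert len(tok) == 50257  # GPT-2 base vocabulary
tok.add_special_tokens({"additional_special_tokens":
                        ["<|im_start|>", "<|im_end|>", "<think>", "<think/>"]})
print(len(tok))           # 50261 -- should equal config["vocab_size"]
```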
  # ==============================================================================
- # Load Model - Priority: checkpoint weights > saved model
  # ==============================================================================
- print("\n🔄 Loading model...")
-
- if use_checkpoint:
-     print("📦 Building model from config and loading checkpoint weights...")
-
-     # Build model from scratch with config
-     model_config = {
-         'vocab_size': config['vocab_size'],
-         'd_model': config['hidden_size'],
-         'n_layers': config['num_hidden_layers'],
-         'n_heads': config['num_attention_heads'],
-         'ff_mult': config['intermediate_size'] / config['hidden_size'],
-         'max_len': config['max_position_embeddings'],
-         'dropout': 0.1,  # Default dropout
-         'rope_theta': config['rope_theta']
-     }
-
-     model = SAM1Model(config=model_config)
-
-     # Build model by running a dummy forward pass
-     dummy_input = tf.zeros((1, config['max_position_embeddings']), dtype=tf.int32)
-     _ = model(dummy_input, training=False)
-
-     print(f"✅ Model architecture built: {model.count_params():,} parameters")
-
-     # Load checkpoint weights
-     print(f"📥 Loading checkpoint weights from: {weights_path}")
-     model.load_weights(weights_path)
-     print("✅ Checkpoint weights loaded successfully!")
-
- else:
-     print("📦 Loading full saved model...")
-     try:
-         model = keras.models.load_model(model_path, compile=False)
-         print("✅ Model loaded successfully")
-     except Exception as e:
-         print(f"❌ Failed to load model: {e}")
-         print("\n🔄 Trying alternative: building from config + loading weights...")
-
-         # Fallback to building model
-         model_config = {
-             'vocab_size': config['vocab_size'],
-             'd_model': config['hidden_size'],
-             'n_layers': config['num_hidden_layers'],
-             'n_heads': config['num_attention_heads'],
-             'ff_mult': config['intermediate_size'] / config['hidden_size'],
-             'max_len': config['max_position_embeddings'],
-             'dropout': 0.1,
-             'rope_theta': config['rope_theta']
          }
-
-         model = SAM1Model(config=model_config)
-         dummy_input = tf.zeros((1, config['max_position_embeddings']), dtype=tf.int32)
-         _ = model(dummy_input, training=False)
-
-         # Try to load weights from model.keras
-         try:
-             temp_model = keras.models.load_model(model_path, compile=False)
-             model.set_weights(temp_model.get_weights())
-             print("✅ Weights transferred successfully")
-         except:
-             print("❌ Could not load weights - model may not work correctly!")
-             raise
-
- # Create optimized inference function
- @tf.function(reduce_retracing=True)
- def fast_forward(input_tensor):
-     """TF-optimized forward pass for faster generation"""
-     return model(input_tensor, training=False)
-
- print(f"✅ Model loaded: {config['num_hidden_layers']} layers, {config['vocab_size']} vocab")
- print(f"✅ TF function optimization enabled for faster inference")
-
- # Global stop flag
- stop_generation = False
-
- # ============================================================================
- # Generation Function with Streaming & Stop Button
- # ============================================================================
-
- def generate_stream(
-     prompt: str,
-     max_tokens: int = 512,
-     temperature: float = 0.8,
-     top_k: int = 40,
-     top_p: float = 0.9,
-     repetition_penalty: float = 1.1
- ):
-     """Generate text with streaming output and stop support"""
-     global stop_generation
-     stop_generation = False
-
-     # Tokenize prompt
-     input_ids = [i for i in tokenizer.encode(prompt).ids if i != eos_token_id]
-
-     if len(input_ids) == 0:
-         yield "⚠️ Empty prompt after tokenization"
-         return
-
-     if len(input_ids) > config['max_position_embeddings'] - max_tokens:
-         input_ids = input_ids[-(config['max_position_embeddings'] - max_tokens):]
-
-     input_tensor = tf.constant([input_ids], dtype=tf.int32)
-     generated_text = ""
-     token_count = 0
-
-     # Track token frequencies for repetition penalty
-     token_freq = {}

      start_time = time.time()

      for step in range(max_tokens):
-         # Check stop flag
-         if stop_generation:
-             generated_text += "\n\n*[Generation stopped by user]*"
-             yield generated_text
-             break

-         # Get logits using optimized TF function
-         logits = fast_forward(input_tensor)
-         next_token_logits = logits[0, -1, :].numpy()

-         # Apply temperature
-         next_token_logits = next_token_logits / temperature

-         # Apply repetition penalty
-         if repetition_penalty != 1.0:
-             for token_id, freq in token_freq.items():
-                 if token_id < len(next_token_logits):
-                     next_token_logits[token_id] /= (repetition_penalty ** freq)

-         # Top-k filtering
-         if top_k > 0:
-             top_k_indices = np.argpartition(next_token_logits, -top_k)[-top_k:]
-             top_k_logits = next_token_logits[top_k_indices]
-             top_k_probs = tf.nn.softmax(top_k_logits).numpy()

-             # Top-p (nucleus) sampling
-             if top_p < 1.0:
-                 sorted_indices = np.argsort(top_k_probs)[::-1]
-                 cumsum = np.cumsum(top_k_probs[sorted_indices])
-                 cutoff_idx = np.searchsorted(cumsum, top_p)
-                 nucleus_indices = sorted_indices[:cutoff_idx + 1]
-
-                 nucleus_logits = top_k_logits[nucleus_indices]
-                 nucleus_probs = tf.nn.softmax(nucleus_logits).numpy()
-
-                 sampled_idx = np.random.choice(len(nucleus_probs), p=nucleus_probs)
-                 next_token_id = int(top_k_indices[nucleus_indices[sampled_idx]])
-             else:
-                 sampled_idx = np.random.choice(len(top_k_probs), p=top_k_probs)
-                 next_token_id = int(top_k_indices[sampled_idx])
          else:
-             probs = tf.nn.softmax(next_token_logits).numpy()
-             next_token_id = np.random.choice(len(probs), p=probs)
-
-         # Stop on EOS
-         if next_token_id == eos_token_id:
-             break

-         # Update token frequency
-         token_freq[next_token_id] = token_freq.get(next_token_id, 0) + 1

-         # Decode and yield
-         token_text = tokenizer.decode([next_token_id])
-         generated_text += token_text
-         token_count += 1

-         # Yield progressive output
-         yield generated_text

-         # Update input
-         input_tensor = tf.concat([input_tensor, [[next_token_id]]], axis=1)

-         # Truncate if too long
-         if input_tensor.shape[1] > config['max_position_embeddings']:
-             input_tensor = input_tensor[:, -config['max_position_embeddings']:]

-     # Calculate stats
      elapsed = time.time() - start_time
-     tokens_per_sec = token_count / elapsed if elapsed > 0 else 0
-
-     # Add generation stats
-     if token_count > 0 and not stop_generation:
-         generated_text += f"\n\n*[Generated {token_count} tokens in {elapsed:.1f}s ({tokens_per_sec:.1f} tok/s)]*"
-
-     yield generated_text
-
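The removed sampler applies temperature, a frequency-scaled repetition penalty, top-k, then nucleus (top-p) filtering before drawing a token. Distilled to its core in plain NumPy (an illustrative helper, same order of operations):

```python
import numpy as np

def sample(logits, temperature=0.8, top_k=40, top_p=0.9):
    logits = logits / temperature
    top_idx = np.argpartition(logits, -top_k)[-top_k:]       # keep the k best
    shifted = logits[top_idx] - logits[top_idx].max()        # stable softmax
    probs = np.exp(shifted) / np.exp(shifted).sum()
    order = np.argsort(probs)[::-1]                          # nucleus: smallest set
    cut = np.searchsorted(np.cumsum(probs[order]), top_p)    #   covering mass top_p
    keep = order[:cut + 1]
    p = probs[keep] / probs[keep].sum()
    return int(top_idx[keep[np.random.choice(len(p), p=p)]])
```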
- # ============================================================================
- # Chat Interface Logic
- # ============================================================================
-
- def format_chat_prompt(message: str, history: list) -> str:
-     """Format message history into chat prompt"""
-     prompt = ""
-
-     # Add history
-     for user_msg, assistant_msg in history:
-         prompt += f"<|im_start|>user\n{user_msg}<|im_end|>\n"
-         if assistant_msg:
-             prompt += f"<|im_start|>assistant\n{assistant_msg}<|im_end|>\n"
-
-     # Add current message
-     prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
-
-     return prompt
-
- def chat_stream(
-     message: str,
-     history: list,
-     max_tokens: int,
-     temperature: float,
-     top_k: int,
-     top_p: float,
-     repetition_penalty: float
- ):
-     """Streaming chat response"""
-     if not message.strip():
-         yield history
-         return
-
-     # Format prompt
-     prompt = format_chat_prompt(message, history)
-
-     # Generate with streaming
-     partial_response = ""
-     for generated in generate_stream(
-         prompt,
-         max_tokens=max_tokens,
-         temperature=temperature,
-         top_k=top_k,
-         top_p=top_p,
-         repetition_penalty=repetition_penalty
-     ):
-         partial_response = generated
-
-         # Stop at end tags
-         if "<|im_end|>" in partial_response:
-             partial_response = partial_response.split("<|im_end|>")[0]
-
-         # Update history
-         yield history + [[message, partial_response.strip()]]
-
- def stop_gen():
-     """Stop generation callback"""
-     global stop_generation
-     stop_generation = True
-     return None
-
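For a concrete picture, the removed `format_chat_prompt("How are you?", [("Hi", "Hello!")])` produces the ChatML-style transcript below, ending mid-turn so the model completes the assistant message:

```
<|im_start|>user
Hi<|im_end|>
<|im_start|>assistant
Hello!<|im_end|>
<|im_start|>user
How are you?<|im_end|>
<|im_start|>assistant
```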
- # ============================================================================
- # Gradio UI
- # ============================================================================
-
- # Festive CSS
- festive_css = """
- .gradio-container {
-     max-width: 1200px !important;
-     margin: auto !important;
- }
-
- .header {
-     text-align: center;
-     padding: 2rem;
-     background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
-     color: white;
-     border-radius: 12px;
-     margin-bottom: 2rem;
-     box-shadow: 0 8px 32px rgba(240, 147, 251, 0.3);
-     animation: pulse 2s ease-in-out infinite;
- }
-
- @keyframes pulse {
-     0%, 100% { transform: scale(1); }
-     50% { transform: scale(1.02); }
- }
-
- .header h1 {
-     font-size: 2.8rem;
-     margin-bottom: 0.5rem;
-     font-weight: 700;
-     text-shadow: 2px 2px 4px rgba(0,0,0,0.2);
- }
-
- .header p {
-     font-size: 1.1rem;
-     opacity: 0.95;
- }
-
- .celebration {
-     font-size: 2rem;
-     margin: 0.5rem;
-     animation: bounce 1s ease infinite;
- }
-
- @keyframes bounce {
-     0%, 100% { transform: translateY(0); }
-     50% { transform: translateY(-10px); }
- }
-
- .stats-card {
-     background: linear-gradient(135deg, #ffecd2 0%, #fcb69f 100%);
-     padding: 1.5rem;
-     border-radius: 12px;
-     border-left: 4px solid #f5576c;
-     margin: 1rem 0;
-     box-shadow: 0 4px 16px rgba(252, 182, 159, 0.3);
- }
-
- .twin-badge {
-     display: inline-block;
-     background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-     color: white;
-     padding: 0.5rem 1rem;
-     border-radius: 20px;
-     font-weight: bold;
-     margin: 0.5rem;
-     box-shadow: 0 4px 12px rgba(102, 126, 234, 0.3);
- }

- footer {
-     text-align: center;
-     padding: 2rem;
-     color: #666;
-     border-top: 1px solid #eee;
-     margin-top: 2rem;
- }

- .confetti {
-     position: fixed;
-     width: 10px;
-     height: 10px;
-     background: #f5576c;
-     position: absolute;
-     animation: confetti-fall 3s linear infinite;
- }

- @keyframes confetti-fall {
-     to { transform: translateY(100vh) rotate(360deg); }
- }
- """

- # Production CSS
- production_css = """
- .gradio-container {
-     max-width: 1200px !important;
-     margin: auto !important;
- }

- .header {
-     text-align: center;
-     padding: 2rem;
-     background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-     color: white;
      border-radius: 12px;
-     margin-bottom: 2rem;
- }
-
- .header h1 {
-     font-size: 2.5rem;
-     margin-bottom: 0.5rem;
-     font-weight: 700;
- }
-
- .header p {
-     font-size: 1.1rem;
-     opacity: 0.95;
  }
-
- .stats-card {
-     background: #f8f9fa;
-     padding: 1rem;
-     border-radius: 8px;
-     border-left: 4px solid #667eea;
-     margin: 1rem 0;
  }
-
- footer {
-     text-align: center;
-     padding: 2rem;
-     color: #666;
-     border-top: 1px solid #eee;
-     margin-top: 2rem;
  }
  """
- # Select CSS based on mode
- custom_css = festive_css if FESTIVE else production_css
-
- # Build interface
- with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
-     # Header
-     if FESTIVE:
-         gr.HTML("""
-         <div class="header">
-             <div class="celebration">🎉 🎊 ✨ 🎈 🎆</div>
-             <img src="https://cdn-uploads.huggingface.co/production/uploads/64e3486b82fb6ae7a06c749c/yBUDdaTze1L84NaDSpZGf.jpeg"
-                  alt="SAM-Z-1"
-                  style="max-width: 400px; border-radius: 12px; margin: 1rem auto; display: block; box-shadow: 0 8px 24px rgba(0,0,0,0.2);">
-             <h1>🤖 SAM-Z-1 Chat 🤖</h1>
-             <p><strong>LATEST RELEASE!</strong> Our <strong>Best</strong> non-reasoning model</p>
-             <div class="twin-badge">Twin of SAM-X-1 (Reasoning Model)</div>
-             <p style="font-size: 0.9rem; margin-top: 1rem;">
-                 768D • 16 Layers • 12 Heads • ~313M Parameters • Trained on TPU v5e-8
-             </p>
-             <div class="celebration">🚀 💫 🎯 ⚡ 🔥</div>
-         </div>
-         """)
-     else:
-         gr.HTML("""
-         <div class="header">
-             <img src="https://cdn-uploads.huggingface.co/production/uploads/64e3486b82fb6ae7a06c749c/yBUDdaTze1L84NaDSpZGf.jpeg"
-                  alt="SAM-Z-1"
-                  style="max-width: 300px; border-radius: 12px; margin: 1rem auto; display: block; box-shadow: 0 4px 16px rgba(0,0,0,0.15);">
-             <h1>🤖 SAM-Z-1 Chat</h1>
-             <p>Fast, direct responses without reasoning overhead</p>
-             <p style="font-size: 0.9rem; margin-top: 0.5rem;">
-                 768D • 16 Layers • 12 Heads • Trained on TPU v5e-8
-             </p>
-         </div>
-         """)
-
-     with gr.Row():
-         with gr.Column(scale=4):
-             # Chat interface with bot avatar
-             chatbot = gr.Chatbot(
-                 height=600,
-                 show_label=False,
-                 avatar_images=(
-                     None,
-                     "https://cdn-uploads.huggingface.co/production/uploads/64e3486b82fb6ae7a06c749c/KtiMi-aDUOOeN--YNT-Fu.jpeg"
-                 ),
-                 bubble_full_width=False
-             )
-
-             with gr.Row():
-                 msg = gr.Textbox(
-                     placeholder="Type your message here..." if not FESTIVE else "Ask me anything! I'm the fast twin! ⚡",
-                     show_label=False,
-                     scale=8,
-                     container=False
                  )
-                 submit_btn = gr.Button("Send 🚀" if FESTIVE else "Send", variant="primary", scale=1)
-                 stop_btn = gr.Button("⏹️ Stop", variant="stop", scale=1)

-             with gr.Row():
-                 clear_btn = gr.Button("🗑️ Clear Chat", size="sm")
-                 retry_btn = gr.Button("🔄 Retry", size="sm")

-         with gr.Column(scale=1):
-             gr.Markdown("### ⚙️ Generation Settings")
-
-             max_tokens = gr.Slider(
-                 minimum=50,
-                 maximum=1024,
-                 value=512,
-                 step=50,
-                 label="Max Tokens",
-                 info="Maximum length of response"
-             )
-
-             temperature = gr.Slider(
-                 minimum=0.1,
-                 maximum=2.0,
-                 value=0.8,
-                 step=0.1,
-                 label="Temperature",
-                 info="Higher = more creative"
-             )
-
-             top_k = gr.Slider(
-                 minimum=1,
-                 maximum=100,
-                 value=40,
-                 step=1,
-                 label="Top-K",
-                 info="Sample from top K tokens"
-             )
-
-             top_p = gr.Slider(
-                 minimum=0.1,
-                 maximum=1.0,
-                 value=0.9,
-                 step=0.05,
-                 label="Top-P",
-                 info="Nucleus sampling threshold"
              )

-             repetition_penalty = gr.Slider(
-                 minimum=1.0,
-                 maximum=2.0,
-                 value=1.1,
-                 step=0.1,
-                 label="Repetition Penalty",
-                 info="Penalize repeated tokens"
-             )

-             gr.Markdown("---")

-             # Model info
-             if FESTIVE:
-                 gr.Markdown(f"""
-                 ### 🎊 SAM-Z-1 Model Info
-
-                 **🎯 The Fast Twin!**
-
-                 **Type:** Direct Response Model
-                 **Parameters:** ~313M
-                 **Context:** {config['max_position_embeddings']} tokens
-                 **Vocab:** {config['vocab_size']}
-                 **Speed:** ⚡ Optimized with TF Functions
-
-                 **Twin Model:**
-                 - **SAM-X-1**: Reasoning model (uses `<think>` tags)
-                 - **SAM-Z-1**: Fast model (no thinking, direct answers! 🎉)
-
-                 **Note:** Model includes `<think>` tokens in vocab but doesn't use them. Training used same tokenizer as SAM-X-1.
-
-                 **Architecture:**
-                 - RoPE positional encoding
-                 - SwiGLU activation
-                 - RMSNorm layers
-                 - No bias terms (efficient!)
-
-                 **Training:**
-                 - Trained from scratch
-                 - TPU v5e-8 (8 cores)
-                 - Mixed precision (bfloat16)
-                 - Cosine decay schedule
-                 """)
-             else:
-                 gr.Markdown(f"""
-                 ### 📊 Model Info
-
-                 **Architecture:** SAM-Z-1 (Direct Response)
-                 **Parameters:** ~313M
-                 **Context:** {config['max_position_embeddings']} tokens
-                 **Vocab:** {config['vocab_size']}
-
-                 **Twin Models:**
-                 - SAM-X-1: Reasoning model (uses `<think>` tags)
-                 - SAM-Z-1: Direct response model (no thinking)
-
-                 **Note:** Vocab includes `<think>` tokens but model doesn't use them in generation.
-
-                 **Features:**
-                 - RoPE positional encoding
-                 - SwiGLU activation
-                 - RMSNorm layers
-                 - TF-optimized inference
-                 """)

-             # Example prompts
-             gr.Examples(
-                 examples=[
-                     "Hi! What can you do?",
-                     "Explain quantum computing in simple terms",
-                     "Write a short poem about AI",
-                     "What's the capital of France?",
-                     "How do I learn programming?",
-                     "Tell me an interesting fact about space",
-                     "What's the difference between you and SAM-X-1?",
-                     "Why are you called the fast twin?",
-                 ],
-                 inputs=msg,
-                 label="💡 Try these examples" if not FESTIVE else "🎯 Try these examples!"
              )

-     # Footer
-     if FESTIVE:
-         gr.HTML("""
-         <footer>
-             <p style="font-size: 1.2rem;"><strong>🎉 SAM-Z-1 - LATEST RELEASE! 🎉</strong></p>
-             <p><strong>The Fast Twin</strong> - Direct responses without reasoning overhead</p>
-             <p style="font-size: 0.9rem; color: #999; margin-top: 0.5rem;">
-                 Trained from scratch on TPU v5e-8 • Built with TensorFlow & Gradio
-             </p>
-             <p style="font-size: 0.9rem; color: #999;">
-                 Twin of SAM-X-1 (reasoning model) • Same architecture, different training objective
-             </p>
-             <div style="margin-top: 1rem; font-size: 1.5rem;">
-                 ⚡ 🚀 💫 ✨ 🎯
-             </div>
-         </footer>
-         """)
-     else:
-         gr.HTML("""
-         <footer>
-             <p><strong>SAM-Z-1</strong> - Direct response language model</p>
-             <p style="font-size: 0.9rem; color: #999;">
-                 Trained from scratch on TPU v5e-8 • Built with TensorFlow & Gradio
-             </p>
-             <p style="font-size: 0.9rem; color: #999;">
-                 Twin of SAM-X-1 (reasoning model)
-             </p>
-         </footer>
-         """)
-
-     # Event handlers
-     submit_event = msg.submit(
-         chat_stream,
-         inputs=[msg, chatbot, max_tokens, temperature, top_k, top_p, repetition_penalty],
-         outputs=[chatbot]
      ).then(
-         lambda: "",
-         outputs=[msg]
      )

-     click_event = submit_btn.click(
-         chat_stream,
-         inputs=[msg, chatbot, max_tokens, temperature, top_k, top_p, repetition_penalty],
-         outputs=[chatbot]
-     ).then(
-         lambda: "",
-         outputs=[msg]
      )

-     # Stop button
-     stop_btn.click(
-         fn=stop_gen,
-         inputs=None,
-         outputs=None,
-         cancels=[submit_event, click_event]
      )

-     clear_btn.click(lambda: ([], ""), outputs=[chatbot, msg])

-     def retry_last(history, max_tok, temp, topk, topp, rep_pen):
-         if not history:
-             return history
-         last_user_msg = history[-1][0]
-         history = history[:-1]
-         for update in chat_stream(last_user_msg, history, max_tok, temp, topk, topp, rep_pen):
-             yield update

-     retry_event = retry_btn.click(
-         retry_last,
-         inputs=[chatbot, max_tokens, temperature, top_k, top_p, repetition_penalty],
-         outputs=[chatbot]
      )

-     stop_btn.click(
-         fn=stop_gen,
-         inputs=None,
-         outputs=None,
-         cancels=[retry_event]
      )

- # Launch
  if __name__ == "__main__":
-     demo.queue(max_size=20)
      demo.launch(
          server_name="0.0.0.0",
          server_port=7860,
+ import os
+ os.environ['KERAS_BACKEND'] = 'tensorflow'
+ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+
  import gradio as gr
  import tensorflow as tf
  import keras
  from huggingface_hub import hf_hub_download
  import json
  import numpy as np
+ from tokenizers import Tokenizer
+ import threading
  import time
+ import queue
+ import hashlib
+ import sqlite3
+ from datetime import datetime
+ from dataclasses import dataclass, field
+ from typing import List, Dict, Optional
+ import uuid

+ # ==============================================================================
+ # GPU/CPU Optimization
+ # ==============================================================================
+ tf.config.threading.set_inter_op_parallelism_threads(2)
+ tf.config.threading.set_intra_op_parallelism_threads(4)
+ tf.config.optimizer.set_jit(True)
+ # ==============================================================================
+ # Database Setup
+ # ==============================================================================
+ def init_db():
+     conn = sqlite3.connect('sam_tasks.db', check_same_thread=False)
+     c = conn.cursor()
+
+     c.execute('''CREATE TABLE IF NOT EXISTS users
+                  (id INTEGER PRIMARY KEY AUTOINCREMENT,
+                   username TEXT UNIQUE NOT NULL,
+                   password_hash TEXT NOT NULL,
+                   created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP)''')
+
+     c.execute('''CREATE TABLE IF NOT EXISTS tasks
+                  (id TEXT PRIMARY KEY,
+                   user_id INTEGER,
+                   model_name TEXT,
+                   prompt TEXT,
+                   status TEXT,
+                   progress INTEGER DEFAULT 0,
+                   result TEXT,
+                   created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+                   completed_at TIMESTAMP,
+                   tokens_generated INTEGER DEFAULT 0,
+                   tokens_per_sec REAL DEFAULT 0,
+                   FOREIGN KEY (user_id) REFERENCES users(id))''')
+
+     # Create admin account
+     admin_pass = hashlib.sha256("admin123".encode()).hexdigest()
+     try:
+         c.execute("INSERT INTO users (username, password_hash) VALUES (?, ?)",
+                   ("admin", admin_pass))
+         conn.commit()
+     except sqlite3.IntegrityError:
+         pass
+
+     conn.commit()
+     return conn

+ db_conn = init_db()
+ db_lock = threading.Lock()
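`check_same_thread=False` only switches off sqlite3's thread-ownership check; the single shared connection stays safe here because every access later in the file is wrapped in `db_lock`. The pattern the new code relies on, in isolation (illustrative names):

```python
import sqlite3
import threading

conn = sqlite3.connect("example.db", check_same_thread=False)
lock = threading.Lock()

def run(sql, params=()):
    # sqlite3 connections are not thread-safe on their own; serialize all access
    with lock:
        cur = conn.cursor()
        cur.execute(sql, params)
        conn.commit()
        return cur.fetchall()
```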
 
+ # ==============================================================================
+ # Model Architecture (Compact)
+ # ==============================================================================
  @keras.saving.register_keras_serializable()
  class RotaryEmbedding(keras.layers.Layer):
      def __init__(self, dim, max_len=2048, theta=10000, **kwargs):

          self.built_cache = False

      def build(self, input_shape):
          super().build(input_shape)

      def _build_cache(self):
          if not self.built_cache:
              inv_freq = 1.0 / (self.theta ** (tf.range(0, self.dim, 2, dtype=tf.float32) / self.dim))
              t = tf.range(self.max_len, dtype=tf.float32)
              freqs = tf.einsum("i,j->ij", t, inv_freq)
              emb = tf.concat([freqs, freqs], axis=-1)
              self.cos_cached = tf.constant(np.cos(emb.numpy()), dtype=tf.float32)
              self.sin_cached = tf.constant(np.sin(emb.numpy()), dtype=tf.float32)
              self.built_cache = True

          return tf.concat([-x2, x1], axis=-1)

      def call(self, q, k):
          self._build_cache()
          seq_len = tf.shape(q)[2]
          dtype = q.dtype
          cos = tf.cast(self.cos_cached[:seq_len, :], dtype)[None, None, :, :]
          sin = tf.cast(self.sin_cached[:seq_len, :], dtype)[None, None, :, :]
          q_rotated = (q * cos) + (self.rotate_half(q) * sin)
          k_rotated = (k * cos) + (self.rotate_half(k) * sin)
          return q_rotated, k_rotated

      def get_config(self):
          config.update({"dim": self.dim, "max_len": self.max_len, "theta": self.theta})
          return config

  @keras.saving.register_keras_serializable()
  class RMSNorm(keras.layers.Layer):
      def __init__(self, epsilon=1e-5, **kwargs):

          config.update({"epsilon": self.epsilon})
          return config

  @keras.saving.register_keras_serializable()
  class TransformerBlock(keras.layers.Layer):
      def __init__(self, d_model, n_heads, ff_dim, dropout, max_len, rope_theta, layer_idx=0, **kwargs):

          self.pre_attn_norm = RMSNorm()
          self.pre_ffn_norm = RMSNorm()
          self.q_proj = keras.layers.Dense(d_model, use_bias=False, name="q_proj")
          self.k_proj = keras.layers.Dense(d_model, use_bias=False, name="k_proj")
          self.v_proj = keras.layers.Dense(d_model, use_bias=False, name="v_proj")
          self.out_proj = keras.layers.Dense(d_model, use_bias=False, name="o_proj")
          self.rope = RotaryEmbedding(self.head_dim, max_len=max_len, theta=rope_theta)
          self.gate_proj = keras.layers.Dense(ff_dim, use_bias=False, name="gate_proj")
          self.up_proj = keras.layers.Dense(ff_dim, use_bias=False, name="up_proj")
          self.down_proj = keras.layers.Dense(d_model, use_bias=False, name="down_proj")
          self.dropout = keras.layers.Dropout(dropout)

      def call(self, x, training=None):
          B, T, D = tf.shape(x)[0], tf.shape(x)[1], self.d_model
          dtype = x.dtype
          res = x
          y = self.pre_attn_norm(x)

          q, k = self.rope(q, k)

          scores = tf.matmul(q, k, transpose_b=True) / tf.sqrt(tf.cast(self.head_dim, dtype))
+         mask = tf.where(tf.linalg.band_part(tf.ones([T, T], dtype=dtype), -1, 0) == 0,
+                         tf.constant(-1e9, dtype=dtype), tf.constant(0.0, dtype=dtype))
          scores += mask
          attn = tf.matmul(tf.nn.softmax(scores, axis=-1), v)

          attn = tf.reshape(tf.transpose(attn, [0, 2, 1, 3]), [B, T, D])
          x = res + self.dropout(self.out_proj(attn), training=training)

          res = x
          y = self.pre_ffn_norm(x)
          ffn = self.down_proj(keras.activations.silu(self.gate_proj(y)) * self.up_proj(y))

      def get_config(self):
          config = super().get_config()
          config.update({
+             "d_model": self.d_model, "n_heads": self.n_heads, "ff_dim": self.ff_dim,
+             "dropout": self.dropout_rate, "max_len": self.max_len,
+             "rope_theta": self.rope_theta, "layer_idx": self.layer_idx
          })
          return config

  @keras.saving.register_keras_serializable()
  class SAM1Model(keras.Model):
      def __init__(self, **kwargs):

          self.cfg = kwargs.get('cfg', kwargs)

          self.embed = keras.layers.Embedding(self.cfg['vocab_size'], self.cfg['d_model'], name="embed_tokens")
          ff_dim = int(self.cfg['d_model'] * self.cfg['ff_mult'])
          block_args = {
+             'd_model': self.cfg['d_model'], 'n_heads': self.cfg['n_heads'],
+             'ff_dim': ff_dim, 'dropout': self.cfg['dropout'],
+             'max_len': self.cfg['max_len'], 'rope_theta': self.cfg['rope_theta']
          }

+         self.blocks = [TransformerBlock(name=f"block_{i}", layer_idx=i, **block_args)
+                        for i in range(self.cfg['n_layers'])]
          self.norm = RMSNorm(name="final_norm")
          self.lm_head = keras.layers.Dense(self.cfg['vocab_size'], use_bias=False, name="lm_head")

      def call(self, input_ids, training=None):
          x = self.embed(input_ids)
          for block in self.blocks:
              x = block(x, training=training)
          return self.lm_head(self.norm(x))

      def get_config(self):
          base_config['config'] = self.cfg
          return base_config

+ # ==============================================================================
+ # KV Cache for SAM-Z (Ultra-Fast)
+ # ==============================================================================
+ @dataclass
+ class KVCache:
+     k_cache: List[tf.Tensor] = field(default_factory=list)
+     v_cache: List[tf.Tensor] = field(default_factory=list)
+
+     def update(self, layer_idx: int, k: tf.Tensor, v: tf.Tensor):
+         if layer_idx >= len(self.k_cache):
+             self.k_cache.append(k)
+             self.v_cache.append(v)
+         else:
+             self.k_cache[layer_idx] = tf.concat([self.k_cache[layer_idx], k], axis=2)
+             self.v_cache[layer_idx] = tf.concat([self.v_cache[layer_idx], v], axis=2)
+         return self.k_cache[layer_idx], self.v_cache[layer_idx]
+
+     def clear(self):
+         self.k_cache.clear()
+         self.v_cache.clear()
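`KVCache` concatenates along axis 2, i.e. the sequence axis of the (B, heads, T, head_dim) layout used in `TransformerBlock`. Note, though, that nothing in this commit ever calls `update()`: `SAM1Model.call` takes no cache argument, so the class is dormant plumbing. How it would be exercised once wired through (hypothetical `k_step*`/`v_step*` tensors, one head-projected K/V pair per decode step):

```python
cache = KVCache()
k, v = cache.update(0, k_step0, v_step0)  # first step: tensors stored as-is
k, v = cache.update(0, k_step1, v_step1)  # later steps: concat on axis 2 (sequence)
cache.clear()                             # reset between prompts
```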
+ # ==============================================================================
+ # Load Models
+ # ==============================================================================
+ print("🚀 Loading SAM Models...")
+
+ # SAM-X-1 (Reasoning with thinking)
+ print("\n📦 Loading SAM-X-1-Large...")
+ samx_weights = hf_hub_download("Smilyai-labs/Sam-1x-instruct", "ckpt.weights.h5")
+ samx_config_path = hf_hub_download("Smilyai-labs/Sam-1x-instruct", "config.json")
+
+ with open(samx_config_path, 'r') as f:
+     samx_cfg = json.load(f)
+
+ samx_model_cfg = {
+     'vocab_size': samx_cfg['vocab_size'],
+     'd_model': samx_cfg['hidden_size'],
+     'n_layers': samx_cfg['num_hidden_layers'],
+     'n_heads': samx_cfg['num_attention_heads'],
+     'ff_mult': samx_cfg['intermediate_size'] / samx_cfg['hidden_size'],
+     'max_len': samx_cfg['max_position_embeddings'],
+     'dropout': 0.0,
+     'rope_theta': samx_cfg['rope_theta']
+ }

+ samx_model = SAM1Model(config=samx_model_cfg)
+ dummy = tf.zeros((1, 1), dtype=tf.int32)
+ _ = samx_model(dummy)
+ samx_model.load_weights(samx_weights)
+ samx_model.trainable = False
+
+ @tf.function(jit_compile=True)
+ def samx_predict(inputs):
+     return samx_model(inputs, training=False)
+
+ print("✅ SAM-X-1 loaded")
+
+ # SAM-Z-1 (Fast with KV cache)
+ print("\n📦 Loading SAM-Z-1...")
+ samz_weights = hf_hub_download("Smilyai-labs/Sam-Z-1-tensorflow", "ckpt.weights.h5")
+ samz_config_path = hf_hub_download("Smilyai-labs/Sam-Z-1-tensorflow", "config.json")
+
+ with open(samz_config_path, 'r') as f:
+     samz_cfg = json.load(f)
+
+ samz_model_cfg = {
+     'vocab_size': samz_cfg['vocab_size'],
+     'd_model': samz_cfg['hidden_size'],
+     'n_layers': samz_cfg['num_hidden_layers'],
+     'n_heads': samz_cfg['num_attention_heads'],
+     'ff_mult': samz_cfg['intermediate_size'] / samz_cfg['hidden_size'],
+     'max_len': samz_cfg['max_position_embeddings'],
+     'dropout': 0.0,
+     'rope_theta': samz_cfg['rope_theta']
+ }

+ samz_model = SAM1Model(config=samz_model_cfg)
+ _ = samz_model(dummy)
+ samz_model.load_weights(samz_weights)
+ samz_model.trainable = False

+ @tf.function(jit_compile=True)
+ def samz_predict(inputs):
+     return samz_model(inputs, training=False)

+ print("✅ SAM-Z-1 loaded")

+ # Tokenizer
+ tokenizer_path = hf_hub_download("Smilyai-labs/Sam-1x-instruct", "tokenizer.json")
+ tokenizer = Tokenizer.from_file(tokenizer_path)
+ eos_token_id = 50256

+ print(f"✅ Tokenizer ready (vocab: {tokenizer.get_vocab_size()})")
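One practical caveat with `@tf.function(jit_compile=True)`: XLA compiles one program per distinct input shape, and the generation loops below grow the input by one token per step, so every step can trigger a fresh compile. A common mitigation (a sketch of the general technique, not what this commit does) is to pad to a fixed length so XLA compiles once:

```python
import tensorflow as tf

MAX_LEN = 512

def pad_for_xla(token_ids):
    # Fixed (1, MAX_LEN) shape -> one XLA compilation instead of one per length;
    # the caller then reads logits at position len(token_ids) - 1.
    ids = tf.constant([token_ids], dtype=tf.int32)
    pad = MAX_LEN - tf.shape(ids)[1]
    return tf.pad(ids, [[0, 0], [0, pad]])
```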
  # ==============================================================================
+ # Background Task Processing
  # ==============================================================================
+ task_queue = queue.Queue()
+ active_tasks: Dict[str, Dict] = {}
+ task_lock = threading.Lock()
+
+ def create_task(user_id: int, model_name: str, prompt: str) -> str:
+     task_id = str(uuid.uuid4())
+
+     with db_lock:
+         c = db_conn.cursor()
+         c.execute("""INSERT INTO tasks (id, user_id, model_name, prompt, status)
+                      VALUES (?, ?, ?, ?, ?)""",
+                   (task_id, user_id, model_name, prompt, "queued"))
+         db_conn.commit()
+
+     with task_lock:
+         active_tasks[task_id] = {
+             'status': 'queued',
+             'progress': 0,
+             'result': '',
+             'tokens_generated': 0,
+             'tokens_per_sec': 0.0
          }
+     task_queue.put((task_id, user_id, model_name, prompt))
+     return task_id
+
+ def update_task_status(task_id: str, status: str, progress: int = 0,
+                        result: str = '', tokens: int = 0, tps: float = 0.0):
+     with task_lock:
+         if task_id in active_tasks:
+             active_tasks[task_id].update({
+                 'status': status,
+                 'progress': progress,
+                 'result': result,
+                 'tokens_generated': tokens,
+                 'tokens_per_sec': tps
+             })
+
+     with db_lock:
+         c = db_conn.cursor()
+         c.execute("""UPDATE tasks SET status=?, progress=?, result=?,
+                      tokens_generated=?, tokens_per_sec=?
+                      WHERE id=?""",
+                   (status, progress, result, tokens, tps, task_id))
+
+         if status == 'completed':
+             c.execute("UPDATE tasks SET completed_at=? WHERE id=?",
+                       (datetime.now().isoformat(), task_id))
+
+         db_conn.commit()
+
+ def generate_with_samx(prompt: str, task_id: str, max_tokens: int = 512):
+     """SAM-X-1: Reasoning model with <think> tags"""
+     input_ids = [i for i in tokenizer.encode(prompt).ids if i != eos_token_id]
+     generated = input_ids.copy()
+     result = ""

      start_time = time.time()

      for step in range(max_tokens):
+         logits = samx_predict(tf.constant([generated], dtype=tf.int32))
+         next_logits = logits[0, -1, :].numpy()

+         # Temperature sampling
+         next_logits = next_logits / 0.7
+         probs = tf.nn.softmax(next_logits).numpy()
+         next_token = np.random.choice(len(probs), p=probs)

+         if next_token == eos_token_id:
+             break

+         generated.append(int(next_token))

+         # Decode periodically
+         if step % 10 == 0 or step == max_tokens - 1:
+             result = tokenizer.decode(generated[len(input_ids):])
+             elapsed = time.time() - start_time
+             tps = len(generated[len(input_ids):]) / elapsed if elapsed > 0 else 0
+             progress = int((step / max_tokens) * 100)

+             update_task_status(task_id, 'processing', progress, result,
+                                len(generated[len(input_ids):]), tps)
+
+     # Final result
+     result = tokenizer.decode(generated[len(input_ids):])
+     elapsed = time.time() - start_time
+     tps = len(generated[len(input_ids):]) / elapsed if elapsed > 0 else 0
+
+     update_task_status(task_id, 'completed', 100, result,
+                        len(generated[len(input_ids):]), tps)
+
+ def generate_with_samz(prompt: str, task_id: str, max_tokens: int = 512):
+     """SAM-Z-1: Fast model with KV cache"""
+     input_ids = [i for i in tokenizer.encode(prompt).ids if i != eos_token_id]
+     generated = input_ids.copy()
+     result = ""
+     kv_cache = KVCache()
+
+     start_time = time.time()
+
+     for step in range(max_tokens):
+         # Use KV cache for speed
+         if step == 0:
+             current_input = generated
          else:
+             current_input = [generated[-1]]

+         logits = samz_predict(tf.constant([current_input], dtype=tf.int32))
+         next_logits = logits[0, -1, :].numpy()

+         # Fast sampling
+         next_logits = next_logits / 0.8
+         top_k = np.argpartition(next_logits, -40)[-40:]
+         top_k_logits = next_logits[top_k]
+         probs = tf.nn.softmax(top_k_logits).numpy()
+         next_token = top_k[np.random.choice(len(probs), p=probs)]

+         if next_token == eos_token_id:
+             break

+         generated.append(int(next_token))

+         # Decode periodically
+         if step % 15 == 0 or step == max_tokens - 1:
+             result = tokenizer.decode(generated[len(input_ids):])
+             elapsed = time.time() - start_time
+             tps = len(generated[len(input_ids):]) / elapsed if elapsed > 0 else 0
+             progress = int((step / max_tokens) * 100)
+
+             update_task_status(task_id, 'processing', progress, result,
+                                len(generated[len(input_ids):]), tps)

+     # Final result
+     result = tokenizer.decode(generated[len(input_ids):])
      elapsed = time.time() - start_time
+     tps = len(generated[len(input_ids):]) / elapsed if elapsed > 0 else 0
+
+     update_task_status(task_id, 'completed', 100, result,
+                        len(generated[len(input_ids):]), tps)
+ def task_worker():
+     """Background worker thread"""
+     print("🔧 Task worker started")
+
+     while True:
+         try:
+             task_id, user_id, model_name, prompt = task_queue.get(timeout=1)
+
+             print(f"⚙️ Processing task {task_id[:8]}... ({model_name})")
+
+             update_task_status(task_id, 'processing', 0)
+
+             try:
+                 if 'SAM-X' in model_name or 'Large' in model_name:
+                     generate_with_samx(prompt, task_id)
+                 else:
+                     generate_with_samz(prompt, task_id)
+
+                 print(f"✅ Task {task_id[:8]} completed")
+             except Exception as e:
+                 print(f"❌ Task {task_id[:8]} failed: {e}")
+                 update_task_status(task_id, 'failed', 0, f"Error: {str(e)}")
+
+             task_queue.task_done()
+
+         except queue.Empty:
+             continue

+ # Start worker threads (2 workers for parallel processing)
+ for _ in range(2):
+     worker = threading.Thread(target=task_worker, daemon=True)
+     worker.start()
+ # ==============================================================================
+ # User Management
+ # ==============================================================================
+ def hash_password(password: str) -> str:
+     return hashlib.sha256(password.encode()).hexdigest()

+ def create_user(username: str, password: str):
+     with db_lock:
+         try:
+             c = db_conn.cursor()
+             c.execute("INSERT INTO users (username, password_hash) VALUES (?, ?)",
+                       (username, hash_password(password)))
+             db_conn.commit()
+             return True, "Account created!"
+         except sqlite3.IntegrityError:
+             return False, "Username exists!"
+
+ def authenticate(username: str, password: str):
+     with db_lock:
+         c = db_conn.cursor()
+         c.execute("SELECT id, password_hash FROM users WHERE username=?", (username,))
+         result = c.fetchone()
+
+     if result and result[1] == hash_password(password):
+         return True, result[0]
+     return False, None
+
+ def get_user_tasks(user_id: int):
+     with db_lock:
+         c = db_conn.cursor()
+         c.execute("""SELECT id, model_name, prompt, status, progress,
+                      tokens_generated, tokens_per_sec, created_at
+                      FROM tasks WHERE user_id=?
+                      ORDER BY created_at DESC LIMIT 50""",
+                   (user_id,))
+         return c.fetchall()
+
+ def get_user_active_tasks(user_id: int):
+     with db_lock:
+         c = db_conn.cursor()
+         c.execute("""SELECT COUNT(*) FROM tasks
+                      WHERE user_id=? AND status IN ('queued', 'processing')""",
+                   (user_id,))
+         return c.fetchone()[0]
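Unsalted SHA-256 is weak for passwords: equal passwords hash identically, and the function is cheap to brute-force. (The hardcoded admin/admin123 account created in `init_db` deserves the same scrutiny.) The standard library offers a stronger drop-in; a sketch of what `hash_password`/`authenticate` could use instead (hypothetical helpers, not in this commit):

```python
import hashlib
import os
from typing import Optional

def hash_password_pbkdf2(password: str, salt: Optional[bytes] = None) -> str:
    salt = salt or os.urandom(16)                 # per-user random salt
    digest = hashlib.pbkdf2_hmac("sha256", password.encode(), salt, 600_000)
    return salt.hex() + ":" + digest.hex()

def verify_password(password: str, stored: str) -> bool:
    salt_hex, digest_hex = stored.split(":")
    candidate = hashlib.pbkdf2_hmac("sha256", password.encode(),
                                    bytes.fromhex(salt_hex), 600_000)
    return candidate.hex() == digest_hex
```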
+ # ==============================================================================
+ # Gradio UI
+ # ==============================================================================
+ css = """
+ .container { max-width: 1400px; margin: 0 auto; }
+ .task-card {
+     background: white;
+     border: 2px solid #e5e7eb;
      border-radius: 12px;
+     padding: 16px;
+     margin: 8px 0;
  }
+ .status-queued { color: #f59e0b; }
+ .status-processing { color: #3b82f6; }
+ .status-completed { color: #10b981; }
+ .status-failed { color: #ef4444; }
+ .progress-bar {
+     height: 8px;
+     background: #e5e7eb;
+     border-radius: 4px;
+     overflow: hidden;
+     margin: 8px 0;
  }
+ .progress-fill {
+     height: 100%;
+     background: linear-gradient(90deg, #10b981, #059669);
+     transition: width 0.3s;
  }
  """
+ with gr.Blocks(css=css, title="SAM Background Processor") as demo:
+     user_id_state = gr.State(None)
+
+     gr.Markdown("# 🚀 SAM Multi-Task Processor")
+     gr.Markdown("Submit up to 5 background tasks. No need to stay on page!")
+
+     # Auth
+     with gr.Group(visible=True) as auth_group:
+         gr.Markdown("### 🔐 Sign In / Sign Up")
+         auth_username = gr.Textbox(label="Username", placeholder="username")
+         auth_password = gr.Textbox(label="Password", type="password")
+         auth_btn = gr.Button("Continue", variant="primary")
+         auth_msg = gr.Markdown("")
+
+     # Main UI
+     with gr.Group(visible=False) as main_group:
+         with gr.Row():
+             gr.Markdown("### 🤖 Create Task")
+             user_display = gr.Markdown("")
+
+         with gr.Row():
+             with gr.Column(scale=2):
+                 model_choice = gr.Radio(
+                     choices=["SAM-X-1-Large (Reasoning)", "SAM-Z-1 (Fast)"],
+                     value="SAM-Z-1 (Fast)",
+                     label="Model"
+                 )
+                 prompt_input = gr.Textbox(
+                     label="Prompt",
+                     placeholder="Enter your prompt...",
+                     lines=4
                  )
+                 submit_btn = gr.Button("🚀 Submit Task", variant="primary", size="lg")
+                 task_msg = gr.Markdown("")

+             with gr.Column(scale=1):
+                 gr.Markdown("### ℹ️ Info")
+                 gr.Markdown("""
+                 - **SAM-X-1**: Reasoning model with `<think>` tags
+                 - **SAM-Z-1**: Ultra-fast direct responses
+                 - Max 5 concurrent tasks
+                 - Results saved to database
+                 - Background processing
+                 """)

+         gr.Markdown("---")
+
+         with gr.Row():
+             gr.Markdown("### 📋 Your Tasks")
+             refresh_btn = gr.Button("🔄 Refresh", size="sm")
+
+         tasks_display = gr.HTML("")
+
+         auto_refresh = gr.Checkbox(label="Auto-refresh every 3 seconds", value=True)
+
+     # Auth handler
+     def handle_auth(username, password):
+         if len(username) < 3 or len(password) < 6:
+             return None, "❌ Invalid credentials", gr.update(), gr.update()
+
+         success, user_id = authenticate(username, password)
+
+         if not success:
+             success, msg = create_user(username, password)
+             if success:
+                 success, user_id = authenticate(username, password)
+
+         if success:
+             return (
+                 user_id,
+                 f"✅ Welcome, **{username}**!",
+                 gr.update(visible=False),
+                 gr.update(visible=True)
              )
+
+         return None, "❌ Authentication failed", gr.update(), gr.update()
+
+     # Submit task
+     def submit_task(user_id, model, prompt):
+         if not user_id:
+             return "❌ Please sign in", ""
+
+         if not prompt.strip():
+             return "❌ Prompt required", ""
+
+         active_count = get_user_active_tasks(user_id)
+         if active_count >= 5:
+             return f"❌ Max 5 active tasks (you have {active_count})", ""
+
+         task_id = create_task(user_id, model, prompt)
+         return f"✅ Task submitted! ID: `{task_id[:8]}...`", ""
+
+     # Render tasks
+     def render_tasks(user_id):
+         if not user_id:
+             return ""
+
+         tasks = get_user_tasks(user_id)
+
+         if not tasks:
+             return "<div style='text-align: center; padding: 40px; color: #9ca3af;'>No tasks yet</div>"
+
+         html = ""
+         for task in tasks:
+             task_id, model, prompt, status, progress, tokens, tps, created = task

+             status_class = f"status-{status}"

+             html += f"""
+             <div class="task-card">
+                 <div style="display: flex; justify-content: space-between; margin-bottom: 8px;">
+                     <strong>Task: {task_id[:8]}...</strong>
+                     <span class="{status_class}">●{status.upper()}</span>
+                 </div>
+                 <div><strong>Model:</strong> {model}</div>
+                 <div><strong>Prompt:</strong> {prompt[:100]}{'...' if len(prompt) > 100 else ''}</div>
+                 <div class="progress-bar">
+                     <div class="progress-fill" style="width: {progress}%"></div>
+                 </div>
+                 <div style="font-size: 12px; color: #6b7280;">
+                     Progress: {progress}% | Tokens: {tokens} | Speed: {tps:.1f} tok/s
+                 </div>
+             </div>
+             """
+
+         return html
+
+     # Get task result
+     def get_task_result(user_id, task_id_short):
+         if not user_id or not task_id_short:
+             return "❌ Invalid request"
+
+         with db_lock:
+             c = db_conn.cursor()
+             c.execute("""SELECT result, status FROM tasks
+                          WHERE user_id=? AND id LIKE ?""",
+                       (user_id, f"{task_id_short}%"))
+             result = c.fetchone()

+         if result:
+             if result[1] == 'completed':
+                 return f"### ✅ Result\n\n{result[0]}"
+             elif result[1] == 'failed':
+                 return f"### ❌ Failed\n\n{result[0]}"
+             else:
+                 return f"### ⏳ Status: {result[1]}"
+         return "❌ Task not found"
+     # Event handlers
+     auth_btn.click(
+         handle_auth,
+         [auth_username, auth_password],
+         [user_id_state, auth_msg, auth_group, main_group]
      )

+     submit_btn.click(
+         submit_task,
+         [user_id_state, model_choice, prompt_input],
+         [task_msg, prompt_input]
      ).then(
+         render_tasks,
+         [user_id_state],
+         [tasks_display]
      )

+     refresh_btn.click(
+         render_tasks,
+         [user_id_state],
+         [tasks_display]
      )

+     # Auto-refresh timer
+     def auto_refresh_tasks(user_id, enabled):
+         if enabled and user_id:
+             return render_tasks(user_id)
+         return gr.update()
+
+     # Poll every 3 seconds when auto-refresh enabled
+     demo.load(
+         lambda: None,
+         None,
+         None,
+         every=3
      )

+     # Update user display on load
+     def update_user_display(user_id):
+         if user_id:
+             with db_lock:
+                 c = db_conn.cursor()
+                 c.execute("SELECT username FROM users WHERE id=?", (user_id,))
+                 result = c.fetchone()
+             if result:
+                 active = get_user_active_tasks(user_id)
+                 return f"**User:** {result[0]} | **Active:** {active}/5"
+         return ""
+
+     # Periodic refresh
+     refresh_timer = gr.Timer(3)
+
+     @refresh_timer.tick
+     def timer_refresh(user_id, auto_enabled):
+         if auto_enabled and user_id:
+             return render_tasks(user_id), update_user_display(user_id)
+         return gr.update(), gr.update()
+
+     refresh_timer.tick(
+         timer_refresh,
+         [user_id_state, auto_refresh],
+         [tasks_display, user_display]
+     )
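Two wiring issues worth flagging in the hunk above: `demo.load(lambda: None, None, None, every=3)` polls but updates nothing (and `auto_refresh_tasks` is defined yet never attached), and `timer_refresh` appears to be registered twice, once via the bare `@refresh_timer.tick` decorator and again via the explicit `refresh_timer.tick(...)` call, which at best runs it twice per tick. A single registration with explicit inputs/outputs should suffice (a sketch, assuming Gradio's `gr.Timer` API as used here):

```python
refresh_timer = gr.Timer(3)

def timer_refresh(user_id, auto_enabled):
    if auto_enabled and user_id:
        return render_tasks(user_id), update_user_display(user_id)
    return gr.update(), gr.update()

refresh_timer.tick(
    timer_refresh,
    inputs=[user_id_state, auto_refresh],
    outputs=[tasks_display, user_display],
)
```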
+     # View full result (expandable)
+     with gr.Accordion("🔍 View Task Result", open=False):
+         result_task_id = gr.Textbox(
+             label="Task ID (first 8 chars)",
+             placeholder="e.g., 3f7a9b2c"
+         )
+         view_result_btn = gr.Button("View Result", variant="primary")
+         result_display = gr.Markdown("")

+     view_result_btn.click(
+         get_task_result,
+         [user_id_state, result_task_id],
+         [result_display]
      )

+     # Initial load
+     def on_auth_success(user_id):
+         if user_id:
+             return render_tasks(user_id), update_user_display(user_id)
+         return "", ""
+
+     user_id_state.change(
+         on_auth_success,
+         [user_id_state],
+         [tasks_display, user_display]
      )

  if __name__ == "__main__":
+     print("\n" + "="*80)
+     print("🚀 SAM BACKGROUND PROCESSOR".center(80))
+     print("="*80)
+     print(f"✅ 2 worker threads active")
+     print(f"✅ Max 5 tasks per user")
+     print(f"✅ Background processing enabled")
+     print(f"✅ Database: sam_tasks.db")
+     print("="*80 + "\n")
+
+     demo.queue(max_size=50)
      demo.launch(
          server_name="0.0.0.0",
          server_port=7860,