Keeby-smilyai committed on
Commit 7f368dd · verified · 1 Parent(s): 4e366fc

Update app.py

Files changed (1)
  1. app.py +384 -739
app.py CHANGED
@@ -11,26 +11,371 @@ import json
11
  from abc import ABC, abstractmethod
12
  import time
13
  import threading
 
 
 
 
14
 
15
  # ==============================================================================
16
  # Performance Optimizations for CPU
17
  # ==============================================================================
18
- # Set TensorFlow to use fewer threads (better for 2vCPU)
19
  tf.config.threading.set_inter_op_parallelism_threads(1)
20
  tf.config.threading.set_intra_op_parallelism_threads(2)
21
-
22
- # Enable XLA compilation for faster execution
23
  tf.config.optimizer.set_jit(True)
24
-
25
- # Disable eager execution for better performance
26
  tf.config.run_functions_eagerly(False)
27
-
28
- # Memory optimization
29
  os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'
30
  os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
31

32
  # ==============================================================================
33
- # Model Architecture (Must match training code)

34
  # ==============================================================================
35
  @keras.saving.register_keras_serializable()
36
  class RotaryEmbedding(keras.layers.Layer):
@@ -47,7 +392,6 @@ class RotaryEmbedding(keras.layers.Layer):
47
  t = tf.range(self.max_len, dtype=tf.float32)
48
  freqs = tf.einsum("i,j->ij", t, inv_freq)
49
  emb = tf.concat([freqs, freqs], axis=-1)
50
-
51
  self.cos_cached = tf.constant(tf.cos(emb), dtype=tf.float32)
52
  self.sin_cached = tf.constant(tf.sin(emb), dtype=tf.float32)
53
  self.built_cache = True
@@ -62,10 +406,8 @@ class RotaryEmbedding(keras.layers.Layer):
62
  dtype = q.dtype
63
  cos = tf.cast(self.cos_cached[:seq_len, :], dtype)[None, None, :, :]
64
  sin = tf.cast(self.sin_cached[:seq_len, :], dtype)[None, None, :, :]
65
-
66
  q_rotated = (q * cos) + (self.rotate_half(q) * sin)
67
  k_rotated = (k * cos) + (self.rotate_half(k) * sin)
68
-
69
  return q_rotated, k_rotated
70
 
71
  def get_config(self):
@@ -73,7 +415,6 @@ class RotaryEmbedding(keras.layers.Layer):
73
  config.update({"dim": self.dim, "max_len": self.max_len, "theta": self.theta})
74
  return config
75
 
76
-
77
  @keras.saving.register_keras_serializable()
78
  class RMSNorm(keras.layers.Layer):
79
  def __init__(self, epsilon=1e-5, **kwargs):
@@ -92,7 +433,6 @@ class RMSNorm(keras.layers.Layer):
92
  config.update({"epsilon": self.epsilon})
93
  return config
94
 
95
-
96
  @keras.saving.register_keras_serializable()
97
  class TransformerBlock(keras.layers.Layer):
98
  def __init__(self, d_model, n_heads, ff_dim, dropout, max_len, rope_theta, layer_idx=0, **kwargs):
@@ -105,68 +445,47 @@ class TransformerBlock(keras.layers.Layer):
105
  self.rope_theta = rope_theta
106
  self.head_dim = d_model // n_heads
107
  self.layer_idx = layer_idx
108
-
109
  self.pre_attn_norm = RMSNorm()
110
  self.pre_ffn_norm = RMSNorm()
111
-
112
  self.q_proj = keras.layers.Dense(d_model, use_bias=False, name="q_proj")
113
  self.k_proj = keras.layers.Dense(d_model, use_bias=False, name="k_proj")
114
  self.v_proj = keras.layers.Dense(d_model, use_bias=False, name="v_proj")
115
  self.out_proj = keras.layers.Dense(d_model, use_bias=False, name="o_proj")
116
-
117
  self.rope = RotaryEmbedding(self.head_dim, max_len=max_len, theta=rope_theta)
118
-
119
  self.gate_proj = keras.layers.Dense(ff_dim, use_bias=False, name="gate_proj")
120
  self.up_proj = keras.layers.Dense(ff_dim, use_bias=False, name="up_proj")
121
  self.down_proj = keras.layers.Dense(d_model, use_bias=False, name="down_proj")
122
-
123
  self.dropout = keras.layers.Dropout(dropout)
124
 
125
  def call(self, x, training=None):
126
  B, T, D = tf.shape(x)[0], tf.shape(x)[1], self.d_model
127
  dtype = x.dtype
128
-
129
  res = x
130
  y = self.pre_attn_norm(x)
131
-
132
  q = tf.transpose(tf.reshape(self.q_proj(y), [B, T, self.n_heads, self.head_dim]), [0, 2, 1, 3])
133
  k = tf.transpose(tf.reshape(self.k_proj(y), [B, T, self.n_heads, self.head_dim]), [0, 2, 1, 3])
134
  v = tf.transpose(tf.reshape(self.v_proj(y), [B, T, self.n_heads, self.head_dim]), [0, 2, 1, 3])
135
-
136
  q, k = self.rope(q, k)
137
-
138
  scores = tf.matmul(q, k, transpose_b=True) / tf.sqrt(tf.cast(self.head_dim, dtype))
139
-
140
- mask = tf.where(
141
- tf.linalg.band_part(tf.ones([T, T], dtype=dtype), -1, 0) == 0,
142
- tf.constant(-1e9, dtype=dtype),
143
- tf.constant(0.0, dtype=dtype)
144
- )
145
  scores += mask
146
  attn = tf.matmul(tf.nn.softmax(scores, axis=-1), v)
147
-
148
  attn = tf.reshape(tf.transpose(attn, [0, 2, 1, 3]), [B, T, D])
149
  x = res + self.dropout(self.out_proj(attn), training=training)
150
-
151
  res = x
152
  y = self.pre_ffn_norm(x)
153
  ffn = self.down_proj(keras.activations.silu(self.gate_proj(y)) * self.up_proj(y))
154
-
155
  return res + self.dropout(ffn, training=training)
156
 
157
  def get_config(self):
158
  config = super().get_config()
159
- config.update({
160
- "d_model": self.d_model,
161
- "n_heads": self.n_heads,
162
- "ff_dim": self.ff_dim,
163
- "dropout": self.dropout_rate,
164
- "max_len": self.max_len,
165
- "rope_theta": self.rope_theta,
166
- "layer_idx": self.layer_idx
167
- })
168
- return config
169

170
 
171
  @keras.saving.register_keras_serializable()
172
  class SAM1Model(keras.Model):
@@ -178,33 +497,20 @@ class SAM1Model(keras.Model):
178
  self.cfg = kwargs
179
  else:
180
  self.cfg = kwargs.get('cfg', kwargs)
181
-
182
  self.embed = keras.layers.Embedding(self.cfg['vocab_size'], self.cfg['d_model'], name="embed_tokens")
183
-
184
  ff_dim = int(self.cfg['d_model'] * self.cfg['ff_mult'])
185
- block_args = {
186
- 'd_model': self.cfg['d_model'],
187
- 'n_heads': self.cfg['n_heads'],
188
- 'ff_dim': ff_dim,
189
- 'dropout': self.cfg['dropout'],
190
- 'max_len': self.cfg['max_len'],
191
- 'rope_theta': self.cfg['rope_theta']
192
- }
193
-
194
  self.blocks = []
195
  for i in range(self.cfg['n_layers']):
196
  block = TransformerBlock(name=f"block_{i}", layer_idx=i, **block_args)
197
  self.blocks.append(block)
198
-
199
  self.norm = RMSNorm(name="final_norm")
200
  self.lm_head = keras.layers.Dense(self.cfg['vocab_size'], use_bias=False, name="lm_head")
201
 
202
  def call(self, input_ids, training=None):
203
  x = self.embed(input_ids)
204
-
205
  for block in self.blocks:
206
  x = block(x, training=training)
207
-
208
  return self.lm_head(self.norm(x))
209
 
210
  def get_config(self):
@@ -212,25 +518,16 @@ class SAM1Model(keras.Model):
212
  base_config['config'] = self.cfg
213
  return base_config
214
 
215
-
216
- # ==============================================================================
217
- # Helper Functions
218
- # ==============================================================================
219
  def count_parameters(model):
220
- """Count total and non-zero parameters in model."""
221
  total_params = 0
222
  non_zero_params = 0
223
-
224
  for weight in model.weights:
225
  w = weight.numpy()
226
  total_params += w.size
227
  non_zero_params += np.count_nonzero(w)
228
-
229
  return total_params, non_zero_params
230
 
231
-
232
  def format_param_count(count):
233
- """Format parameter count in human readable format."""
234
  if count >= 1e9:
235
  return f"{count/1e9:.2f}B"
236
  elif count >= 1e6:
@@ -240,53 +537,34 @@ def format_param_count(count):
240
  else:
241
  return str(count)
242
 
243
-
244
- # ==============================================================================
245
- # Model Backend Interface
246
- # ==============================================================================
247
  class ModelBackend(ABC):
248
  @abstractmethod
249
  def predict(self, input_ids):
250
  pass
251
-
252
  @abstractmethod
253
  def get_name(self):
254
  pass
255
-
256
  @abstractmethod
257
  def get_info(self):
258
  pass
259
 
260
-
261
  class KerasBackend(ModelBackend):
262
  def __init__(self, model, name, display_name):
263
  self.model = model
264
  self.name = name
265
  self.display_name = display_name
266
-
267
- # Pre-compile predict function for faster inference
268
- @tf.function(
269
- input_signature=[tf.TensorSpec(shape=[1, None], dtype=tf.int32)],
270
- jit_compile=True
271
- )
272
  def fast_predict(inputs):
273
  return model(inputs, training=False)
274
-
275
  self.fast_predict = fast_predict
276
-
277
- # Warm up compilation with dummy input
278
  print(f" 🔥 Warming up {display_name}...")
279
  dummy = tf.constant([[1, 2, 3]], dtype=tf.int32)
280
  _ = self.fast_predict(dummy)
281
  print(f" ✅ Compilation complete!")
282
-
283
- # Count parameters
284
  total, non_zero = count_parameters(model)
285
  self.total_params = total
286
  self.non_zero_params = non_zero
287
  self.sparsity = (1 - non_zero / total) * 100 if total > 0 else 0
288
-
289
- # Calculate actual model config for speed estimation
290
  self.n_heads = model.cfg.get('n_heads', 0)
291
  self.ff_dim = int(model.cfg.get('d_model', 0) * model.cfg.get('ff_mult', 0))
292
 
@@ -307,10 +585,6 @@ class KerasBackend(ModelBackend):
307
  info += f" Sparsity: {self.sparsity:.1f}%\n"
308
  return info
309
 
310
-
311
- # ==============================================================================
312
- # EASY MODEL REGISTRY - ADD YOUR MODELS HERE!
313
- # ==============================================================================
314
  MODEL_REGISTRY = [
315
  ("SAM-X-1-Large", "Smilyai-labs/Sam-1x-instruct", "ckpt.weights.h5", None),
316
  ("SAM-X-1-Fast ⚡ (BETA)", "Smilyai-labs/Sam-X-1-fast", "sam1_fast.weights.h5", "sam1_fast_config.json"),
@@ -318,22 +592,9 @@ MODEL_REGISTRY = [
318
  ("SAM-X-1-Nano ⚡⚡", "Smilyai-labs/Sam-X-1-Nano", "sam1_nano_finetuned.weights.h5", "sam1_nano_finetuned_config.json"),
319
  ]
320
 
321
- # Model complexity scores for auto-selection (higher = more capable)
322
- MODEL_COMPLEXITY = {
323
- "SAM-X-1-Nano ⚡⚡": 1,
324
- "SAM-X-1-Mini 🚀 (ADVANCED!)": 2,
325
- "SAM-X-1-Fast ⚡ (BETA)": 3,
326
- "SAM-X-1-Large": 4
327
- }
328
-
329
  def estimate_prompt_complexity(prompt):
330
- """Estimate prompt complexity to choose appropriate model."""
331
  prompt_lower = prompt.lower()
332
-
333
- # Count complexity indicators
334
  complexity_score = 0
335
-
336
- # Length-based complexity
337
  word_count = len(prompt.split())
338
  if word_count > 100:
339
  complexity_score += 3
@@ -341,52 +602,28 @@ def estimate_prompt_complexity(prompt):
341
  complexity_score += 2
342
  elif word_count > 20:
343
  complexity_score += 1
344
-
345
- # Hard reasoning keywords (need Large/Fast)
346
- hard_keywords = [
347
- 'analyze', 'explain', 'compare', 'evaluate', 'prove', 'derive',
348
- 'calculate', 'solve', 'reason', 'why', 'how does', 'complex',
349
- 'algorithm', 'mathematics', 'philosophy', 'theory', 'logic',
350
- 'detailed', 'comprehensive', 'thorough', 'in-depth'
351
- ]
352
  for keyword in hard_keywords:
353
  if keyword in prompt_lower:
354
  complexity_score += 2
355
-
356
- # Medium complexity keywords (need Mini/Fast)
357
- medium_keywords = [
358
- 'write', 'create', 'generate', 'summarize', 'describe',
359
- 'list', 'what is', 'tell me', 'explain briefly'
360
- ]
361
  for keyword in medium_keywords:
362
  if keyword in prompt_lower:
363
  complexity_score += 1
364
-
365
- # Code-related (usually complex)
366
  if any(word in prompt_lower for word in ['code', 'function', 'program', 'debug', 'implement']):
367
  complexity_score += 2
368
-
369
- # Multi-step or multi-part questions
370
  if any(word in prompt_lower for word in ['first', 'then', 'next', 'finally', 'step']):
371
  complexity_score += 1
372
-
373
- # Questions with multiple parts
374
  question_marks = prompt.count('?')
375
  if question_marks > 1:
376
  complexity_score += 1
377
-
378
  return complexity_score
379
 
380
- def select_model_auto(prompt, available_models):
381
- """Automatically select best model based on prompt complexity."""
382
  complexity = estimate_prompt_complexity(prompt)
383
-
384
- # Map complexity to model choice
385
- # 0-2: Simple questions -> Nano (fastest)
386
- # 3-5: Medium questions -> Mini (balanced)
387
- # 6-8: Complex questions -> Fast (capable)
388
- # 9+: Very complex -> Large (most capable)
389
-
390
  if complexity <= 2:
391
  preferred = "SAM-X-1-Nano ⚡⚡"
392
  fallback_order = ["SAM-X-1-Mini 🚀 (ADVANCED!)", "SAM-X-1-Fast ⚡ (BETA)", "SAM-X-1-Large"]
@@ -399,208 +636,114 @@ def select_model_auto(prompt, available_models):
399
  else:
400
  preferred = "SAM-X-1-Large"
401
  fallback_order = ["SAM-X-1-Fast ⚡ (BETA)", "SAM-X-1-Mini 🚀 (ADVANCED!)", "SAM-X-1-Nano ⚡⚡"]
402
-
403
- # Try preferred model first
404
- if preferred in available_models:
405
- print(f" 🎯 Auto-selected {preferred} (complexity: {complexity})")
406
- return available_models[preferred]
407
-
408
- # Fallback to next best available
409
  for model_name in fallback_order:
410
- if model_name in available_models:
411
- print(f" 🎯 Auto-selected {model_name} (fallback, complexity: {complexity})")
412
- return available_models[model_name]
413
-
414
- # Last resort: return any available model
415
- return list(available_models.values())[0]
416
 
417
- # ==============================================================================
418
- # Load Models
419
- # ==============================================================================
420
  CONFIG_TOKENIZER_REPO_ID = "Smilyai-labs/Sam-1-large-it-0002"
421
-
422
  print("="*80)
423
  print("🤖 SAM-X-1 Multi-Model Chat Interface".center(80))
424
  print("="*80)
425
-
426
- # Download config and tokenizer
427
  print(f"\n📦 Downloading config/tokenizer from: {CONFIG_TOKENIZER_REPO_ID}")
428
  config_path = hf_hub_download(repo_id=CONFIG_TOKENIZER_REPO_ID, filename="config.json")
429
  tokenizer_path = hf_hub_download(repo_id=CONFIG_TOKENIZER_REPO_ID, filename="tokenizer.json")
430
-
431
- # Load config
432
  with open(config_path, 'r') as f:
433
  base_config = json.load(f)
434
-
435
  print(f"✅ Base config loaded")
436
-
437
- # Build base model config
438
- base_model_config = {
439
- 'vocab_size': base_config['vocab_size'],
440
- 'd_model': base_config['hidden_size'],
441
- 'n_heads': base_config['num_attention_heads'],
442
- 'ff_mult': base_config['intermediate_size'] / base_config['hidden_size'],
443
- 'dropout': base_config.get('dropout', 0.0),
444
- 'max_len': base_config['max_position_embeddings'],
445
- 'rope_theta': base_config['rope_theta'],
446
- 'n_layers': base_config['num_hidden_layers']
447
- }
448
-
449
- # ==============================================================================
450
- # FIX: Proper EOS token handling
451
- # ==============================================================================
452
  print("\n🔤 Recreating tokenizer...")
453
  tokenizer = Tokenizer.from_pretrained("gpt2")
454
-
455
- # GPT-2's actual EOS token is "<|endoftext|>"
456
  eos_token = "<|endoftext|>"
457
  eos_token_id = tokenizer.token_to_id(eos_token)
458
-
459
  if eos_token_id is None:
460
- # Fallback to adding it
461
  tokenizer.add_special_tokens([eos_token])
462
  eos_token_id = tokenizer.token_to_id(eos_token)
463
-
464
- # Add custom tokens
465
  custom_tokens = ["<think>", "<think/>"]
466
  for token in custom_tokens:
467
  if tokenizer.token_to_id(token) is None:
468
  tokenizer.add_special_tokens([token])
469
-
470
  tokenizer.no_padding()
471
  tokenizer.enable_truncation(max_length=base_config['max_position_embeddings'])
472
-
473
  print(f"✅ Tokenizer ready (vocab size: {tokenizer.get_vocab_size()})")
474
  print(f" EOS token: '{eos_token}' (ID: {eos_token_id})")
475
-
476
- # Verify EOS token is valid
477
  if eos_token_id is None:
478
- raise ValueError("❌ Failed to set EOS token ID! Check tokenizer setup.")
479
-
480
- # Load all models from registry
481
  print("\n" + "="*80)
482
  print("📦 LOADING MODELS".center(80))
483
  print("="*80)
484
-
485
  available_models = {}
486
  dummy_input = tf.zeros((1, 1), dtype=tf.int32)
487
-
488
  for display_name, repo_id, weights_filename, config_filename in MODEL_REGISTRY:
489
  try:
490
  print(f"\n⏳ Loading: {display_name}")
491
  print(f" Repo: {repo_id}")
492
  print(f" Weights: {weights_filename}")
493
-
494
- # Download weights
495
  weights_path = hf_hub_download(repo_id=repo_id, filename=weights_filename)
496
-
497
- # Load custom config if specified (for pruned models)
498
  if config_filename:
499
  print(f" Config: {config_filename}")
500
  custom_config_path = hf_hub_download(repo_id=repo_id, filename=config_filename)
501
  with open(custom_config_path, 'r') as f:
502
  model_config = json.load(f)
503
- print(f" 📐 Custom architecture: {model_config['n_heads']} heads, {int(model_config['d_model'] * model_config['ff_mult'])} FFN dim")
504
  else:
505
  model_config = base_model_config.copy()
506
-
507
- # Create model with appropriate config
508
  model = SAM1Model(**model_config)
509
  model(dummy_input)
510
  model.load_weights(weights_path)
511
  model.trainable = False
512
-
513
- # Create backend
514
  backend = KerasBackend(model, display_name, display_name)
515
  available_models[display_name] = backend
516
-
517
- # Print stats
518
  print(f" ✅ Loaded successfully!")
519
  print(f" 📊 Parameters: {format_param_count(backend.total_params)}")
520
- print(f" 📊 Attention heads: {backend.n_heads}")
521
- print(f" 📊 FFN dimension: {backend.ff_dim}")
522
-
523
  except Exception as e:
524
  print(f" ⚠️ Failed to load: {e}")
525
- print(f" Skipping {display_name}...")
526
-
527
  if not available_models:
528
- raise RuntimeError("❌ No models loaded! Check your MODEL_REGISTRY configuration.")
529
-
530
  print(f"\n✅ Successfully loaded {len(available_models)} model(s)")
531
- print(f" Device: {'GPU' if len(tf.config.list_physical_devices('GPU')) > 0 else 'CPU'}")
532
-
533
  current_backend = list(available_models.values())[0]
534
-
535
- # Global stop flag
536
  stop_generation = threading.Event()
537
 
538
-
539
- # ==============================================================================
540
- # FIX: Improved generation function with better stop handling
541
- # ==============================================================================
542
  def generate_response_stream(prompt, temperature=0.7, backend=None, max_tokens=256):
543
- """Generate response and yield tokens one by one for streaming."""
544
  global stop_generation
545
  stop_generation.clear()
546
-
547
  if backend is None:
548
  backend = current_backend
549
-
550
- # Encode prompt
551
  encoded_prompt = tokenizer.encode(prompt)
552
  input_ids = [i for i in encoded_prompt.ids if i != eos_token_id]
553
  generated = input_ids.copy()
554
-
555
  current_text = ""
556
  in_thinking = False
557
-
558
- # Get max_len from the backend's model config
559
  max_len = backend.model.cfg['max_len']
560
-
561
- # Track timing
562
  start_time = time.time()
563
  tokens_generated = 0
564
-
565
- # *** DYNAMIC DECODE BATCHING: Adjust based on generation speed ***
566
  decode_buffer = []
567
- decode_every = 2 # Start conservative
568
  last_speed_check = start_time
569
-
570
- # Generate tokens
571
  for step in range(max_tokens):
572
- # *** FIX: Check stop flag FIRST before any processing ***
573
  if stop_generation.is_set():
574
- print(f" 🛑 Stop requested at token {tokens_generated}")
575
- # Calculate final speed
576
  elapsed = time.time() - start_time
577
  final_speed = tokens_generated / elapsed if elapsed > 0 else 0
578
- yield "", False, -1, final_speed, True # Added stopped flag
579
  return
580
-
581
  current_input = generated[-max_len:]
582
-
583
- # Get logits from selected backend
584
  next_token_logits = backend.predict(current_input)
585
-
586
- # *** DYNAMIC BATCHING: Adjust decode_every based on speed ***
587
- # Check speed every 10 tokens after warmup
588
  if tokens_generated > 5 and tokens_generated % 10 == 0:
589
  current_time = time.time()
590
  elapsed_since_check = current_time - last_speed_check
591
  if elapsed_since_check > 0:
592
  recent_speed = 10 / elapsed_since_check
593
- # Adaptive batching: faster models can batch more
594
  if recent_speed > 25:
595
- decode_every = 8 # Very fast (Nano)
596
  elif recent_speed > 15:
597
- decode_every = 5 # Fast (Mini)
598
  elif recent_speed > 8:
599
- decode_every = 3 # Medium (Fast)
600
  else:
601
- decode_every = 2 # Slow (Large)
602
  last_speed_check = current_time
603
-
604
  if temperature > 0:
605
  next_token_logits = next_token_logits / temperature
606
  top_k = 5
@@ -612,524 +755,26 @@ def generate_response_stream(prompt, temperature=0.7, backend=None, max_tokens=2
612
  next_token = top_k_indices[np.random.choice(top_k, p=probs)]
613
  else:
614
  next_token = np.argmax(next_token_logits)
615
-
616
- # *** FIX: Check for EOS token IMMEDIATELY and break ***
617
  if next_token == eos_token_id:
618
- print(f" 🛑 EOS token detected at position {tokens_generated}")
619
  break
620
-
621
  generated.append(int(next_token))
622
  decode_buffer.append(int(next_token))
623
  tokens_generated += 1
624
-
625
- # Decode in batches for better performance
626
- should_decode = (len(decode_buffer) >= decode_every or
627
- step == max_tokens - 1)
628
-
629
  if should_decode:
630
  new_text = tokenizer.decode(generated[len(input_ids):])
631
  if len(new_text) > len(current_text):
632
  new_chunk = new_text[len(current_text):]
633
  current_text = new_text
634
-
635
  if "<think>" in new_chunk:
636
  in_thinking = True
637
  elif "</think>" in new_chunk or "<think/>" in new_chunk:
638
  in_thinking = False
639
-
640
- # Calculate tokens/sec
641
  elapsed = time.time() - start_time
642
  tokens_per_sec = tokens_generated / elapsed if elapsed > 0 else 0
643
-
644
  yield new_chunk, in_thinking, tokens_per_sec, tokens_per_sec, False
645
  decode_buffer = []
646
-
647
- # Final stats
648
  elapsed = time.time() - start_time
649
  final_tokens_per_sec = tokens_generated / elapsed if elapsed > 0 else 0
650
  yield "", False, final_tokens_per_sec, final_tokens_per_sec, False
651
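The temperature / top-k step inside the loop above, pulled out as a standalone helper for clarity; a sketch assuming next_token_logits is the 1-D vocab-sized array returned by backend.predict (sample_top_k is a hypothetical name, not defined in app.py):

import numpy as np

def sample_top_k(next_token_logits, temperature=0.7, top_k=5):
    # Greedy decoding when temperature is 0; otherwise sample from the renormalised top-k softmax.
    if temperature <= 0:
        return int(np.argmax(next_token_logits))
    logits = next_token_logits / temperature
    top_k_indices = np.argpartition(logits, -top_k)[-top_k:]   # indices of the k largest logits
    top_logits = logits[top_k_indices]
    probs = np.exp(top_logits - np.max(top_logits))
    probs = probs / probs.sum()
    return int(top_k_indices[np.random.choice(top_k, p=probs)])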
 
652
-
653
- # ==============================================================================
654
- # Gradio Interface
655
- # ==============================================================================
656
- if __name__ == "__main__":
657
- import gradio as gr
658
-
659
- custom_css = """
660
- .chat-container {
661
- height: 600px;
662
- overflow-y: auto;
663
- padding: 20px;
664
- background: #ffffff;
665
- }
666
-
667
- .user-message {
668
- background: #f7f7f8;
669
- padding: 16px;
670
- margin: 12px 0;
671
- border-radius: 8px;
672
- }
673
-
674
- .assistant-message {
675
- background: #ffffff;
676
- padding: 16px;
677
- margin: 12px 0;
678
- border-radius: 8px;
679
- border-left: 3px solid #10a37f;
680
- }
681
-
682
- .message-content {
683
- color: #353740;
684
- line-height: 1.6;
685
- font-size: 15px;
686
- }
687
-
688
- .message-header {
689
- font-weight: 600;
690
- margin-bottom: 8px;
691
- color: #353740;
692
- font-size: 14px;
693
- }
694
-
695
- .thinking-content {
696
- color: #6b7280;
697
- font-style: italic;
698
- border-left: 3px solid #d1d5db;
699
- padding-left: 12px;
700
- margin: 8px 0;
701
- background: #f9fafb;
702
- padding: 8px 12px;
703
- border-radius: 4px;
704
- }
705
-
706
- .input-row {
707
- background: #ffffff;
708
- padding: 12px;
709
- border-radius: 8px;
710
- margin-top: 12px;
711
- border: 1px solid #e5e7eb;
712
- }
713
-
714
- .gradio-container {
715
- max-width: 900px !important;
716
- margin: auto !important;
717
- }
718
-
719
- .announcement-banner {
720
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
721
- color: white;
722
- padding: 20px 28px;
723
- border-radius: 12px;
724
- margin-bottom: 20px;
725
- box-shadow: 0 4px 6px rgba(0,0,0,0.1);
726
- text-align: center;
727
- font-size: 16px;
728
- font-weight: 500;
729
- animation: slideIn 0.5s ease-out;
730
- line-height: 1.6;
731
- }
732
-
733
- @keyframes slideIn {
734
- from {
735
- opacity: 0;
736
- transform: translateY(-20px);
737
- }
738
- to {
739
- opacity: 1;
740
- transform: translateY(0);
741
- }
742
- }
743
-
744
- .announcement-banner strong {
745
- font-weight: 700;
746
- font-size: 18px;
747
- }
748
-
749
- .settings-panel {
750
- background: #f9fafb;
751
- padding: 16px;
752
- border-radius: 8px;
753
- margin-bottom: 12px;
754
- border: 1px solid #e5e7eb;
755
- }
756
-
757
- .model-info {
758
- background: #f0f9ff;
759
- border: 1px solid #bae6fd;
760
- padding: 12px;
761
- border-radius: 8px;
762
- margin-top: 8px;
763
- font-size: 13px;
764
- font-family: monospace;
765
- white-space: pre-line;
766
- }
767
-
768
- .speed-indicator {
769
- background: #dcfce7;
770
- border: 1px solid #86efac;
771
- padding: 8px 12px;
772
- border-radius: 6px;
773
- margin-top: 8px;
774
- font-size: 14px;
775
- font-weight: 600;
776
- color: #166534;
777
- text-align: center;
778
- }
779
-
780
- /* Circular Send Button */
781
- .send-btn-wrapper {
782
- display: flex;
783
- gap: 8px;
784
- align-items: center;
785
- }
786
-
787
- .circular-btn {
788
- width: 48px !important;
789
- height: 48px !important;
790
- min-width: 48px !important;
791
- border-radius: 50% !important;
792
- padding: 0 !important;
793
- display: flex !important;
794
- align-items: center !important;
795
- justify-content: center !important;
796
- font-size: 20px !important;
797
- box-shadow: 0 2px 8px rgba(0,0,0,0.15) !important;
798
- transition: all 0.2s ease !important;
799
- }
800
-
801
- .circular-btn:hover:not(:disabled) {
802
- transform: scale(1.05) !important;
803
- box-shadow: 0 4px 12px rgba(0,0,0,0.2) !important;
804
- }
805
-
806
- .circular-btn:active:not(:disabled) {
807
- transform: scale(0.95) !important;
808
- }
809
-
810
- .send-btn {
811
- background: linear-gradient(135deg, #10a37f 0%, #0d8c6c 100%) !important;
812
- border: none !important;
813
- }
814
-
815
- .stop-btn {
816
- background: linear-gradient(135deg, #ef4444 0%, #dc2626 100%) !important;
817
- border: none !important;
818
- }
819
-
820
- .circular-btn:disabled {
821
- opacity: 0.4 !important;
822
- cursor: not-allowed !important;
823
- transform: none !important;
824
- }
825
- """
826
-
827
- def format_message_html(role, content, show_thinking=True, show_raw=False):
828
- """Format a single message as HTML."""
829
- role_class = "user-message" if role == "user" else "assistant-message"
830
- role_name = "You" if role == "user" else "SAM-X-1"
831
-
832
- thinking = ""
833
- answer = ""
834
-
835
- if "<think>" in content:
836
- parts = content.split("<think>", 1)
837
- before_think = parts[0].strip()
838
-
839
- if len(parts) > 1:
840
- after_think = parts[1]
841
-
842
- if "</think>" in after_think:
843
- think_parts = after_think.split("</think>", 1)
844
- thinking = think_parts[0].strip()
845
- answer = (before_think + " " + think_parts[1]).strip()
846
- elif "<think/>" in after_think:
847
- think_parts = after_think.split("<think/>", 1)
848
- thinking = think_parts[0].strip()
849
- answer = (before_think + " " + think_parts[1]).strip()
850
- else:
851
- thinking = after_think.strip()
852
- answer = before_think
853
- else:
854
- answer = before_think
855
- else:
856
- answer = content
857
-
858
- html = f'<div class="{role_class}">'
859
- html += f'<div class="message-header">{role_name}</div>'
860
- html += f'<div class="message-content">'
861
-
862
- if thinking and show_thinking:
863
- html += f'<div class="thinking-content">💭 {thinking}</div>'
864
-
865
- if answer:
866
- html += f'<div>{answer}</div>'
867
-
868
- # Add raw response debug view
869
- if show_raw and role == "assistant":
870
- # Escape HTML and show special tokens
871
- raw_content = content.replace("<", "&lt;").replace(">", "&gt;")
872
- raw_content = raw_content.replace("&lt;endoftext&gt;", '<span style="background: #fef3c7; color: #92400e; padding: 2px 6px; border-radius: 3px; font-weight: bold;">⚠️ &lt;endoftext&gt;</span>')
873
- raw_content = raw_content.replace("&lt;think&gt;", '<span style="background: #dbeafe; color: #1e40af; padding: 2px 6px; border-radius: 3px;">🤔 &lt;think&gt;</span>')
874
- raw_content = raw_content.replace("&lt;/think&gt;", '<span style="background: #dbeafe; color: #1e40af; padding: 2px 6px; border-radius: 3px;">✅ &lt;/think&gt;</span>')
875
- raw_content = raw_content.replace("&lt;think/&gt;", '<span style="background: #dbeafe; color: #1e40af; padding: 2px 6px; border-radius: 3px;">✅ &lt;think/&gt;</span>')
876
-
877
- html += f'''
878
- <div style="margin-top: 12px; padding: 12px; background: #f9fafb; border: 1px solid #e5e7eb; border-radius: 6px; font-family: monospace; font-size: 12px; color: #374151;">
879
- <div style="font-weight: 600; margin-bottom: 6px; color: #6b7280;">🔍 Raw Response (Debug):</div>
880
- <div style="white-space: pre-wrap; word-break: break-all;">{raw_content}</div>
881
- </div>
882
- '''
883
-
884
- html += '</div></div>'
885
- return html
886
-
887
- def render_history(history, show_thinking, show_raw=False):
888
- """Render chat history as HTML."""
889
- html = ""
890
- for msg in history:
891
- html += format_message_html(msg["role"], msg["content"], show_thinking, show_raw)
892
- return html
893
-
894
- # ==============================================================================
895
- # Simplified send_message handler with separate buttons
896
- # ==============================================================================
897
- def send_message(message, show_thinking, temperature, model_choice, max_tokens, show_raw):
898
- global stop_generation
899
- stop_generation.clear()
900
-
901
- if not message.strip():
902
- return "", "", "⚡ 0.0 tok/s", gr.update(interactive=True), gr.update(interactive=False)
903
-
904
- # Disable send button, enable stop button
905
- yield "", "", "⚡ Generating...", gr.update(interactive=False), gr.update(interactive=True)
906
-
907
- # Switch backend based on selection (or auto-select)
908
- if model_choice == "🤖 Auto (Smart Selection)":
909
- backend = select_model_auto(message, available_models)
910
- model_name = backend.get_name()
911
- yield "", f"<div style='background: #dbeafe; padding: 12px; border-radius: 8px; margin: 8px 0; border-left: 3px solid #3b82f6;'><strong>🤖 Auto-selected:</strong> {model_name}</div>", "⚡ Generating...", gr.update(interactive=False), gr.update(interactive=True)
912
- else:
913
- backend = available_models[model_choice]
914
-
915
- # Create single-turn history
916
- history = [{"role": "user", "content": message}]
917
-
918
- # Show user message immediately
919
- yield "", render_history(history, show_thinking, show_raw), "⚡ Generating...", gr.update(interactive=False), gr.update(interactive=True)
920
-
921
- # Generate prompt (single turn, no history)
922
- prompt = f"User: {message}\nSam: <think>"
923
-
924
- # Start assistant message
925
- history.append({"role": "assistant", "content": "<think>"})
926
-
927
- # Stream response
928
- last_tokens_per_sec = 0
929
- was_stopped = False
930
-
931
- for chunk_data in generate_response_stream(prompt, temperature, backend, max_tokens):
932
- if len(chunk_data) == 5: # New format with stopped flag
933
- new_chunk, in_thinking, tokens_per_sec, avg_tokens_per_sec, stopped = chunk_data
934
-
935
- if stopped:
936
- was_stopped = True
937
- print(" ✅ Generation stopped successfully")
938
- break
939
-
940
- if new_chunk: # Only update if there's actual content
941
- history[-1]["content"] += new_chunk
942
-
943
- last_tokens_per_sec = avg_tokens_per_sec
944
-
945
- # Update UI on every chunk - keep stop button enabled
946
- speed_text = f"⚡ {tokens_per_sec:.1f} tok/s"
947
- yield "", render_history(history, show_thinking, show_raw), speed_text, gr.update(interactive=False), gr.update(interactive=True)
948
-
949
- # Final yield - enable send button, disable stop button
950
- if was_stopped:
951
- final_speed = f"🛑 Stopped at {last_tokens_per_sec:.1f} tok/s"
952
- else:
953
- final_speed = f"✅ {last_tokens_per_sec:.1f} tok/s (avg)"
954
-
955
- print(f" 📊 Final speed: {final_speed}")
956
- yield "", render_history(history, show_thinking, show_raw), final_speed, gr.update(interactive=True), gr.update(interactive=False)
957
-
958
- def stop_generation_handler():
959
- """Handle stop button click."""
960
- global stop_generation
961
- print(" 🛑 Stop button clicked - setting stop flag")
962
- stop_generation.set()
963
- return "🛑 Stopping...", gr.update(interactive=False), gr.update(interactive=False)
964
-
965
- def clear_chat():
966
- """Clear chat and reset UI."""
967
- return "", "⚡ Ready", gr.update(interactive=True), gr.update(interactive=False)
968
-
969
- def update_raw_view(history, show_thinking, show_raw):
970
- """Update the chat display when raw checkbox is toggled."""
971
- return render_history(history, show_thinking, show_raw)
972
-
973
- # Create Gradio interface
974
- with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="slate")) as demo:
975
- # Announcement Banner
976
- gr.HTML("""
977
- <div class="announcement-banner">
978
- 🎉 <strong>SAM-X-1 V2.2 IS HERE!</strong> 🚀<br>
979
- ✨ <strong>NEW:</strong> Auto Model Selection - Let AI pick the perfect model for your task!<br>
980
- ⚡ <strong>NEW:</strong> Dynamic Batching - Up to 4x faster UI updates on Nano & Mini!<br>
981
- 🔥 <strong>TRY IT NOW:</strong> Use "Auto" mode and watch it intelligently choose Nano for speed or Large for complexity!<br>
982
- 💎 <strong>Nano & Mini models are BLAZING fast</strong> - Perfect for quick questions and coding tasks!
983
- </div>
984
- """)
985
-
986
- gr.Markdown("# 🤖 SAM-X-1 Fast Chat (No History)")
987
-
988
- # Settings panel
989
- with gr.Accordion("⚙️ Settings", open=False):
990
- with gr.Row():
991
- model_selector = gr.Dropdown(
992
- choices=["🤖 Auto (Smart Selection)"] + list(available_models.keys()),
993
- value="🤖 Auto (Smart Selection)",
994
- label="Model Selection",
995
- info="Auto picks the best model for your prompt"
996
- )
997
-
998
- max_tokens_slider = gr.Slider(
999
- minimum=64,
1000
- maximum=512,
1001
- value=256,
1002
- step=64,
1003
- label="Max Tokens",
1004
- info="Lower = Faster generation"
1005
- )
1006
-
1007
- with gr.Row():
1008
- temperature_slider = gr.Slider(
1009
- minimum=0.0,
1010
- maximum=2.0,
1011
- value=0.7,
1012
- step=0.1,
1013
- label="Temperature",
1014
- info="Higher = more creative, Lower = more focused"
1015
- )
1016
-
1017
- with gr.Row():
1018
- show_thinking_checkbox = gr.Checkbox(
1019
- label="Show Thinking Process",
1020
- value=True,
1021
- info="Display model's reasoning"
1022
- )
1023
- show_raw_checkbox = gr.Checkbox(
1024
- label="Show Raw Response (Debug)",
1025
- value=False,
1026
- info="See all special tokens including <|endoftext|>"
1027
- )
1028
-
1029
- # Speed indicator
1030
- speed_display = gr.Textbox(
1031
- label="Generation Speed",
1032
- value="⚡ Ready",
1033
- interactive=False,
1034
- elem_classes=["speed-indicator"]
1035
- )
1036
-
1037
- # Chat display
1038
- chat_html = gr.HTML(value="", elem_classes=["chat-container"])
1039
-
1040
- # Input area with separate send and stop buttons
1041
- with gr.Row(elem_classes=["input-row"]):
1042
- msg_input = gr.Textbox(
1043
- placeholder="Ask me anything...",
1044
- show_label=False,
1045
- container=False,
1046
- scale=8
1047
- )
1048
- with gr.Column(scale=1, min_width=120):
1049
- with gr.Row():
1050
- send_btn = gr.Button("▶", variant="primary", elem_classes=["circular-btn", "send-btn"], interactive=True)
1051
- stop_btn = gr.Button("⏹", variant="stop", elem_classes=["circular-btn", "stop-btn"], interactive=False)
1052
-
1053
- with gr.Row():
1054
- clear_btn = gr.Button("🗑️ Clear", size="sm")
1055
-
1056
- gr.Markdown("""
1057
- ### 🎯 Try These Examples with Auto Mode:
1058
-
1059
- **Simple (→ Nano):**
1060
- - "Hi, how are you?"
1061
- - "What is Python?"
1062
- - "Tell me a joke"
1063
-
1064
- **Medium (→ Mini):**
1065
- - "Write a short story about a robot"
1066
- - "Summarize the benefits of exercise"
1067
- - "Create a simple Python function to sort a list"
1068
-
1069
- **Complex (→ Fast):**
1070
- - "Analyze the differences between procedural and object-oriented programming"
1071
- - "Compare and contrast democracy and authoritarianism"
1072
- - "Explain how neural networks learn with backpropagation"
1073
-
1074
- **Very Hard (→ Large):**
1075
- - "Prove why the Pythagorean theorem works using geometric reasoning"
1076
- - "Derive the formula for compound interest step by step"
1077
- - "Explain the philosophical implications of Gödel's incompleteness theorems"
1078
-
1079
- ### 💡 Speed Optimization Tips:
1080
- - **Auto mode (Default)**: Balances speed and quality automatically
1081
- - **Manual Nano**: 30-40 tok/s - Best for simple questions
1082
- - **Manual Mini**: 20-30 tok/s - Great for most tasks
1083
- - **Manual Fast**: 15-20 tok/s - Good for complex reasoning
1084
- - **Manual Large**: 10-15 tok/s - Use only for hardest problems
1085
- - **Temperature = 0**: Greedy decoding (fastest, deterministic)
1086
- - **Lower max tokens**: Stop generation earlier
1087
-
1088
- ### ⚡ V2.2 Features:
1089
- - ✅ **Smart Auto-Selection** - AI picks the right model for your prompt
1090
- - ✅ **Dynamic Decode Batching** - Adjusts from 2-8 tokens based on speed
1091
- - ✅ **Faster UI Updates** - Nano batches 8 tokens = 4x smoother experience
1092
- - ✅ **Complexity Analysis** - Examines length, keywords, code, multi-step questions
1093
- - ✅ **Instant Stop Button** - Interrupt generation with no delay
1094
- - ✅ **Debug Mode** - See all special tokens in raw view
1095
-
1096
- ### 🎯 Expected Speed (2vCPU):
1097
- - **Nano**: 30-40 tok/s (batch: 8) ⚡⚡
1098
- - **Mini**: 20-30 tok/s (batch: 5) 🚀
1099
- - **Fast**: 15-20 tok/s (batch: 3) ⚡
1100
- - **Large**: 10-15 tok/s (batch: 2) 💎
1101
-
1102
- ### 🚀 What's New:
1103
- - **V2.2**: Auto model selection + Dynamic batching
1104
- - **V2.1**: Separate Send/Stop buttons + EOS fixes + Debug view
1105
- - **V2.0**: Multi-model support + Speed optimizations
1106
- """)
1107
-
1108
- # Event handlers
1109
- send_outputs = [msg_input, chat_html, speed_display, send_btn, stop_btn]
1110
-
1111
- # Send button
1112
- send_btn.click(
1113
- send_message,
1114
- inputs=[msg_input, show_thinking_checkbox, temperature_slider, model_selector, max_tokens_slider, show_raw_checkbox],
1115
- outputs=send_outputs
1116
- )
1117
-
1118
- msg_input.submit(
1119
- send_message,
1120
- inputs=[msg_input, show_thinking_checkbox, temperature_slider, model_selector, max_tokens_slider, show_raw_checkbox],
1121
- outputs=send_outputs
1122
- )
1123
-
1124
- # Stop button
1125
- stop_btn.click(
1126
- stop_generation_handler,
1127
- outputs=[speed_display, send_btn, stop_btn]
1128
- )
1129
-
1130
- clear_btn.click(
1131
- clear_chat,
1132
- outputs=[chat_html, speed_display, send_btn, stop_btn]
1133
- )
1134
-
1135
- demo.launch(debug=True, share=True)
 
11
  from abc import ABC, abstractmethod
12
  import time
13
  import threading
14
+ import hashlib
15
+ import sqlite3
16
+ from datetime import datetime, timedelta
17
+ import pytz
18
 
19
  # ==============================================================================
20
  # Performance Optimizations for CPU
21
  # ==============================================================================
 
22
  tf.config.threading.set_inter_op_parallelism_threads(1)
23
  tf.config.threading.set_intra_op_parallelism_threads(2)
 
 
24
  tf.config.optimizer.set_jit(True)
 
 
25
  tf.config.run_functions_eagerly(False)
 
 
26
  os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'
27
  os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
28
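A quick way to confirm the thread-pool settings above took effect; a sketch, not part of the commit (both setters must run before TensorFlow executes its first op):

import tensorflow as tf

tf.config.threading.set_inter_op_parallelism_threads(1)
tf.config.threading.set_intra_op_parallelism_threads(2)
print(tf.config.threading.get_inter_op_parallelism_threads())  # -> 1
print(tf.config.threading.get_intra_op_parallelism_threads())  # -> 2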
 
29
+ # Australian timezone
30
+ AUSTRALIA_TZ = pytz.timezone('Australia/Sydney')
31
+
32
+ # ==============================================================================
33
+ # Database Setup
34
+ # ==============================================================================
35
+ def init_database():
36
+ """Initialize SQLite database for users and subscriptions."""
37
+ conn = sqlite3.connect('sam_users.db', check_same_thread=False)
38
+ c = conn.cursor()
39
+
40
+ # Users table
41
+ c.execute('''CREATE TABLE IF NOT EXISTS users
42
+ (id INTEGER PRIMARY KEY AUTOINCREMENT,
43
+ username TEXT UNIQUE NOT NULL,
44
+ password_hash TEXT NOT NULL,
45
+ email TEXT,
46
+ plan TEXT DEFAULT 'free',
47
+ created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
48
+ is_admin BOOLEAN DEFAULT 0,
49
+ rate_limit_start TIMESTAMP,
50
+ messages_used_nano INTEGER DEFAULT 0,
51
+ messages_used_mini INTEGER DEFAULT 0,
52
+ messages_used_fast INTEGER DEFAULT 0,
53
+ messages_used_large INTEGER DEFAULT 0)''')
54
+
55
+ # Upgrade requests table
56
+ c.execute('''CREATE TABLE IF NOT EXISTS upgrade_requests
57
+ (id INTEGER PRIMARY KEY AUTOINCREMENT,
58
+ user_id INTEGER,
59
+ requested_plan TEXT,
60
+ reason TEXT,
61
+ status TEXT DEFAULT 'pending',
62
+ created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
63
+ FOREIGN KEY (user_id) REFERENCES users(id))''')
64
+
65
+ # Usage tracking
66
+ c.execute('''CREATE TABLE IF NOT EXISTS usage_logs
67
+ (id INTEGER PRIMARY KEY AUTOINCREMENT,
68
+ user_id INTEGER,
69
+ tokens_used INTEGER,
70
+ model_used TEXT,
71
+ timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
72
+ FOREIGN KEY (user_id) REFERENCES users(id))''')
73
+
74
+ # Create admin account if not exists
75
+ admin_pass = hashlib.sha256("admin123".encode()).hexdigest()
76
+ try:
77
+ c.execute("INSERT INTO users (username, password_hash, email, plan, is_admin) VALUES (?, ?, ?, ?, ?)",
78
+ ("admin", admin_pass, "[email protected]", "pro", 1))
79
+ conn.commit()
80
+ print("✅ Admin account created (username: admin, password: admin123)")
81
+ except sqlite3.IntegrityError:
82
+ print("✅ Admin account already exists")
83
+
84
+ conn.commit()
85
+ return conn
86
+
87
+ # Global database connection
88
+ db_conn = init_database()
89
+ db_lock = threading.Lock()
90
+
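The single connection above is opened with check_same_thread=False and shared across Gradio's worker threads, with db_lock serialising access; a minimal sketch of that pattern on its own (the table and values here are illustrative, not from app.py):

import sqlite3
import threading

conn = sqlite3.connect(":memory:", check_same_thread=False)   # one connection shared by all threads
lock = threading.Lock()
conn.execute("CREATE TABLE kv (k TEXT PRIMARY KEY, v TEXT)")

def put(key, value):
    with lock:                                                 # mirrors the db_lock usage in app.py
        conn.execute("INSERT OR REPLACE INTO kv VALUES (?, ?)", (key, value))
        conn.commit()

put("plan", "free")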
91
+ # Plan limits with 3-hour rolling window
92
+ PLAN_LIMITS = {
93
+ 'free': {
94
+ 'nano_messages': -1,
95
+ 'mini_messages': -1,
96
+ 'fast_messages': 10,
97
+ 'large_messages': 8,
98
+ 'can_choose_model': False,
99
+ 'max_tokens': 256,
100
+ 'reset_hours': 3
101
+ },
102
+ 'plus': {
103
+ 'nano_messages': -1,
104
+ 'mini_messages': -1,
105
+ 'fast_messages': -1,
106
+ 'large_messages': 20,
107
+ 'can_choose_model': True,
108
+ 'max_tokens': 384,
109
+ 'reset_hours': 3
110
+ },
111
+ 'pro': {
112
+ 'nano_messages': -1,
113
+ 'mini_messages': -1,
114
+ 'fast_messages': -1,
115
+ 'large_messages': -1,
116
+ 'can_choose_model': True,
117
+ 'max_tokens': 512,
118
+ 'reset_hours': 3
119
+ }
120
+ }
121
+
122
+ def get_model_type(model_name):
123
+ """Get model type from model name."""
124
+ if 'Nano' in model_name:
125
+ return 'nano'
126
+ elif 'Mini' in model_name:
127
+ return 'mini'
128
+ elif 'Fast' in model_name:
129
+ return 'fast'
130
+ elif 'Large' in model_name:
131
+ return 'large'
132
+ return 'nano'
133
+
134
  # ==============================================================================
135
+ # User Management Functions
136
+ # ==============================================================================
137
+ def hash_password(password):
138
+ return hashlib.sha256(password.encode()).hexdigest()
139
+
140
+ def create_user(username, password, email=""):
141
+ with db_lock:
142
+ try:
143
+ c = db_conn.cursor()
144
+ now = datetime.now(AUSTRALIA_TZ).isoformat()
145
+ c.execute("INSERT INTO users (username, password_hash, email, rate_limit_start) VALUES (?, ?, ?, ?)",
146
+ (username, hash_password(password), email, now))
147
+ db_conn.commit()
148
+ return True, "Account created successfully!"
149
+ except sqlite3.IntegrityError:
150
+ return False, "Username already exists!"
151
+
152
+ def authenticate_user(username, password):
153
+ with db_lock:
154
+ c = db_conn.cursor()
155
+ c.execute("SELECT id, password_hash, plan, is_admin FROM users WHERE username = ?", (username,))
156
+ result = c.fetchone()
157
+
158
+ if result and result[1] == hash_password(password):
159
+ return True, {"id": result[0], "username": username, "plan": result[2], "is_admin": bool(result[3])}
160
+ return False, None
161
+
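A quick usage sketch of the two account helpers above (the username and password are made up; new accounts default to the 'free' plan per the table schema):

ok, msg = create_user("alice", "s3cret-passphrase")   # stores a SHA-256 hash, never the raw password
print(ok, msg)                                        # -> True "Account created successfully!"

ok, user = authenticate_user("alice", "s3cret-passphrase")
if ok:
    print(user["plan"], user["is_admin"])             # -> free False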
162
+ def check_and_reset_limits(user_id):
163
+ """Check if 3-hour window has passed and reset limits if needed."""
164
+ with db_lock:
165
+ c = db_conn.cursor()
166
+ c.execute("SELECT rate_limit_start, plan FROM users WHERE id = ?", (user_id,))
167
+ result = c.fetchone()
168
+
169
+ if not result:
170
+ return
171
+
172
+ rate_limit_start_str, plan = result
173
+ reset_hours = PLAN_LIMITS[plan]['reset_hours']
174
+
175
+ if rate_limit_start_str:
176
+ rate_limit_start = datetime.fromisoformat(rate_limit_start_str)
177
+ now = datetime.now(AUSTRALIA_TZ)
178
+
179
+ if now - rate_limit_start >= timedelta(hours=reset_hours):
180
+ new_start = now.isoformat()
181
+ c.execute("""UPDATE users
182
+ SET rate_limit_start = ?,
183
+ messages_used_nano = 0,
184
+ messages_used_mini = 0,
185
+ messages_used_fast = 0,
186
+ messages_used_large = 0
187
+ WHERE id = ?""", (new_start, user_id))
188
+ db_conn.commit()
189
+
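The reset above is a fixed-length window anchored at rate_limit_start: once reset_hours have elapsed, all four counters go back to zero and the window restarts at "now". The same comparison in isolation (timestamps are illustrative):

from datetime import datetime, timedelta
import pytz

tz = pytz.timezone("Australia/Sydney")
window_start = tz.localize(datetime(2025, 1, 1, 9, 0))   # window opened at 09:00
now = tz.localize(datetime(2025, 1, 1, 12, 5))           # 3h05m later

if now - window_start >= timedelta(hours=3):
    window_start = now                                   # counters would be reset here
    print("window expired, counters reset at", window_start.isoformat())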
190
+ def get_user_limits_info(user_id):
191
+ """Get user's current usage and limits with reset time."""
192
+ check_and_reset_limits(user_id)
193
+
194
+ with db_lock:
195
+ c = db_conn.cursor()
196
+ c.execute("""SELECT plan, rate_limit_start,
197
+ messages_used_nano, messages_used_mini,
198
+ messages_used_fast, messages_used_large
199
+ FROM users WHERE id = ?""", (user_id,))
200
+ result = c.fetchone()
201
+
202
+ if not result:
203
+ return None
204
+
205
+ plan, rate_limit_start_str, nano_used, mini_used, fast_used, large_used = result
206
+ limits = PLAN_LIMITS[plan]
207
+
208
+ if rate_limit_start_str:
209
+ rate_limit_start = datetime.fromisoformat(rate_limit_start_str)
210
+ reset_time = rate_limit_start + timedelta(hours=limits['reset_hours'])
211
+ now = datetime.now(AUSTRALIA_TZ)
212
+ time_until_reset = reset_time - now
213
+
214
+ hours, remainder = divmod(int(time_until_reset.total_seconds()), 3600)
215
+ minutes, seconds = divmod(remainder, 60)
216
+ reset_str = f"{hours}h {minutes}m"
217
+ else:
218
+ reset_str = "N/A"
219
+
220
+ return {
221
+ 'plan': plan,
222
+ 'nano_used': nano_used,
223
+ 'mini_used': mini_used,
224
+ 'fast_used': fast_used,
225
+ 'large_used': large_used,
226
+ 'nano_limit': limits['nano_messages'],
227
+ 'mini_limit': limits['mini_messages'],
228
+ 'fast_limit': limits['fast_messages'],
229
+ 'large_limit': limits['large_messages'],
230
+ 'can_choose_model': limits['can_choose_model'],
231
+ 'max_tokens': limits['max_tokens'],
232
+ 'reset_in': reset_str
233
+ }
234
+
235
+ def can_use_model(user_id, model_name):
236
+ """Check if user can use a specific model."""
237
+ info = get_user_limits_info(user_id)
238
+ if not info:
239
+ return False, "User not found"
240
+
241
+ model_type = get_model_type(model_name)
242
+ used_key = f"{model_type}_used"
243
+ limit_key = f"{model_type}_limit"
244
+
245
+ used = info[used_key]
246
+ limit = info[limit_key]
247
+
248
+ if limit == -1:
249
+ return True, "OK"
250
+
251
+ if used >= limit:
252
+ return False, f"Limit reached for {model_type.upper()} model ({used}/{limit}). Resets in {info['reset_in']}"
253
+
254
+ return True, "OK"
255
+
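Typical call sequence for the quota helpers, roughly as the chat handler would use them (user id 1 is assumed to be the seeded admin account; the refusal string is just an example of the format returned above):

allowed, reason = can_use_model(1, "SAM-X-1-Large")
if allowed:
    increment_model_usage(1, "SAM-X-1-Large")   # count the message against the 3-hour window
else:
    print(reason)                               # e.g. "Limit reached for LARGE model (8/8). Resets in 2h 41m"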
256
+ def increment_model_usage(user_id, model_name):
257
+ """Increment usage counter for a model."""
258
+ model_type = get_model_type(model_name)
259
+ column = f"messages_used_{model_type}"
260
+
261
+ with db_lock:
262
+ c = db_conn.cursor()
263
+ c.execute(f"UPDATE users SET {column} = {column} + 1 WHERE id = ?", (user_id,))
264
+ db_conn.commit()
265
+
266
+ def get_available_models_for_user(user_id):
267
+ """Get list of models user can currently use."""
268
+ info = get_user_limits_info(user_id)
269
+ if not info:
270
+ return []
271
+
272
+ available = []
273
+
274
+ for model_type in ['nano', 'mini', 'fast', 'large']:
275
+ used = info[f'{model_type}_used']
276
+ limit = info[f'{model_type}_limit']
277
+
278
+ if limit == -1 or used < limit:
279
+ for model_name in available_models.keys():
280
+ if get_model_type(model_name) == model_type:
281
+ available.append(model_name)
282
+ break
283
+
284
+ return available
285
+
286
+ def log_usage(user_id, tokens, model):
287
+ with db_lock:
288
+ c = db_conn.cursor()
289
+ c.execute("INSERT INTO usage_logs (user_id, tokens_used, model_used) VALUES (?, ?, ?)",
290
+ (user_id, tokens, model))
291
+ db_conn.commit()
292
+
293
+ def request_upgrade(user_id, plan, reason):
294
+ with db_lock:
295
+ try:
296
+ c = db_conn.cursor()
297
+ c.execute("INSERT INTO upgrade_requests (user_id, requested_plan, reason) VALUES (?, ?, ?)",
298
+ (user_id, plan, reason))
299
+ db_conn.commit()
300
+ return True, "Upgrade request submitted! Admin will review soon."
301
+ except Exception as e:
302
+ return False, f"Error: {str(e)}"
303
+
304
+ def get_all_users():
305
+ with db_lock:
306
+ c = db_conn.cursor()
307
+ c.execute("""SELECT id, username, email, plan, created_at, is_admin,
308
+ messages_used_nano, messages_used_mini,
309
+ messages_used_fast, messages_used_large,
310
+ rate_limit_start
311
+ FROM users ORDER BY created_at DESC""")
312
+ return c.fetchall()
313
+
314
+ def get_pending_requests():
315
+ with db_lock:
316
+ c = db_conn.cursor()
317
+ c.execute("""SELECT r.id, u.username, r.requested_plan, r.reason, r.created_at
318
+ FROM upgrade_requests r
319
+ JOIN users u ON r.user_id = u.id
320
+ WHERE r.status = 'pending'
321
+ ORDER BY r.created_at DESC""")
322
+ return c.fetchall()
323
+
324
+ def update_user_plan(username, new_plan):
325
+ with db_lock:
326
+ try:
327
+ c = db_conn.cursor()
328
+ now = datetime.now(AUSTRALIA_TZ).isoformat()
329
+ c.execute("""UPDATE users
330
+ SET plan = ?,
331
+ rate_limit_start = ?,
332
+ messages_used_nano = 0,
333
+ messages_used_mini = 0,
334
+ messages_used_fast = 0,
335
+ messages_used_large = 0
336
+ WHERE username = ?""", (new_plan, now, username))
337
+ db_conn.commit()
338
+ return True, f"User {username} upgraded to {new_plan}!"
339
+ except Exception as e:
340
+ return False, f"Error: {str(e)}"
341
+
342
+ def approve_request(request_id):
343
+ with db_lock:
344
+ try:
345
+ c = db_conn.cursor()
346
+ c.execute("SELECT user_id, requested_plan FROM upgrade_requests WHERE id = ?", (request_id,))
347
+ result = c.fetchone()
348
+
349
+ if result:
350
+ user_id, plan = result
351
+ now = datetime.now(AUSTRALIA_TZ).isoformat()
352
+ c.execute("""UPDATE users
353
+ SET plan = ?,
354
+ rate_limit_start = ?,
355
+ messages_used_nano = 0,
356
+ messages_used_mini = 0,
357
+ messages_used_fast = 0,
358
+ messages_used_large = 0
359
+ WHERE id = ?""", (plan, now, user_id))
360
+ c.execute("UPDATE upgrade_requests SET status = 'approved' WHERE id = ?", (request_id,))
361
+ db_conn.commit()
362
+ return True, "Request approved!"
363
+ return False, "Request not found"
364
+ except Exception as e:
365
+ return False, f"Error: {str(e)}"
366
+
367
+ def deny_request(request_id):
368
+ with db_lock:
369
+ try:
370
+ c = db_conn.cursor()
371
+ c.execute("UPDATE upgrade_requests SET status = 'denied' WHERE id = ?", (request_id,))
372
+ db_conn.commit()
373
+ return True, "Request denied"
374
+ except Exception as e:
375
+ return False, f"Error: {str(e)}"
376
+
377
+ # ==============================================================================
378
+ # Model Architecture
379
  # ==============================================================================
380
  @keras.saving.register_keras_serializable()
381
  class RotaryEmbedding(keras.layers.Layer):
 
392
  t = tf.range(self.max_len, dtype=tf.float32)
393
  freqs = tf.einsum("i,j->ij", t, inv_freq)
394
  emb = tf.concat([freqs, freqs], axis=-1)
 
395
  self.cos_cached = tf.constant(tf.cos(emb), dtype=tf.float32)
396
  self.sin_cached = tf.constant(tf.sin(emb), dtype=tf.float32)
397
  self.built_cache = True
 
406
  dtype = q.dtype
407
  cos = tf.cast(self.cos_cached[:seq_len, :], dtype)[None, None, :, :]
408
  sin = tf.cast(self.sin_cached[:seq_len, :], dtype)[None, None, :, :]
 
409
  q_rotated = (q * cos) + (self.rotate_half(q) * sin)
410
  k_rotated = (k * cos) + (self.rotate_half(k) * sin)
 
411
  return q_rotated, k_rotated
412
 
413
  def get_config(self):
 
415
  config.update({"dim": self.dim, "max_len": self.max_len, "theta": self.theta})
416
  return config
417
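For intuition on the cached tables above: cos_cached and sin_cached come from the standard RoPE angle construction. A NumPy sketch of the same einsum, assuming the usual inverse-frequency definition (the inv_freq line is not shown in this hunk, and dim/max_len below are illustrative):

import numpy as np

dim, max_len, theta = 8, 16, 10000.0
inv_freq = 1.0 / (theta ** (np.arange(0, dim, 2, dtype=np.float32) / dim))  # one frequency per channel pair
t = np.arange(max_len, dtype=np.float32)
freqs = np.einsum("i,j->ij", t, inv_freq)            # (max_len, dim/2) position-by-frequency angles
emb = np.concatenate([freqs, freqs], axis=-1)        # duplicated to match the head dimension
cos_cached, sin_cached = np.cos(emb), np.sin(emb)    # shapes: (max_len, dim)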
 
 
418
  @keras.saving.register_keras_serializable()
419
  class RMSNorm(keras.layers.Layer):
420
  def __init__(self, epsilon=1e-5, **kwargs):
 
433
  config.update({"epsilon": self.epsilon})
434
  return config
435
 
 
436
  @keras.saving.register_keras_serializable()
437
  class TransformerBlock(keras.layers.Layer):
438
  def __init__(self, d_model, n_heads, ff_dim, dropout, max_len, rope_theta, layer_idx=0, **kwargs):
 
445
  self.rope_theta = rope_theta
446
  self.head_dim = d_model // n_heads
447
  self.layer_idx = layer_idx
 
448
  self.pre_attn_norm = RMSNorm()
449
  self.pre_ffn_norm = RMSNorm()
 
450
  self.q_proj = keras.layers.Dense(d_model, use_bias=False, name="q_proj")
451
  self.k_proj = keras.layers.Dense(d_model, use_bias=False, name="k_proj")
452
  self.v_proj = keras.layers.Dense(d_model, use_bias=False, name="v_proj")
453
  self.out_proj = keras.layers.Dense(d_model, use_bias=False, name="o_proj")
 
454
  self.rope = RotaryEmbedding(self.head_dim, max_len=max_len, theta=rope_theta)
 
455
  self.gate_proj = keras.layers.Dense(ff_dim, use_bias=False, name="gate_proj")
456
  self.up_proj = keras.layers.Dense(ff_dim, use_bias=False, name="up_proj")
457
  self.down_proj = keras.layers.Dense(d_model, use_bias=False, name="down_proj")
 
458
  self.dropout = keras.layers.Dropout(dropout)
459
 
460
  def call(self, x, training=None):
461
  B, T, D = tf.shape(x)[0], tf.shape(x)[1], self.d_model
462
  dtype = x.dtype
 
463
  res = x
464
  y = self.pre_attn_norm(x)
 
465
  q = tf.transpose(tf.reshape(self.q_proj(y), [B, T, self.n_heads, self.head_dim]), [0, 2, 1, 3])
466
  k = tf.transpose(tf.reshape(self.k_proj(y), [B, T, self.n_heads, self.head_dim]), [0, 2, 1, 3])
467
  v = tf.transpose(tf.reshape(self.v_proj(y), [B, T, self.n_heads, self.head_dim]), [0, 2, 1, 3])
 
468
  q, k = self.rope(q, k)
 
469
  scores = tf.matmul(q, k, transpose_b=True) / tf.sqrt(tf.cast(self.head_dim, dtype))
470
+ mask = tf.where(tf.linalg.band_part(tf.ones([T, T], dtype=dtype), -1, 0) == 0, tf.constant(-1e9, dtype=dtype), tf.constant(0.0, dtype=dtype))

471
  scores += mask
472
  attn = tf.matmul(tf.nn.softmax(scores, axis=-1), v)
 
473
  attn = tf.reshape(tf.transpose(attn, [0, 2, 1, 3]), [B, T, D])
474
  x = res + self.dropout(self.out_proj(attn), training=training)
 
475
  res = x
476
  y = self.pre_ffn_norm(x)
477
  ffn = self.down_proj(keras.activations.silu(self.gate_proj(y)) * self.up_proj(y))
 
478
  return res + self.dropout(ffn, training=training)
479
 
480
  def get_config(self):
481
  config = super().get_config()
482
+ # PART 2 - Continue from Part 1
+ config.update({"d_model": self.d_model, "n_heads": self.n_heads, "ff_dim": self.ff_dim, "dropout": self.dropout_rate, "max_len": self.max_len, "rope_theta": self.rope_theta, "layer_idx": self.layer_idx})
+ return config
489
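The causal mask built in call() above uses tf.linalg.band_part to keep the lower triangle and pushes everything else to a large negative value before the softmax. A standalone sketch for a length-4 sequence (not part of the commit):

import tensorflow as tf

T = 4
lower = tf.linalg.band_part(tf.ones([T, T]), -1, 0)   # ones on and below the diagonal
mask = tf.where(lower == 0, -1e9, 0.0)                # -1e9 where a position must not attend
print(mask.numpy())                                   # row i attends only to columns <= i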
 
490
  @keras.saving.register_keras_serializable()
491
  class SAM1Model(keras.Model):
 
497
  self.cfg = kwargs
498
  else:
499
  self.cfg = kwargs.get('cfg', kwargs)
 
500
  self.embed = keras.layers.Embedding(self.cfg['vocab_size'], self.cfg['d_model'], name="embed_tokens")
 
501
  ff_dim = int(self.cfg['d_model'] * self.cfg['ff_mult'])
502
+ block_args = {'d_model': self.cfg['d_model'], 'n_heads': self.cfg['n_heads'], 'ff_dim': ff_dim, 'dropout': self.cfg['dropout'], 'max_len': self.cfg['max_len'], 'rope_theta': self.cfg['rope_theta']}

503
  self.blocks = []
504
  for i in range(self.cfg['n_layers']):
505
  block = TransformerBlock(name=f"block_{i}", layer_idx=i, **block_args)
506
  self.blocks.append(block)
 
507
  self.norm = RMSNorm(name="final_norm")
508
  self.lm_head = keras.layers.Dense(self.cfg['vocab_size'], use_bias=False, name="lm_head")
509
 
510
  def call(self, input_ids, training=None):
511
  x = self.embed(input_ids)
 
512
  for block in self.blocks:
513
  x = block(x, training=training)
 
514
  return self.lm_head(self.norm(x))
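  # Editor's note: the forward pass is embed -> n_layers stacked TransformerBlocks ->
  # final RMSNorm -> lm_head, returning raw logits of shape [batch, seq_len, vocab_size].
  # No softmax is applied here; the sampling loop further below works on logits directly.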
515
 
516
  def get_config(self):
 
518
  base_config['config'] = self.cfg
519
  return base_config
520
 
 
 
 
 
521
  def count_parameters(model):
 
522
  total_params = 0
523
  non_zero_params = 0
 
524
  for weight in model.weights:
525
  w = weight.numpy()
526
  total_params += w.size
527
  non_zero_params += np.count_nonzero(w)
 
528
  return total_params, non_zero_params
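  # Editor's note: counts are taken from the NumPy view of every weight tensor, so
  # non_zero_params reflects exactly-zero (pruned) weights after loading; the sparsity
  # shown in the UI below is 1 - non_zero/total, e.g. 80M non-zero of 100M total gives 20%.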
529
 
 
530
  def format_param_count(count):
 
531
  if count >= 1e9:
532
  return f"{count/1e9:.2f}B"
533
  elif count >= 1e6:
 
537
  else:
538
  return str(count)
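  # Editor's note (illustrative, assuming the elided branches mirror the 1e9 case with
  # 1e6/1e3 divisors): format_param_count(1_250_000_000) -> "1.25B",
  # format_param_count(3_500_000) -> "3.50M".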
539
 
 
 
 
 
540
  class ModelBackend(ABC):
541
  @abstractmethod
542
  def predict(self, input_ids):
543
  pass
 
544
  @abstractmethod
545
  def get_name(self):
546
  pass
 
547
  @abstractmethod
548
  def get_info(self):
549
  pass
550
 
 
551
  class KerasBackend(ModelBackend):
552
  def __init__(self, model, name, display_name):
553
  self.model = model
554
  self.name = name
555
  self.display_name = display_name
556
+ @tf.function(input_signature=[tf.TensorSpec(shape=[1, None], dtype=tf.int32)], jit_compile=True)
557
  def fast_predict(inputs):
558
  return model(inputs, training=False)
 
559
  self.fast_predict = fast_predict
 
 
560
  print(f" 🔥 Warming up {display_name}...")
561
  dummy = tf.constant([[1, 2, 3]], dtype=tf.int32)
562
  _ = self.fast_predict(dummy)
563
  print(f" ✅ Compilation complete!")
 
 
564
  total, non_zero = count_parameters(model)
565
  self.total_params = total
566
  self.non_zero_params = non_zero
567
  self.sparsity = (1 - non_zero / total) * 100 if total > 0 else 0
 
 
568
  self.n_heads = model.cfg.get('n_heads', 0)
569
  self.ff_dim = int(model.cfg.get('d_model', 0) * model.cfg.get('ff_mult', 0))
570
 
 
585
  info += f" Sparsity: {self.sparsity:.1f}%\n"
586
  return info
587
 
 
 
 
 
588
  MODEL_REGISTRY = [
589
  ("SAM-X-1-Large", "Smilyai-labs/Sam-1x-instruct", "ckpt.weights.h5", None),
590
  ("SAM-X-1-Fast ⚡ (BETA)", "Smilyai-labs/Sam-X-1-fast", "sam1_fast.weights.h5", "sam1_fast_config.json"),
 
592
  ("SAM-X-1-Nano ⚡⚡", "Smilyai-labs/Sam-X-1-Nano", "sam1_nano_finetuned.weights.h5", "sam1_nano_finetuned_config.json"),
593
  ]
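  # Editor's note: each registry entry is (display_name, hf_repo_id, weights_filename,
  # config_filename-or-None); a None config file means the model is built from the shared
  # base_model_config defined further below.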
594
 
595
  def estimate_prompt_complexity(prompt):
 
596
  prompt_lower = prompt.lower()
 
 
597
  complexity_score = 0
 
 
598
  word_count = len(prompt.split())
599
  if word_count > 100:
600
  complexity_score += 3
 
602
  complexity_score += 2
603
  elif word_count > 20:
604
  complexity_score += 1
605
+ hard_keywords = ['analyze', 'explain', 'compare', 'evaluate', 'prove', 'derive', 'calculate', 'solve', 'reason', 'why', 'how does', 'complex', 'algorithm', 'mathematics', 'philosophy', 'theory', 'logic', 'detailed', 'comprehensive', 'thorough', 'in-depth']
 
606
  for keyword in hard_keywords:
607
  if keyword in prompt_lower:
608
  complexity_score += 2
609
+ medium_keywords = ['write', 'create', 'generate', 'summarize', 'describe', 'list', 'what is', 'tell me', 'explain briefly']
 
610
  for keyword in medium_keywords:
611
  if keyword in prompt_lower:
612
  complexity_score += 1
 
 
613
  if any(word in prompt_lower for word in ['code', 'function', 'program', 'debug', 'implement']):
614
  complexity_score += 2
 
 
615
  if any(word in prompt_lower for word in ['first', 'then', 'next', 'finally', 'step']):
616
  complexity_score += 1
 
 
617
  question_marks = prompt.count('?')
618
  if question_marks > 1:
619
  complexity_score += 1
 
620
  return complexity_score
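  # Editor's note, a rough worked example of the scoring above: for
  # "Explain why this sorting algorithm is O(n log n)" the keyword hits are 'explain' (+2),
  # 'why' (+2) and 'algorithm' (+2), with no length, code, or multi-question bonus,
  # giving a score of 6, well above the <=2 fast path that picks the Nano model below.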
621
 
622
+ def select_model_auto(prompt, available_models_dict, user_available_models):
 
623
  complexity = estimate_prompt_complexity(prompt)
624
+ accessible = {k: v for k, v in available_models_dict.items() if k in user_available_models}
625
+ if not accessible:
626
+ return None
 
 
 
 
627
  if complexity <= 2:
628
  preferred = "SAM-X-1-Nano ⚡⚡"
629
  fallback_order = ["SAM-X-1-Mini 🚀 (ADVANCED!)", "SAM-X-1-Fast ⚡ (BETA)", "SAM-X-1-Large"]
 
636
  else:
637
  preferred = "SAM-X-1-Large"
638
  fallback_order = ["SAM-X-1-Fast ⚡ (BETA)", "SAM-X-1-Mini 🚀 (ADVANCED!)", "SAM-X-1-Nano ⚡⚡"]
639
+ if preferred in accessible:
640
+ return accessible[preferred]
 
 
 
 
 
641
  for model_name in fallback_order:
642
+ if model_name in accessible:
643
+ return accessible[model_name]
644
+ return list(accessible.values())[0]
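  # Editor's note: selection degrades gracefully: the preferred model if the user has
  # access to it, otherwise the first accessible entry in fallback_order, otherwise any
  # remaining accessible model; None is returned only when nothing is accessible at all.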
 
 
 
645
 
 
 
 
646
  CONFIG_TOKENIZER_REPO_ID = "Smilyai-labs/Sam-1-large-it-0002"
 
647
  print("="*80)
648
  print("🤖 SAM-X-1 Multi-Model Chat Interface".center(80))
649
  print("="*80)
 
 
650
  print(f"\n📦 Downloading config/tokenizer from: {CONFIG_TOKENIZER_REPO_ID}")
651
  config_path = hf_hub_download(repo_id=CONFIG_TOKENIZER_REPO_ID, filename="config.json")
652
  tokenizer_path = hf_hub_download(repo_id=CONFIG_TOKENIZER_REPO_ID, filename="tokenizer.json")
 
 
653
  with open(config_path, 'r') as f:
654
  base_config = json.load(f)
 
655
  print(f"✅ Base config loaded")
656
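  # Editor's note: the next line maps HF-style config keys onto SAM1Model kwargs; ff_mult
  # is recovered as intermediate_size / hidden_size so ff_dim can later be rebuilt as
  # int(d_model * ff_mult), e.g. hidden_size=768 with intermediate_size=3072 gives ff_mult=4.0.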
+ base_model_config = {'vocab_size': base_config['vocab_size'], 'd_model': base_config['hidden_size'], 'n_heads': base_config['num_attention_heads'], 'ff_mult': base_config['intermediate_size'] / base_config['hidden_size'], 'dropout': base_config.get('dropout', 0.0), 'max_len': base_config['max_position_embeddings'], 'rope_theta': base_config['rope_theta'], 'n_layers': base_config['num_hidden_layers']}
 
657
  print("\n🔤 Recreating tokenizer...")
658
  tokenizer = Tokenizer.from_pretrained("gpt2")
 
 
659
  eos_token = "<|endoftext|>"
660
  eos_token_id = tokenizer.token_to_id(eos_token)
 
661
  if eos_token_id is None:
 
662
  tokenizer.add_special_tokens([eos_token])
663
  eos_token_id = tokenizer.token_to_id(eos_token)
 
 
664
  custom_tokens = ["<think>", "<think/>"]
665
  for token in custom_tokens:
666
  if tokenizer.token_to_id(token) is None:
667
  tokenizer.add_special_tokens([token])
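  # Editor's note: "<think>" and "<think/>" are registered as special tokens so the
  # model's reasoning markers survive tokenization as single IDs; the streaming loop
  # below watches decoded chunks for them to flag "thinking" spans in the UI.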
 
668
  tokenizer.no_padding()
669
  tokenizer.enable_truncation(max_length=base_config['max_position_embeddings'])
 
670
  print(f"✅ Tokenizer ready (vocab size: {tokenizer.get_vocab_size()})")
671
  print(f" EOS token: '{eos_token}' (ID: {eos_token_id})")
 
 
672
  if eos_token_id is None:
673
+ raise ValueError("❌ Failed to set EOS token ID!")
 
 
674
  print("\n" + "="*80)
675
  print("📦 LOADING MODELS".center(80))
676
  print("="*80)
 
677
  available_models = {}
678
  dummy_input = tf.zeros((1, 1), dtype=tf.int32)
 
679
  for display_name, repo_id, weights_filename, config_filename in MODEL_REGISTRY:
680
  try:
681
  print(f"\n⏳ Loading: {display_name}")
682
  print(f" Repo: {repo_id}")
683
  print(f" Weights: {weights_filename}")
 
 
684
  weights_path = hf_hub_download(repo_id=repo_id, filename=weights_filename)
 
 
685
  if config_filename:
686
  print(f" Config: {config_filename}")
687
  custom_config_path = hf_hub_download(repo_id=repo_id, filename=config_filename)
688
  with open(custom_config_path, 'r') as f:
689
  model_config = json.load(f)
690
+ print(f" 📐 Custom architecture: {model_config['n_heads']} heads")
691
  else:
692
  model_config = base_model_config.copy()
 
 
693
  model = SAM1Model(**model_config)
694
  model(dummy_input)
695
  model.load_weights(weights_path)
696
  model.trainable = False
 
 
697
  backend = KerasBackend(model, display_name, display_name)
698
  available_models[display_name] = backend
 
 
699
  print(f" ✅ Loaded successfully!")
700
  print(f" 📊 Parameters: {format_param_count(backend.total_params)}")
 
 
 
701
  except Exception as e:
702
  print(f" ⚠️ Failed to load: {e}")
 
 
703
  if not available_models:
704
+ raise RuntimeError("❌ No models loaded!")
 
705
  print(f"\n✅ Successfully loaded {len(available_models)} model(s)")
 
 
706
  current_backend = list(available_models.values())[0]
 
 
707
  stop_generation = threading.Event()
708
 
 
 
 
 
709
  def generate_response_stream(prompt, temperature=0.7, backend=None, max_tokens=256):
 
710
  global stop_generation
711
  stop_generation.clear()
 
712
  if backend is None:
713
  backend = current_backend
 
 
714
  encoded_prompt = tokenizer.encode(prompt)
715
  input_ids = [i for i in encoded_prompt.ids if i != eos_token_id]
716
  generated = input_ids.copy()
 
717
  current_text = ""
718
  in_thinking = False
 
 
719
  max_len = backend.model.cfg['max_len']
 
 
720
  start_time = time.time()
721
  tokens_generated = 0
 
 
722
  decode_buffer = []
723
+ decode_every = 2
724
  last_speed_check = start_time
 
 
725
  for step in range(max_tokens):
 
726
  if stop_generation.is_set():
 
 
727
  elapsed = time.time() - start_time
728
  final_speed = tokens_generated / elapsed if elapsed > 0 else 0
729
+ yield "", False, -1, final_speed, True
730
  return
 
731
  current_input = generated[-max_len:]
 
 
732
  next_token_logits = backend.predict(current_input)
 
 
 
733
  if tokens_generated > 5 and tokens_generated % 10 == 0:
734
  current_time = time.time()
735
  elapsed_since_check = current_time - last_speed_check
736
  if elapsed_since_check > 0:
737
  recent_speed = 10 / elapsed_since_check
 
738
  if recent_speed > 25:
739
+ decode_every = 8
740
  elif recent_speed > 15:
741
+ decode_every = 5
742
  elif recent_speed > 8:
743
+ decode_every = 3
744
  else:
745
+ decode_every = 2
746
  last_speed_check = current_time
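  # Editor's note: decode_every adapts to the measured generation speed; the faster the
  # model streams, the more tokens are batched per tokenizer.decode() call, trading a
  # slightly chunkier UI update for less per-token CPU overhead.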
 
747
  if temperature > 0:
748
  next_token_logits = next_token_logits / temperature
749
  top_k = 5
 
755
  next_token = top_k_indices[np.random.choice(top_k, p=probs)]
756
  else:
757
  next_token = np.argmax(next_token_logits)
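  # Editor's note: with temperature > 0 this branch does temperature-scaled top-k sampling
  # (k=5); temperature == 0 falls back to greedy argmax. The elided lines presumably do
  # something like the following (illustrative sketch only, not the committed code):
  #   top_k_indices = np.argpartition(next_token_logits, -top_k)[-top_k:]
  #   top_k_logits = next_token_logits[top_k_indices]
  #   probs = np.exp(top_k_logits - top_k_logits.max()); probs /= probs.sum()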
 
 
758
  if next_token == eos_token_id:
 
759
  break
 
760
  generated.append(int(next_token))
761
  decode_buffer.append(int(next_token))
762
  tokens_generated += 1
763
+ should_decode = (len(decode_buffer) >= decode_every or step == max_tokens - 1)
 
 
 
 
764
  if should_decode:
765
  new_text = tokenizer.decode(generated[len(input_ids):])
766
  if len(new_text) > len(current_text):
767
  new_chunk = new_text[len(current_text):]
768
  current_text = new_text
 
769
  if "<think>" in new_chunk:
770
  in_thinking = True
771
  elif "</think>" in new_chunk or "<think/>" in new_chunk:
772
  in_thinking = False
 
 
773
  elapsed = time.time() - start_time
774
  tokens_per_sec = tokens_generated / elapsed if elapsed > 0 else 0
 
775
  yield new_chunk, in_thinking, tokens_per_sec, tokens_per_sec, False
776
  decode_buffer = []
 
 
777
  elapsed = time.time() - start_time
778
  final_tokens_per_sec = tokens_generated / elapsed if elapsed > 0 else 0
779
  yield "", False, final_tokens_per_sec, final_tokens_per_sec, False
780