Bc-AI committed · verified
Commit 120f320 · 1 Parent(s): 1055547

Update app.py

Files changed (1)
  1. app.py +398 -191
app.py CHANGED
@@ -1,6 +1,9 @@
1
  """
2
- SAM-Z-1 Distributed Worker Node v4.0
3
- Optimized for distributed gen/decode pipeline
4
  """
5
 
6
  from fastapi import FastAPI, HTTPException
@@ -14,10 +17,56 @@ import os
14
  from tokenizers import Tokenizer
15
  import numpy as np
16
  import time
17
- from typing import List, Optional
18
  import asyncio
19
 
20
- app = FastAPI(title="SAM-Z-1 Distributed Worker", version="4.0.0")
21
 
22
  # ============================================================================
23
  # Model Architecture
@@ -201,24 +250,19 @@ class SAM1Model(keras.Model):
201
  return base_config
202
 
203
  # ============================================================================
204
- # Global State
205
  # ============================================================================
206
 
207
- model = None
208
- tokenizer = None
209
- config = None
210
- eos_token_id = None
211
- fast_forward = None
212
-
213
- MODEL_REPO = "Smilyai-labs/Sam-Z-1-tensorflow"
214
- CACHE_DIR = "./model_cache"
215
 
216
- # Stats
217
  worker_stats = {
218
  "total_requests": 0,
219
  "total_tokens": 0,
220
  "decode_requests": 0,
221
- "uptime_start": time.time()
 
222
  }
223
 
224
  # ============================================================================
@@ -234,6 +278,7 @@ class GenerateRequest(BaseModel):
234
  repetition_penalty: float = 1.1
235
  stream: bool = False
236
  return_token_ids: bool = False
 
237
 
238
  class ChatMessage(BaseModel):
239
  role: str
@@ -248,12 +293,70 @@ class ChatRequest(BaseModel):
248
  repetition_penalty: float = 1.1
249
  stream: bool = False
250
  return_token_ids: bool = False
 
251
 
252
  class DecodeRequest(BaseModel):
253
  token_ids: List[int]
 
254
 
255
  class BatchDecodeRequest(BaseModel):
256
  batches: List[List[int]]
257
 
258
  # ============================================================================
259
  # Generation Functions
@@ -266,11 +369,22 @@ def generate_tokens(
266
  top_k: int = 40,
267
  top_p: float = 0.9,
268
  repetition_penalty: float = 1.1,
269
- return_token_ids: bool = False
 
270
  ):
271
- """Core generation - yields (token_id, token_text or None)"""
272
- global model, tokenizer, config, eos_token_id, fast_forward
273
 
274
  input_ids = [i for i in tokenizer.encode(prompt).ids if i != eos_token_id]
275
 
276
  if len(input_ids) == 0:
@@ -349,26 +463,29 @@ def format_chat_prompt(messages: List[ChatMessage]) -> str:
349
 
350
  @app.get("/", response_class=HTMLResponse)
351
  async def status_page():
352
- """Worker status page"""
353
- return """
354
  <!DOCTYPE html>
355
  <html>
356
  <head>
357
- <title>SAM-Z-1 Worker Node</title>
358
  <style>
359
- * { margin: 0; padding: 0; box-sizing: border-box; }
360
- body {
361
  font-family: 'Courier New', monospace;
362
  background: linear-gradient(135deg, #1a1f3a 0%, #0a0e27 100%);
363
  color: #00bfff;
364
  padding: 20px;
365
  min-height: 100vh;
366
- }
367
- .container {
368
- max-width: 900px;
369
- margin: 0 auto;
370
- }
371
- .header {
372
  text-align: center;
373
  padding: 30px;
374
  background: rgba(0, 191, 255, 0.1);
@@ -376,93 +493,77 @@ async def status_page():
376
  border-radius: 10px;
377
  margin-bottom: 30px;
378
  box-shadow: 0 0 20px rgba(0, 191, 255, 0.3);
379
- }
380
- .header h1 {
381
  font-size: 2.5em;
382
  text-transform: uppercase;
383
  letter-spacing: 3px;
384
  animation: glow 2s ease-in-out infinite alternate;
385
- }
386
- @keyframes glow {
387
- from { text-shadow: 0 0 10px #00bfff; }
388
- to { text-shadow: 0 0 20px #00bfff, 0 0 30px #00bfff; }
389
- }
390
- .badge {
391
  display: inline-block;
392
  padding: 5px 15px;
393
  border-radius: 15px;
394
  font-size: 0.9em;
395
- margin-top: 10px;
396
- }
397
- .badge-ready {
398
  background: rgba(0, 255, 136, 0.2);
399
  border: 1px solid #00ff88;
400
  color: #00ff88;
401
- }
402
- .badge-loading {
403
  background: rgba(255, 165, 0, 0.2);
404
  border: 1px solid #ffa500;
405
  color: #ffa500;
406
- }
407
- .stats-grid {
408
  display: grid;
409
  grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
410
  gap: 20px;
411
  margin-bottom: 30px;
412
- }
413
- .stat-card {
414
  background: rgba(0, 191, 255, 0.05);
415
  border: 1px solid #00bfff;
416
  border-radius: 8px;
417
  padding: 20px;
418
  text-align: center;
419
- }
420
- .stat-label {
421
- font-size: 0.8em;
422
- opacity: 0.7;
423
- text-transform: uppercase;
424
- margin-bottom: 10px;
425
- }
426
- .stat-value {
427
- font-size: 2em;
428
- font-weight: bold;
429
- }
430
- .features {
431
  background: rgba(0, 191, 255, 0.05);
432
  border: 1px solid #00bfff;
433
  border-radius: 8px;
434
  padding: 20px;
435
- }
436
- .features h3 {
437
- margin-bottom: 15px;
438
- }
439
- .feature-list {
440
- list-style: none;
441
- padding: 0;
442
- }
443
- .feature-list li {
444
  padding: 10px;
445
  margin: 5px 0;
446
  background: rgba(0, 191, 255, 0.1);
447
  border-radius: 5px;
448
- }
449
- .feature-list li:before {
450
- content: "⚑ ";
451
- color: #00ff88;
452
- }
453
- .timestamp {
454
- text-align: center;
455
- margin-top: 20px;
456
- opacity: 0.5;
457
- }
458
  </style>
459
  </head>
460
  <body>
461
  <div class="container">
462
  <div class="header">
463
  <h1>⚙️ WORKER NODE ⚙️</h1>
464
- <div>SAM-Z-1 Distributed Worker v4.0</div>
465
- <div class="badge" id="status-badge">CHECKING STATUS...</div>
466
  </div>
467
 
468
  <div class="stats-grid" id="stats">
@@ -484,14 +585,23 @@ async def status_page():
484
  </div>
485
  </div>
486
 
487
  <div class="features">
488
  <h3>🚀 CAPABILITIES</h3>
489
  <ul class="feature-list">
490
- <li>Full Text Generation</li>
491
- <li>Token-Only Mode (for distributed pipeline)</li>
492
- <li>High-Speed Batch Decoding</li>
493
- <li>Chat Completion</li>
494
- <li>Streaming & Non-Streaming</li>
495
  </ul>
496
  </div>
497
 
@@ -499,21 +609,8 @@ async def status_page():
499
  </div>
500
 
501
  <script>
502
- async function updateStats() {
503
- try {
504
- const response = await fetch('/health');
505
- const data = await response.json();
506
-
507
- const badge = document.getElementById('status-badge');
508
- if (data.model_loaded) {
509
- badge.textContent = 'βœ… READY FOR INFERENCE';
510
- badge.className = 'badge badge-ready';
511
- } else {
512
- badge.textContent = '⏳ LOADING MODEL...';
513
- badge.className = 'badge badge-loading';
514
- }
515
-
516
- // Fetch stats
517
  const statsRes = await fetch('/stats');
518
  const stats = await statsRes.json();
519
 
@@ -525,16 +622,15 @@ async def status_page():
525
  const h = Math.floor(uptime / 3600);
526
  const m = Math.floor((uptime % 3600) / 60);
527
  const s = uptime % 60;
528
- document.getElementById('uptime').textContent = `${h}h ${m}m ${s}s`;
529
 
530
  document.getElementById('timestamp').textContent =
531
- `Last update: ${new Date().toLocaleTimeString()}`;
532
- } catch (e) {
533
  console.error('Failed to update stats:', e);
534
- }
535
- }
536
 
537
- // Update every second
538
  setInterval(updateStats, 1000);
539
  updateStats();
540
  </script>
@@ -549,8 +645,38 @@ async def status_page():
549
  @app.get("/health")
550
  async def health():
551
  return {
552
- "status": "healthy" if model is not None else "loading",
553
- "model_loaded": model is not None
 
554
  }
555
 
556
  @app.get("/stats")
@@ -561,17 +687,16 @@ async def stats():
561
  "total_tokens": worker_stats["total_tokens"],
562
  "decode_requests": worker_stats["decode_requests"],
563
  "uptime": uptime,
564
- "tokens_per_second": worker_stats["total_tokens"] / uptime if uptime > 0 else 0
 
565
  }
566
 
567
  @app.post("/decode")
568
  async def decode(request: DecodeRequest):
569
- """Fast single decode"""
570
- if tokenizer is None:
571
- raise HTTPException(status_code=503, detail="Tokenizer not loaded")
572
-
573
  try:
574
  worker_stats["decode_requests"] += 1
 
575
  text = tokenizer.decode(request.token_ids)
576
  return {"text": text}
577
  except Exception as e:
@@ -579,12 +704,10 @@ async def decode(request: DecodeRequest):
579
 
580
  @app.post("/decode/batch")
581
  async def batch_decode(request: BatchDecodeRequest):
582
- """Optimized batch decoding for distributed pipeline"""
583
- if tokenizer is None:
584
- raise HTTPException(status_code=503, detail="Tokenizer not loaded")
585
-
586
  try:
587
  worker_stats["decode_requests"] += len(request.batches)
 
588
  results = [tokenizer.decode(batch) for batch in request.batches]
589
  return {"texts": results}
590
  except Exception as e:
@@ -592,9 +715,15 @@ async def batch_decode(request: BatchDecodeRequest):
592
 
593
  @app.post("/generate")
594
  async def generate(request: GenerateRequest):
595
- """Generate text"""
596
- if model is None:
597
- raise HTTPException(status_code=503, detail="Model not loaded")
598
 
599
  worker_stats["total_requests"] += 1
600
  start_time = time.time()
@@ -612,7 +741,8 @@ async def generate(request: GenerateRequest):
612
  top_k=request.top_k,
613
  top_p=request.top_p,
614
  repetition_penalty=request.repetition_penalty,
615
- return_token_ids=request.return_token_ids
 
616
  ):
617
  token_count += 1
618
  worker_stats["total_tokens"] += 1
@@ -626,7 +756,7 @@ async def generate(request: GenerateRequest):
626
  await asyncio.sleep(0.001)
627
 
628
  elapsed = time.time() - start_time
629
- yield f"data: {json.dumps({'done': True, 'tokens': token_count, 'time': elapsed})}\n\n"
630
 
631
  except Exception as e:
632
  yield f"data: {json.dumps({'error': str(e)})}\n\n"
@@ -645,7 +775,8 @@ async def generate(request: GenerateRequest):
645
  top_k=request.top_k,
646
  top_p=request.top_p,
647
  repetition_penalty=request.repetition_penalty,
648
- return_token_ids=request.return_token_ids
 
649
  ):
650
  if not request.return_token_ids:
651
  generated_text += token_text
@@ -658,7 +789,8 @@ async def generate(request: GenerateRequest):
658
  "text": generated_text,
659
  "tokens": token_count,
660
  "time": elapsed,
661
- "tokens_per_second": token_count / elapsed if elapsed > 0 else 0
 
662
  }
663
 
664
  except Exception as e:
@@ -666,9 +798,15 @@ async def generate(request: GenerateRequest):
666
 
667
  @app.post("/chat")
668
  async def chat(request: ChatRequest):
669
- """Chat completion"""
670
- if model is None:
671
- raise HTTPException(status_code=503, detail="Model not loaded")
672
 
673
  worker_stats["total_requests"] += 1
674
  prompt = format_chat_prompt(request.messages)
@@ -687,7 +825,8 @@ async def chat(request: ChatRequest):
687
  top_k=request.top_k,
688
  top_p=request.top_p,
689
  repetition_penalty=request.repetition_penalty,
690
- return_token_ids=request.return_token_ids
 
691
  ):
692
  token_count += 1
693
  worker_stats["total_tokens"] += 1
@@ -706,7 +845,7 @@ async def chat(request: ChatRequest):
706
  await asyncio.sleep(0.001)
707
 
708
  elapsed = time.time() - start_time
709
- yield f"data: {json.dumps({'done': True, 'tokens': token_count, 'time': elapsed})}\n\n"
710
 
711
  except Exception as e:
712
  yield f"data: {json.dumps({'error': str(e)})}\n\n"
@@ -725,7 +864,8 @@ async def chat(request: ChatRequest):
725
  top_k=request.top_k,
726
  top_p=request.top_p,
727
  repetition_penalty=request.repetition_penalty,
728
- return_token_ids=request.return_token_ids
 
729
  ):
730
  if not request.return_token_ids:
731
  generated_text += token_text
@@ -746,7 +886,8 @@ async def chat(request: ChatRequest):
746
  },
747
  "tokens": token_count,
748
  "time": elapsed,
749
- "tokens_per_second": token_count / elapsed if elapsed > 0 else 0
 
750
  }
751
 
752
  except Exception as e:
@@ -756,86 +897,152 @@ async def chat(request: ChatRequest):
756
  # Model Loading
757
  # ============================================================================
758
 
759
- @app.on_event("startup")
760
- async def load_model():
761
- global model, tokenizer, config, eos_token_id, fast_forward
762
-
763
- print("πŸš€ Loading SAM-Z-1 Model...")
764
 
765
  try:
766
- config_path = hf_hub_download(MODEL_REPO, "config.json", cache_dir=CACHE_DIR)
767
 
768
- try:
769
- weights_path = hf_hub_download(MODEL_REPO, "ckpt.weights.h5", cache_dir=CACHE_DIR)
770
- print("βœ… Found checkpoint weights")
771
- use_checkpoint = True
772
- except:
773
- print("⚠️ Checkpoint not found, using model.keras")
774
- model_path = hf_hub_download(MODEL_REPO, "model.keras", cache_dir=CACHE_DIR)
775
- use_checkpoint = False
776
 
777
- with open(config_path, 'r') as f:
778
- config = json.load(f)
779
 
780
- print(f"πŸ“¦ Config loaded: {config['num_hidden_layers']} layers")
781
 
782
- print("πŸ“¦ Creating tokenizer...")
783
- from transformers import AutoTokenizer
784
 
785
- hf_tokenizer = AutoTokenizer.from_pretrained("gpt2")
786
- custom_tokens = ["<|im_start|>", "<|im_end|>", "<think>", "<think/>"]
787
- hf_tokenizer.add_special_tokens({"additional_special_tokens": custom_tokens})
788
 
789
- os.makedirs("./temp_tokenizer", exist_ok=True)
790
- hf_tokenizer.save_pretrained("./temp_tokenizer")
791
- tokenizer = Tokenizer.from_file("./temp_tokenizer/tokenizer.json")
792
 
793
- eos_token_id = config.get('eos_token_id', 50256)
794
 
795
- print(f"βœ… Tokenizer ready: vocab size {tokenizer.get_vocab_size()}")
796
 
797
- print("πŸ”„ Loading model...")
798
 
799
- if use_checkpoint:
800
- model_config = {
801
- 'vocab_size': config['vocab_size'],
802
- 'd_model': config['hidden_size'],
803
- 'n_layers': config['num_hidden_layers'],
804
- 'n_heads': config['num_attention_heads'],
805
- 'ff_mult': config['intermediate_size'] / config['hidden_size'],
806
- 'max_len': config['max_position_embeddings'],
807
- 'dropout': 0.1,
808
- 'rope_theta': config['rope_theta']
809
- }
810
-
811
- model = SAM1Model(config=model_config)
812
- dummy_input = tf.zeros((1, config['max_position_embeddings']), dtype=tf.int32)
813
- _ = model(dummy_input, training=False)
814
-
815
- print(f"βœ… Architecture built: {model.count_params():,} parameters")
816
-
817
- model.load_weights(weights_path)
818
- print("βœ… Weights loaded!")
819
 
 
820
  else:
821
- model = keras.models.load_model(model_path, compile=False)
822
- print("βœ… Model loaded!")
823
 
824
- @tf.function(reduce_retracing=True)
825
- def optimized_forward(input_tensor):
826
- return model(input_tensor, training=False)
827
 
828
- fast_forward = optimized_forward
829
 
830
- print("βœ… SAM-Z-1 Distributed Worker ready! πŸš€")
831
- print("πŸ”₯ Features enabled:")
832
- print(" - Full text generation")
833
- print(" - Token-only mode (distributed pipeline)")
834
- print(" - Batch decoding optimization")
835
- print(" - Streaming support")
836
 
837
  except Exception as e:
838
- print(f"❌ Failed to load model: {e}")
839
  import traceback
840
  traceback.print_exc()
841
  raise
 
1
  """
2
+ SAM-Z-1 Distributed Worker Node v5.0
3
+ - Supports BOTH old SAM-Z-1 AND 4 new SAM-X-1 models
4
+ - Different tokenizers and vocabularies per model family
5
+ - Auto version detection
6
+ - Backward compatible with v4 head nodes
7
  """
8
 
9
  from fastapi import FastAPI, HTTPException
 
17
  from tokenizers import Tokenizer
18
  import numpy as np
19
  import time
20
+ from typing import List, Optional, Dict
21
  import asyncio
22
 
23
+ app = FastAPI(title="SAM-Z-1 Distributed Worker", version="5.0.0")
24
+
25
+ # ============================================================================
26
+ # Configuration - ALL 5 MODELS
27
+ # ============================================================================
28
+
29
+ MODEL_REGISTRY = {
30
+ # Original SAM-Z-1 (keep this!)
31
+ "SAM-Z-1": {
32
+ "repo": "Smilyai-labs/Sam-Z-1-tensorflow",
33
+ "weights": "ckpt.weights.h5",
34
+ "config": "config.json",
35
+ "tokenizer_repo": "Smilyai-labs/Sam-Z-1-tensorflow",
36
+ "family": "sam-z" # Different tokenizer family
37
+ },
38
+ # New SAM-X-1 family (different tokenizer!)
39
+ "SAM-X-1-Large": {
40
+ "repo": "Smilyai-labs/Sam-1x-instruct",
41
+ "weights": "ckpt.weights.h5",
42
+ "config": None,
43
+ "tokenizer_repo": "Smilyai-labs/Sam-1-large-it-0002",
44
+ "family": "sam-x"
45
+ },
46
+ "SAM-X-1-Fast": {
47
+ "repo": "Smilyai-labs/Sam-X-1-fast",
48
+ "weights": "sam1_fast_finetuned.weights.h5",
49
+ "config": "sam1_fast_finetuned_config.json",
50
+ "tokenizer_repo": "Smilyai-labs/Sam-1-large-it-0002",
51
+ "family": "sam-x"
52
+ },
53
+ "SAM-X-1-Mini": {
54
+ "repo": "Smilyai-labs/Sam-X-1-Mini",
55
+ "weights": "sam1_mini_finetuned.weights.h5",
56
+ "config": "sam1_mini_finetuned_config.json",
57
+ "tokenizer_repo": "Smilyai-labs/Sam-1-large-it-0002",
58
+ "family": "sam-x"
59
+ },
60
+ "SAM-X-1-Nano": {
61
+ "repo": "Smilyai-labs/Sam-X-1-Nano",
62
+ "weights": "sam1_nano_finetuned.weights.h5",
63
+ "config": "sam1_nano_finetuned_config.json",
64
+ "tokenizer_repo": "Smilyai-labs/Sam-1-large-it-0002",
65
+ "family": "sam-x"
66
+ }
67
+ }
68
+
69
+ CACHE_DIR = "./model_cache"
70
 
71
  # ============================================================================
72
  # Model Architecture
 
250
  return base_config
251
 
252
  # ============================================================================
253
+ # Global State - Separate tokenizers per family!
254
  # ============================================================================
255
 
256
+ loaded_models = {} # Dict[model_name, (model, fast_forward, config, tokenizer, eos_token_id)]
257
+ tokenizer_cache = {} # Dict[family, (tokenizer, eos_token_id)]
258
+ current_model = None
259
 
 
260
  worker_stats = {
261
  "total_requests": 0,
262
  "total_tokens": 0,
263
  "decode_requests": 0,
264
+ "uptime_start": time.time(),
265
+ "model_usage": {}
266
  }
267
 
268
  # ============================================================================
 
278
  repetition_penalty: float = 1.1
279
  stream: bool = False
280
  return_token_ids: bool = False
281
+ model: Optional[str] = None
282
 
283
  class ChatMessage(BaseModel):
284
  role: str
 
293
  repetition_penalty: float = 1.1
294
  stream: bool = False
295
  return_token_ids: bool = False
296
+ model: Optional[str] = None
297
 
298
  class DecodeRequest(BaseModel):
299
  token_ids: List[int]
300
+ model: Optional[str] = None # Need to know which tokenizer to use!
301
 
302
  class BatchDecodeRequest(BaseModel):
303
  batches: List[List[int]]
304
+ model: Optional[str] = None
305
+
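A quick illustration of the new optional `model` field on the request schemas above (a minimal client-side sketch, not part of this commit: the worker URL, the `requests` dependency, the example model name, and the example token ids are assumptions):

import requests

WORKER_URL = "http://localhost:8000"  # assumed local worker address

# Pick a specific model; omitting "model" falls back to the worker's current default.
resp = requests.post(f"{WORKER_URL}/generate", json={
    "prompt": "Hello",          # field name assumed from the surrounding handler code
    "model": "SAM-X-1-Mini",
}).json()
print(resp.get("text"), resp.get("model"))

# Decoding must name the same model so the matching tokenizer family is used.
decoded = requests.post(f"{WORKER_URL}/decode", json={
    "token_ids": [15496, 995],  # illustrative ids only
    "model": "SAM-X-1-Mini",
}).json()
print(decoded["text"])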
306
+ # ============================================================================
307
+ # Tokenizer Management
308
+ # ============================================================================
309
+
310
+ async def load_tokenizer(family: str, repo: str) -> tuple:
311
+ """Load tokenizer for a model family"""
312
+ if family in tokenizer_cache:
313
+ return tokenizer_cache[family]
314
+
315
+ print(f" πŸ”€ Loading tokenizer for {family} family from {repo}...")
316
+
317
+ try:
318
+ from transformers import AutoTokenizer
319
+
320
+ hf_tokenizer = AutoTokenizer.from_pretrained("gpt2")
321
+ custom_tokens = ["<|im_start|>", "<|im_end|>", "<think>", "<think/>"]
322
+ hf_tokenizer.add_special_tokens({"additional_special_tokens": custom_tokens})
323
+
324
+ os.makedirs(f"./temp_tokenizer_{family}", exist_ok=True)
325
+ hf_tokenizer.save_pretrained(f"./temp_tokenizer_{family}")
326
+ tokenizer = Tokenizer.from_file(f"./temp_tokenizer_{family}/tokenizer.json")
327
+
328
+ eos_token = "<|endoftext|>"
329
+ eos_token_id = tokenizer.token_to_id(eos_token)
330
+
331
+ if eos_token_id is None:
332
+ tokenizer.add_special_tokens([eos_token])
333
+ eos_token_id = tokenizer.token_to_id(eos_token)
334
+
335
+ tokenizer_cache[family] = (tokenizer, eos_token_id)
336
+ print(f" βœ… Tokenizer ready (vocab size: {tokenizer.get_vocab_size()}, EOS: {eos_token_id})")
337
+
338
+ return tokenizer, eos_token_id
339
+
340
+ except Exception as e:
341
+ print(f" ⚠️ Tokenizer load failed: {e}")
342
+ raise
343
+
344
+ def get_tokenizer_for_model(model_name: str):
345
+ """Get the correct tokenizer for a model"""
346
+ if not model_name or model_name not in loaded_models:
347
+ model_name = current_model
348
+
349
+ if model_name in loaded_models:
350
+ _, _, _, tokenizer, eos_id = loaded_models[model_name]
351
+ return tokenizer, eos_id
352
+
353
+ # Fallback to first available
354
+ if loaded_models:
355
+ first_model = list(loaded_models.keys())[0]
356
+ _, _, _, tokenizer, eos_id = loaded_models[first_model]
357
+ return tokenizer, eos_id
358
+
359
+ raise HTTPException(status_code=503, detail="No models loaded")
360
 
361
  # ============================================================================
362
  # Generation Functions
 
369
  top_k: int = 40,
370
  top_p: float = 0.9,
371
  repetition_penalty: float = 1.1,
372
+ return_token_ids: bool = False,
373
+ model_name: Optional[str] = None
374
  ):
375
+ """Core generation with correct tokenizer per model"""
376
+ global loaded_models, current_model
377
 
378
+ # Select model
379
+ if model_name and model_name in loaded_models:
380
+ model, fast_forward, config, tokenizer, eos_token_id = loaded_models[model_name]
381
+ elif current_model:
382
+ model, fast_forward, config, tokenizer, eos_token_id = loaded_models[current_model]
383
+ else:
384
+ model_name = list(loaded_models.keys())[0]
385
+ model, fast_forward, config, tokenizer, eos_token_id = loaded_models[model_name]
386
+
387
+ # Encode with model's tokenizer
388
  input_ids = [i for i in tokenizer.encode(prompt).ids if i != eos_token_id]
389
 
390
  if len(input_ids) == 0:
 
463
 
464
  @app.get("/", response_class=HTMLResponse)
465
  async def status_page():
466
+ models_html = ""
467
+ for model_name in loaded_models.keys():
468
+ usage = worker_stats["model_usage"].get(model_name, 0)
469
+ _, _, _, tokenizer, _ = loaded_models[model_name]
470
+ vocab_size = tokenizer.get_vocab_size()
471
+ models_html += f'<li><strong>{model_name}</strong> - Vocab: {vocab_size} - Used: {usage}x</li>'
472
+
473
+ return f"""
474
  <!DOCTYPE html>
475
  <html>
476
  <head>
477
+ <title>SAM Worker v5.0 - Multi-Model</title>
478
  <style>
479
+ * {{ margin: 0; padding: 0; box-sizing: border-box; }}
480
+ body {{
481
  font-family: 'Courier New', monospace;
482
  background: linear-gradient(135deg, #1a1f3a 0%, #0a0e27 100%);
483
  color: #00bfff;
484
  padding: 20px;
485
  min-height: 100vh;
486
+ }}
487
+ .container {{ max-width: 1000px; margin: 0 auto; }}
488
+ .header {{
489
  text-align: center;
490
  padding: 30px;
491
  background: rgba(0, 191, 255, 0.1);
 
493
  border-radius: 10px;
494
  margin-bottom: 30px;
495
  box-shadow: 0 0 20px rgba(0, 191, 255, 0.3);
496
+ }}
497
+ .header h1 {{
498
  font-size: 2.5em;
499
  text-transform: uppercase;
500
  letter-spacing: 3px;
501
  animation: glow 2s ease-in-out infinite alternate;
502
+ }}
503
+ @keyframes glow {{
504
+ from {{ text-shadow: 0 0 10px #00bfff; }}
505
+ to {{ text-shadow: 0 0 20px #00bfff, 0 0 30px #00bfff; }}
506
+ }}
507
+ .badge {{
508
  display: inline-block;
509
  padding: 5px 15px;
510
  border-radius: 15px;
511
  font-size: 0.9em;
512
+ margin: 5px;
513
+ }}
514
+ .badge-v5 {{
515
  background: rgba(0, 255, 136, 0.2);
516
  border: 1px solid #00ff88;
517
  color: #00ff88;
518
+ }}
519
+ .badge-multi {{
520
  background: rgba(255, 165, 0, 0.2);
521
  border: 1px solid #ffa500;
522
  color: #ffa500;
523
+ }}
524
+ .stats-grid {{
525
  display: grid;
526
  grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
527
  gap: 20px;
528
  margin-bottom: 30px;
529
+ }}
530
+ .stat-card {{
531
  background: rgba(0, 191, 255, 0.05);
532
  border: 1px solid #00bfff;
533
  border-radius: 8px;
534
  padding: 20px;
535
  text-align: center;
536
+ }}
537
+ .stat-label {{ font-size: 0.8em; opacity: 0.7; text-transform: uppercase; margin-bottom: 10px; }}
538
+ .stat-value {{ font-size: 2em; font-weight: bold; }}
539
+ .features {{
 
540
  background: rgba(0, 191, 255, 0.05);
541
  border: 1px solid #00bfff;
542
  border-radius: 8px;
543
  padding: 20px;
544
+ margin-bottom: 20px;
545
+ }}
546
+ .features h3 {{ margin-bottom: 15px; }}
547
+ .feature-list {{ list-style: none; padding: 0; }}
548
+ .feature-list li {{
549
  padding: 10px;
550
  margin: 5px 0;
551
  background: rgba(0, 191, 255, 0.1);
552
  border-radius: 5px;
553
+ border-left: 3px solid #00ff88;
554
+ }}
555
+ .timestamp {{ text-align: center; margin-top: 20px; opacity: 0.5; }}
 
556
  </style>
557
  </head>
558
  <body>
559
  <div class="container">
560
  <div class="header">
561
  <h1>⚙️ WORKER NODE ⚙️</h1>
562
+ <div>SAM-Z-1 Distributed Worker v5.0</div>
563
+ <div>
564
+ <span class="badge badge-v5">V5 PROTOCOL</span>
565
+ <span class="badge badge-multi">{len(loaded_models)} MODELS</span>
566
+ </div>
567
  </div>
568
 
569
  <div class="stats-grid" id="stats">
 
585
  </div>
586
  </div>
587
 
588
+ <div class="features">
589
+ <h3>🤖 LOADED MODELS ({len(loaded_models)})</h3>
590
+ <ul class="feature-list">
591
+ {models_html}
592
+ </ul>
593
+ </div>
594
+
595
  <div class="features">
596
  <h3>πŸš€ CAPABILITIES</h3>
597
  <ul class="feature-list">
598
+ <li>✅ Original SAM-Z-1 (preserved)</li>
599
+ <li>✅ 4 new SAM-X-1 models</li>
600
+ <li>✅ Separate tokenizers per family</li>
601
+ <li>✅ Multi-model selection</li>
602
+ <li>✅ Token & batch decoding</li>
603
+ <li>✅ Streaming support</li>
604
+ <li>✅ Auto version detection</li>
605
  </ul>
606
  </div>
607
 
 
609
  </div>
610
 
611
  <script>
612
+ async function updateStats() {{
613
+ try {{
614
  const statsRes = await fetch('/stats');
615
  const stats = await statsRes.json();
616
 
 
622
  const h = Math.floor(uptime / 3600);
623
  const m = Math.floor((uptime % 3600) / 60);
624
  const s = uptime % 60;
625
+ document.getElementById('uptime').textContent = `${{h}}h ${{m}}m ${{s}}s`;
626
 
627
  document.getElementById('timestamp').textContent =
628
+ `Last update: ${{new Date().toLocaleTimeString()}}`;
629
+ }} catch (e) {{
630
  console.error('Failed to update stats:', e);
631
+ }}
632
+ }}
633
 
 
634
  setInterval(updateStats, 1000);
635
  updateStats();
636
  </script>
 
645
  @app.get("/health")
646
  async def health():
647
  return {
648
+ "status": "healthy" if loaded_models else "loading",
649
+ "model_loaded": len(loaded_models) > 0,
650
+ "models_count": len(loaded_models)
651
+ }
652
+
653
+ @app.get("/info")
654
+ async def worker_info():
655
+ """Worker information for version detection"""
656
+ return {
657
+ "version": "v5",
658
+ "models": list(loaded_models.keys()),
659
+ "features": [
660
+ "multi_model",
661
+ "model_selection",
662
+ "separate_tokenizers",
663
+ "token_generation",
664
+ "batch_decoding",
665
+ "streaming"
666
+ ],
667
+ "model_families": {
668
+ "sam-z": [m for m, info in MODEL_REGISTRY.items() if info["family"] == "sam-z"],
669
+ "sam-x": [m for m, info in MODEL_REGISTRY.items() if info["family"] == "sam-x"]
670
+ }
671
+ }
672
+
673
+ @app.get("/models")
674
+ async def list_models():
675
+ """List available models"""
676
+ return {
677
+ "models": list(loaded_models.keys()),
678
+ "default": current_model,
679
+ "count": len(loaded_models)
680
  }
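The /info route above is what enables the "auto version detection" mentioned in the module docstring. A hedged sketch of how a head node might use it, falling back gracefully for v4 workers that lack the route (the URL and the fallback shape are assumptions, not part of this commit):

import requests

def detect_worker(base_url: str) -> dict:
    """Return the /info payload for a v5 worker, or a v4-style stub if the route is missing."""
    try:
        r = requests.get(f"{base_url}/info", timeout=5)
        if r.ok:
            return r.json()  # v5 worker: {"version": "v5", "models": [...], "features": [...]}
    except requests.RequestException:
        pass
    # Older v4 workers have no /info route; treat them as single-model nodes.
    return {"version": "v4", "models": [], "features": []}

info = detect_worker("http://localhost:8000")
print(info["version"], info.get("models"))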
681
 
682
  @app.get("/stats")
 
687
  "total_tokens": worker_stats["total_tokens"],
688
  "decode_requests": worker_stats["decode_requests"],
689
  "uptime": uptime,
690
+ "tokens_per_second": worker_stats["total_tokens"] / uptime if uptime > 0 else 0,
691
+ "model_usage": worker_stats["model_usage"]
692
  }
693
 
694
  @app.post("/decode")
695
  async def decode(request: DecodeRequest):
696
+ """Fast single decode - uses correct tokenizer"""
 
697
  try:
698
  worker_stats["decode_requests"] += 1
699
+ tokenizer, _ = get_tokenizer_for_model(request.model)
700
  text = tokenizer.decode(request.token_ids)
701
  return {"text": text}
702
  except Exception as e:
 
704
 
705
  @app.post("/decode/batch")
706
  async def batch_decode(request: BatchDecodeRequest):
707
+ """Optimized batch decoding - uses correct tokenizer"""
 
708
  try:
709
  worker_stats["decode_requests"] += len(request.batches)
710
+ tokenizer, _ = get_tokenizer_for_model(request.model)
711
  results = [tokenizer.decode(batch) for batch in request.batches]
712
  return {"texts": results}
713
  except Exception as e:
 
715
 
716
  @app.post("/generate")
717
  async def generate(request: GenerateRequest):
718
+ """Generate text with model selection"""
719
+ if not loaded_models:
720
+ raise HTTPException(status_code=503, detail="No models loaded")
721
+
722
+ # Track model usage
723
+ model_name = request.model or current_model
724
+ if model_name not in worker_stats["model_usage"]:
725
+ worker_stats["model_usage"][model_name] = 0
726
+ worker_stats["model_usage"][model_name] += 1
727
 
728
  worker_stats["total_requests"] += 1
729
  start_time = time.time()
 
741
  top_k=request.top_k,
742
  top_p=request.top_p,
743
  repetition_penalty=request.repetition_penalty,
744
+ return_token_ids=request.return_token_ids,
745
+ model_name=request.model
746
  ):
747
  token_count += 1
748
  worker_stats["total_tokens"] += 1
 
756
  await asyncio.sleep(0.001)
757
 
758
  elapsed = time.time() - start_time
759
+ yield f"data: {json.dumps({'done': True, 'tokens': token_count, 'time': elapsed, 'model': model_name})}\n\n"
760
 
761
  except Exception as e:
762
  yield f"data: {json.dumps({'error': str(e)})}\n\n"
 
775
  top_k=request.top_k,
776
  top_p=request.top_p,
777
  repetition_penalty=request.repetition_penalty,
778
+ return_token_ids=request.return_token_ids,
779
+ model_name=request.model
780
  ):
781
  if not request.return_token_ids:
782
  generated_text += token_text
 
789
  "text": generated_text,
790
  "tokens": token_count,
791
  "time": elapsed,
792
+ "tokens_per_second": token_count / elapsed if elapsed > 0 else 0,
793
+ "model": model_name
794
  }
795
 
796
  except Exception as e:
 
798
 
799
  @app.post("/chat")
800
  async def chat(request: ChatRequest):
801
+ """Chat completion with model selection"""
802
+ if not loaded_models:
803
+ raise HTTPException(status_code=503, detail="No models loaded")
804
+
805
+ # Track model usage
806
+ model_name = request.model or current_model
807
+ if model_name not in worker_stats["model_usage"]:
808
+ worker_stats["model_usage"][model_name] = 0
809
+ worker_stats["model_usage"][model_name] += 1
810
 
811
  worker_stats["total_requests"] += 1
812
  prompt = format_chat_prompt(request.messages)
 
825
  top_k=request.top_k,
826
  top_p=request.top_p,
827
  repetition_penalty=request.repetition_penalty,
828
+ return_token_ids=request.return_token_ids,
829
+ model_name=request.model
830
  ):
831
  token_count += 1
832
  worker_stats["total_tokens"] += 1
 
845
  await asyncio.sleep(0.001)
846
 
847
  elapsed = time.time() - start_time
848
+ yield f"data: {json.dumps({'done': True, 'tokens': token_count, 'time': elapsed, 'model': model_name})}\n\n"
849
 
850
  except Exception as e:
851
  yield f"data: {json.dumps({'error': str(e)})}\n\n"
 
864
  top_k=request.top_k,
865
  top_p=request.top_p,
866
  repetition_penalty=request.repetition_penalty,
867
+ return_token_ids=request.return_token_ids,
868
+ model_name=request.model
869
  ):
870
  if not request.return_token_ids:
871
  generated_text += token_text
 
886
  },
887
  "tokens": token_count,
888
  "time": elapsed,
889
+ "tokens_per_second": token_count / elapsed if elapsed > 0 else 0,
890
+ "model": model_name
891
  }
892
 
893
  except Exception as e:
 
897
  # Model Loading
898
  # ============================================================================
899
 
900
+ async def load_single_model(model_name: str, model_info: dict) -> bool:
901
+ """Load a single model with its tokenizer"""
902
+ global loaded_models, current_model
903
 
904
  try:
905
+ print(f"\n⏳ Loading: {model_name} ({model_info['family']} family)")
906
+ print(f" Repo: {model_info['repo']}")
907
+ print(f" Weights: {model_info['weights']}")
908
 
909
+ # Load tokenizer for this family
910
+ tokenizer, eos_token_id = await load_tokenizer(
911
+ model_info['family'],
912
+ model_info['tokenizer_repo']
913
+ )
914
 
915
+ # Load config
916
+ if model_info['config']:
917
+ print(f" Config: {model_info['config']}")
918
+ config_path = hf_hub_download(
919
+ repo_id=model_info['repo'],
920
+ filename=model_info['config'],
921
+ cache_dir=CACHE_DIR
922
+ )
923
+ with open(config_path, 'r') as f:
924
+ config_raw = json.load(f)
925
+ else:
926
+ # Load base config for Large model
927
+ print(f" Loading base config from tokenizer repo...")
928
+ config_path = hf_hub_download(
929
+ repo_id=model_info['tokenizer_repo'],
930
+ filename="config.json",
931
+ cache_dir=CACHE_DIR
932
+ )
933
+ with open(config_path, 'r') as f:
934
+ config_raw = json.load(f)
935
 
936
+ # Convert to model format
937
+ model_config = {
938
+ 'vocab_size': config_raw['vocab_size'],
939
+ 'd_model': config_raw['hidden_size'],
940
+ 'n_heads': config_raw['num_attention_heads'],
941
+ 'ff_mult': config_raw['intermediate_size'] / config_raw['hidden_size'],
942
+ 'dropout': config_raw.get('dropout', 0.0),
943
+ 'max_len': config_raw['max_position_embeddings'],
944
+ 'rope_theta': config_raw['rope_theta'],
945
+ 'n_layers': config_raw['num_hidden_layers']
946
+ }
947
 
948
+ # Add for config object
949
+ model_config['max_position_embeddings'] = config_raw['max_position_embeddings']
950
 
951
+ print(f" πŸ“ Architecture: {model_config['n_layers']} layers, {model_config['n_heads']} heads")
952
 
953
+ # Load weights
954
+ weights_path = hf_hub_download(
955
+ repo_id=model_info['repo'],
956
+ filename=model_info['weights'],
957
+ cache_dir=CACHE_DIR
958
+ )
959
 
960
+ # Build model
961
+ model = SAM1Model(**model_config)
962
+ dummy_input = tf.zeros((1, 1), dtype=tf.int32)
963
+ model(dummy_input)
964
+ model.load_weights(weights_path)
965
+ model.trainable = False
966
 
967
+ # Create optimized forward pass
968
+ @tf.function(
969
+ input_signature=[tf.TensorSpec(shape=[1, None], dtype=tf.int32)],
970
+ jit_compile=True,
971
+ reduce_retracing=True
972
+ )
973
+ def fast_predict(inputs):
974
+ return model(inputs, training=False)
975
 
976
+ # Warm up
977
+ print(f" πŸ”₯ Warming up...")
978
+ dummy = tf.constant([[1, 2, 3]], dtype=tf.int32)
979
+ _ = fast_predict(dummy)
980
 
981
+ # Store model with its tokenizer
982
+ loaded_models[model_name] = (model, fast_predict, model_config, tokenizer, eos_token_id)
983
+
984
+ # Set as default if first
985
+ if current_model is None:
986
+ current_model = model_name
987
 
988
+ # Count parameters
989
+ total_params = sum(np.prod(w.shape) for w in model.weights)
990
+ if total_params >= 1e9:
991
+ param_str = f"{total_params/1e9:.2f}B"
992
+ elif total_params >= 1e6:
993
+ param_str = f"{total_params/1e6:.2f}M"
994
  else:
995
+ param_str = f"{total_params/1e3:.2f}K"
996
+
997
+ print(f" βœ… Loaded successfully!")
998
+ print(f" πŸ“Š Parameters: {param_str}")
999
+ print(f" πŸ”€ Tokenizer vocab: {tokenizer.get_vocab_size()}")
1000
+
1001
+ return True
1002
+
1003
+ except Exception as e:
1004
+ print(f" ⚠️ Failed to load {model_name}: {e}")
1005
+ import traceback
1006
+ traceback.print_exc()
1007
+ return False
1008
+
1009
+ @app.on_event("startup")
1010
+ async def load_models():
1011
+ global loaded_models, current_model
1012
+
1013
+ print("="*80)
1014
+ print("πŸš€ SAM-Z-1 Worker Node v5.0 - Multi-Model with Separate Tokenizers".center(80))
1015
+ print("="*80)
1016
+
1017
+ try:
1018
+ # Load all models
1019
+ print("\n" + "="*80)
1020
+ print("πŸ“¦ LOADING ALL 5 MODELS".center(80))
1021
+ print("="*80)
1022
+
1023
+ loaded_count = 0
1024
+ for model_name, model_info in MODEL_REGISTRY.items():
1025
+ success = await load_single_model(model_name, model_info)
1026
+ if success:
1027
+ loaded_count += 1
1028
+
1029
+ if loaded_count == 0:
1030
+ raise RuntimeError("❌ No models loaded successfully!")
1031
 
1032
+ print(f"\n{'='*80}")
1033
+ print(f"βœ… Successfully loaded {loaded_count}/{len(MODEL_REGISTRY)} models")
1034
+ print(f"πŸ“Œ Default model: {current_model}")
1035
 
1036
+ # Show tokenizer families
1037
+ print(f"\nπŸ”€ Tokenizer Families:")
1038
+ print(f" SAM-Z family: {len([m for m, i in MODEL_REGISTRY.items() if i['family'] == 'sam-z'])} model(s)")
1039
+ print(f" SAM-X family: {len([m for m, i in MODEL_REGISTRY.items() if i['family'] == 'sam-x'])} model(s)")
1040
 
1041
+ print(f"\nπŸš€ Worker ready for inference!")
1042
+ print(f"{'='*80}\n")
1043
 
1044
  except Exception as e:
1045
+ print(f"\n❌ Failed to initialize worker: {e}")
1046
  import traceback
1047
  traceback.print_exc()
1048
  raise
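For completeness, an assumed way to launch this worker locally and smoke-test the multi-model endpoints (the module name "app", the port, and the `uvicorn` dependency are assumptions, not part of the commit):

import uvicorn  # pip install fastapi uvicorn tensorflow tokenizers transformers huggingface_hub

if __name__ == "__main__":
    # The server does not accept requests until the startup hook has finished loading
    # every MODEL_REGISTRY entry, so expect a slow first boot while weights download.
    uvicorn.run("app:app", host="0.0.0.0", port=8000)

# Quick checks once it is up (from another shell):
#   curl http://localhost:8000/health   -> {"status": "healthy", "model_loaded": true, ...}
#   curl http://localhost:8000/models   -> lists SAM-Z-1 plus whichever SAM-X-1 variants loaded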