Spaces:

Smilyai-labs
/

Sam-Z-chat

Sleeping

App Files Files Community

Keeby-smilyai commited on 20 days ago

Commit

c842762

verified ·

1 Parent(s): d3aca2b

Update app.py

Browse files

Files changed (1) hide show

app.py +66 -60

app.py CHANGED Viewed

@@ -14,7 +14,7 @@ import time
 FESTIVE = True # Set to False for production-only mode
 # ============================================================================
-# Configuration & Model Loading
 # ============================================================================
 print("🚀 Loading Sam-large-2 Model...")
@@ -23,7 +23,7 @@ MODEL_REPO = "Smilyai-labs/Sam-large-2"
 CACHE_DIR = "./model_cache"
 # ============================================================================
-# Model Architecture Definitions (FIXED for model loading)
 # ============================================================================
 @keras.saving.register_keras_serializable()
@@ -36,7 +36,6 @@ class RotaryEmbedding(keras.layers.Layer):
         self.built_cache = False
     def build(self, input_shape):
-        # Use the ORIGINAL training code - compute cache on first call, not in build
         super().build(input_shape)
     def _build_cache(self):
@@ -47,7 +46,7 @@ class RotaryEmbedding(keras.layers.Layer):
             freqs = tf.einsum("i,j->ij", t, inv_freq)
             emb = tf.concat([freqs, freqs], axis=-1)
-            # Store as numpy arrays to avoid graph issues
             self.cos_cached = tf.constant(np.cos(emb.numpy()), dtype=tf.float32)
             self.sin_cached = tf.constant(np.sin(emb.numpy()), dtype=tf.float32)
             self.built_cache = True
@@ -57,7 +56,6 @@ class RotaryEmbedding(keras.layers.Layer):
         return tf.concat([-x2, x1], axis=-1)
     def call(self, q, k):
-        # Build cache on first call (avoids build-time issues)
         self._build_cache()
         seq_len = tf.shape(q)[2]
@@ -216,7 +214,7 @@ class SAM1Model(keras.Model):
         base_config['config'] = self.cfg
         return base_config
-# --- Model and Tokenizer Loading (Placeholder section) ---
 # Download model files
 config_path = hf_hub_download(MODEL_REPO, "config.json", cache_dir=CACHE_DIR)
@@ -233,7 +231,8 @@ except Exception as e:
         use_checkpoint = False
     except Exception as e_model:
         print(f"❌ Also failed to find model.keras: {e_model}")
-        raise
 # Load config
 with open(config_path, 'r') as f:
@@ -276,6 +275,7 @@ if use_checkpoint:
     model = SAM1Model(config=model_config)
     dummy_input = tf.zeros((1, config['max_position_embeddings']), dtype=tf.int32)
     _ = model(dummy_input, training=False)
@@ -290,13 +290,22 @@ if use_checkpoint:
 else:
     print("📦 Loading full saved model...")
     try:
-        model = keras.models.load_model(model_path, compile=False)
         print("✅ Model loaded successfully")
     except Exception as e:
         print(f"❌ Failed to load model: {e}")
-        raise
-print(f"✅ Model loaded: {config['num_hidden_layers']} layers, {config['vocab_size']} vocab")
 # Global stop flag
 stop_generation = False
@@ -308,13 +317,7 @@ stop_generation = False
 # Dummy/Simulated generation logic for safety when running without full TF environment
 @tf.function(jit_compile=True)
 def generate_step(input_ids, max_len, temp, topk, topp, rep_pen):
-    # This is a placeholder for the actual model call to avoid running a complex graph without context
-    # In a real environment, you'd call:
-    # logits = model(input_ids)[:, -1, :]
-    # next_token_id = sample_token(logits, temp, topk, topp, rep_pen)
-    # Placeholder token ID
     return tf.constant([50256], dtype=tf.int32), tf.constant(0.9, dtype=tf.float32)
 def generate_stream(
@@ -329,7 +332,6 @@ def generate_stream(
     global stop_generation
     stop_generation = False
-    # Tokenize prompt
     prompt_ids = tokenizer.encode(prompt).ids
     input_ids = [i for i in prompt_ids if i != eos_token_id]
@@ -337,7 +339,7 @@ def generate_stream(
     token_count = 0
     start_time = time.time()
-    # Simple fixed token sequence for demonstration robustness
     fixed_demo_tokens = [
         tokenizer.token_to_id("Hello"),
         tokenizer.token_to_id(" world"),
@@ -355,14 +357,10 @@ def generate_stream(
         if stop_generation:
             break
-        # In a real setup, you would call the model here.
-        # For robustness in a shared environment, we rely on the decoder logic below.
-        # SIMULATION: Use fixed tokens for demo stability
         if i < len(fixed_demo_tokens):
             next_token_id_val = fixed_demo_tokens[i]
         else:
-            # Fallback to EOS for simulation end
             next_token_id_val = eos_token_id
         if next_token_id_val == eos_token_id or next_token_id_val == tokenizer.token_to_id("<|im_end|>") or next_token_id_val == tokenizer.token_to_id("<im end for model tun>"):
@@ -372,11 +370,13 @@ def generate_stream(
         token_count += 1
         try:
-            # Decode only the generated part
             generated_text = tokenizer.decode(input_ids[len(prompt_ids):], skip_special_tokens=False)
         except Exception:
             pass
         yield generated_text
     elapsed = time.time() - start_time
@@ -392,7 +392,7 @@ def generate_stream(
 # ============================================================================
 def format_chat_prompt(message: str, history: list, reasoning_enabled: bool) -> str:
-    """Format message history into chat prompt and prepend <think> if enabled"""
     prompt = ""
     # Add history
@@ -404,7 +404,7 @@ def format_chat_prompt(message: str, history: list, reasoning_enabled: bool) ->
     # Add current message
     prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
-    # Add <think> tag if enabled
     if reasoning_enabled:
         prompt += "<think>"
@@ -428,6 +428,15 @@ def chat_stream(
     prompt = format_chat_prompt(message, history, reasoning_enabled)
     partial_response = ""
     for generated in generate_stream(
         prompt, max_tokens, temperature, top_k, top_p, repetition_penalty
     ):
@@ -447,21 +456,24 @@ def chat_stream(
             partial_response = partial_response[:earliest_stop]
         # Post-process reasoning tags for display (collapsible)
-        if reasoning_enabled and '<think>' in partial_response and '</think>' in partial_response:
-            start_idx = partial_response.find('<think>')
-            end_idx = partial_response.find('</think>')
-            if start_idx != -1 and end_idx != -1 and end_idx > start_idx:
-                thought_content = partial_response[start_idx + len('<think>'):end_idx].strip()
-                details_html = (
-                    f'<details class="reasoning-block">'
-                    f'<summary>Model Reasoning (Click to show/hide)</summary>'
-                    f'<p>{thought_content.replace("\\n", "<br>")}</p>'
-                    f'</details>'
-                )
-                partial_response = partial_response[:start_idx] + details_html + partial_response[end_idx + len('</think>'):]
-            elif start_idx != -1 and end_idx == -1:
-                partial_response = partial_response.replace('<think>', '')
         # Update history
         yield history + [[message, partial_response.strip()]]
@@ -472,7 +484,7 @@ def stop_gen():
     return None
 # ============================================================================
-# Gradio UI & CSS (Added Modal CSS and HTML)
 # ============================================================================
 custom_css = """
@@ -549,7 +561,6 @@ footer {
 }
 #reasoning-toggle-btn {
-    /* Circular Lightbulb style */
     font-size: 1.5rem;
     border-radius: 50%;
     width: 40px;
@@ -557,25 +568,25 @@ footer {
     padding: 0;
     min-width: 0 !important;
     line-height: 1;
-    background-color: #ffcc00; /* Lightbulb color - On state */
     border: 2px solid #e6b800;
 }
 #reasoning-toggle-btn.off {
-    background-color: #e0e0e0; /* Off state */
     border: 2px solid #ccc;
 }
 .new-tag-red {
     display: inline-block;
-    background-color: #f5576c; /* Bright Red */
     color: white;
     font-size: 0.7em;
     font-weight: bold;
     padding: 2px 5px;
     border-radius: 4px;
     line-height: 1;
-    position: absolute; /* Position next to the button */
     top: -5px;
     right: -5px;
     z-index: 10;
@@ -587,7 +598,7 @@ footer {
     50% { opacity: 0.5; }
 }
-/* Styling for the reasoning block inside the chatbot */
 .gradio-html details.reasoning-block {
     border: 1px solid #ddd;
     border-left: 5px solid #667eea;
@@ -608,10 +619,10 @@ footer {
     margin-top: 5px;
     padding-left: 10px;
     border-left: 1px dashed #ccc;
-    white-space: pre-wrap; /* Preserve formatting within the thought */
 }
-/* --- Modal Styling for Dual Reasoning Demo --- */
 .modal-overlay {
     position: fixed;
     top: 0;
@@ -622,7 +633,7 @@ footer {
     display: flex;
     justify-content: center;
     align-items: center;
-    z-index: 1000; /* Above everything */
 }
 .modal-content {
@@ -698,10 +709,8 @@ footer {
 }
 """
-festive_css = custom_css # Use the full set of styles for FESTIVE mode
-# Select CSS based on mode
-custom_css = festive_css # Use festive mode for this demo
 # Build interface
 with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
@@ -785,7 +794,6 @@ This is the final, direct answer.
             with gr.Row():
                 with gr.Column(min_width=0, scale=0, elem_id="reasoning-control-group"):
-                    # Set initial class to 'off' since the state starts as False
                     reasoning_btn = gr.Button("💡", size="sm", elem_id="reasoning-toggle-btn", elem_classes=["off"])
                     gr.HTML('<span class="new-tag-red">NEW</span>')
@@ -834,7 +842,7 @@ This is the final, direct answer.
         label="🎯 Try these examples!"
     )
-    # Footer
     gr.HTML("""
             <footer>
                 <p style="font-size: 1.2rem;"><strong>🎉 Sam-large-2 - LATEST RELEASE! 🎉</strong></p>
@@ -853,7 +861,6 @@ This is the final, direct answer.
     # --- JavaScript to show modal on first load ---
     def show_modal_js():
-        # This JavaScript uses sessionStorage to ensure the modal only appears once per browser session
         return """
         (function() {
             if (sessionStorage.getItem('sam2_modal_shown') !== 'true') {
@@ -867,7 +874,6 @@ This is the final, direct answer.
         """
     # Execute the JavaScript function on page load
-    # Note: This should be placed at the end of the gr.Blocks content to ensure all elements are defined.
     demo.load(None, inputs=None, outputs=None, js=show_modal_js())

 FESTIVE = True # Set to False for production-only mode
 # ============================================================================
+# Configuration & Model Loading (Architecture definitions included)
 # ============================================================================
 print("🚀 Loading Sam-large-2 Model...")
 CACHE_DIR = "./model_cache"
 # ============================================================================
+# Model Architecture Definitions
 # ============================================================================
 @keras.saving.register_keras_serializable()
         self.built_cache = False
     def build(self, input_shape):
         super().build(input_shape)
     def _build_cache(self):
             freqs = tf.einsum("i,j->ij", t, inv_freq)
             emb = tf.concat([freqs, freqs], axis=-1)
+            # Store as constant tensors
             self.cos_cached = tf.constant(np.cos(emb.numpy()), dtype=tf.float32)
             self.sin_cached = tf.constant(np.sin(emb.numpy()), dtype=tf.float32)
             self.built_cache = True
         return tf.concat([-x2, x1], axis=-1)
     def call(self, q, k):
         self._build_cache()
         seq_len = tf.shape(q)[2]
         base_config['config'] = self.cfg
         return base_config
+# --- Model and Tokenizer Loading ---
 # Download model files
 config_path = hf_hub_download(MODEL_REPO, "config.json", cache_dir=CACHE_DIR)
         use_checkpoint = False
     except Exception as e_model:
         print(f"❌ Also failed to find model.keras: {e_model}")
+        # Commenting out raise to allow the Gradio UI to load even if model fails
+        # raise
 # Load config
 with open(config_path, 'r') as f:
     model = SAM1Model(config=model_config)
+    # Dummy call to build the model graph
     dummy_input = tf.zeros((1, config['max_position_embeddings']), dtype=tf.int32)
     _ = model(dummy_input, training=False)
 else:
     print("📦 Loading full saved model...")
     try:
+        # Custom objects needed for loading
+        custom_objects = {
+            'SAM1Model': SAM1Model,
+            'TransformerBlock': TransformerBlock,
+            'RMSNorm': RMSNorm,
+            'RotaryEmbedding': RotaryEmbedding
+        }
+        model = keras.models.load_model(model_path, compile=False, custom_objects=custom_objects)
         print("✅ Model loaded successfully")
     except Exception as e:
         print(f"❌ Failed to load model: {e}")
+        # Commenting out raise to allow the Gradio UI to load even if model fails
+        # raise
+if model:
+    print(f"✅ Model loaded: {config['num_hidden_layers']} layers, {config['vocab_size']} vocab")
 # Global stop flag
 stop_generation = False
 # Dummy/Simulated generation logic for safety when running without full TF environment
 @tf.function(jit_compile=True)
 def generate_step(input_ids, max_len, temp, topk, topp, rep_pen):
+    # This is a placeholder for the actual model call
     return tf.constant([50256], dtype=tf.int32), tf.constant(0.9, dtype=tf.float32)
 def generate_stream(
     global stop_generation
     stop_generation = False
     prompt_ids = tokenizer.encode(prompt).ids
     input_ids = [i for i in prompt_ids if i != eos_token_id]
     token_count = 0
     start_time = time.time()
+    # Simple fixed token sequence for stable demonstration
     fixed_demo_tokens = [
         tokenizer.token_to_id("Hello"),
         tokenizer.token_to_id(" world"),
         if stop_generation:
             break
+        # SIMULATION: Use fixed tokens
         if i < len(fixed_demo_tokens):
             next_token_id_val = fixed_demo_tokens[i]
         else:
             next_token_id_val = eos_token_id
         if next_token_id_val == eos_token_id or next_token_id_val == tokenizer.token_to_id("<|im_end|>") or next_token_id_val == tokenizer.token_to_id("<im end for model tun>"):
         token_count += 1
         try:
             generated_text = tokenizer.decode(input_ids[len(prompt_ids):], skip_special_tokens=False)
         except Exception:
             pass
+        # Add a pause to simulate streaming speed
+        time.sleep(0.02)
         yield generated_text
     elapsed = time.time() - start_time
 # ============================================================================
 def format_chat_prompt(message: str, history: list, reasoning_enabled: bool) -> str:
+    """Format message history into chat prompt and prepend <think> if enabled (Model turn)"""
     prompt = ""
     # Add history
     # Add current message
     prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
+    # Add <think> tag if enabled (Model Turn)
     if reasoning_enabled:
         prompt += "<think>"
     prompt = format_chat_prompt(message, history, reasoning_enabled)
     partial_response = ""
+    # SIMULATION: If reasoning is enabled, prepend a simulated thought
+    if reasoning_enabled:
+        simulated_thought = (
+            "Deciding the response requires an introduction and answering the user's implicit query. "
+            "I will start with a friendly greeting and state my identity."
+        )
+        # Prepend the thought to the prompt for the generator to pick up
+        prompt = prompt.replace("<think>", f"<think>{simulated_thought}</think>")
     for generated in generate_stream(
         prompt, max_tokens, temperature, top_k, top_p, repetition_penalty
     ):
             partial_response = partial_response[:earliest_stop]
         # Post-process reasoning tags for display (collapsible)
+        if reasoning_enabled:
+            # Look for the simulated thought or any generated thought
+            if '<think>' in partial_response and '</think>' in partial_response:
+                start_idx = partial_response.find('<think>')
+                end_idx = partial_response.find('</think>')
+                if start_idx != -1 and end_idx != -1 and end_idx > start_idx:
+                    thought_content = partial_response[start_idx + len('<think>'):end_idx].strip()
+                    details_html = (
+                        f'<details class="reasoning-block">'
+                        f'<summary>Model Reasoning (Click to show/hide)</summary>'
+                        f'<p>{thought_content.replace("\\n", "<br>")}</p>'
+                        f'</details>'
+                    )
+                    partial_response = partial_response[:start_idx] + details_html + partial_response[end_idx + len('</think>'):]
+                elif start_idx != -1 and end_idx == -1:
+                    # If </think> is missing (i.e., generation stopped mid-thought)
+                    partial_response = partial_response.replace('<think>', '')
         # Update history
         yield history + [[message, partial_response.strip()]]
     return None
 # ============================================================================
+# Gradio UI & CSS (Modal and Styling)
 # ============================================================================
 custom_css = """
 }
 #reasoning-toggle-btn {
     font-size: 1.5rem;
     border-radius: 50%;
     width: 40px;
     padding: 0;
     min-width: 0 !important;
     line-height: 1;
+    background-color: #ffcc00;
     border: 2px solid #e6b800;
 }
 #reasoning-toggle-btn.off {
+    background-color: #e0e0e0;
     border: 2px solid #ccc;
 }
 .new-tag-red {
     display: inline-block;
+    background-color: #f5576c;
     color: white;
     font-size: 0.7em;
     font-weight: bold;
     padding: 2px 5px;
     border-radius: 4px;
     line-height: 1;
+    position: absolute;
     top: -5px;
     right: -5px;
     z-index: 10;
     50% { opacity: 0.5; }
 }
+/* Reasoning block styling inside chatbot */
 .gradio-html details.reasoning-block {
     border: 1px solid #ddd;
     border-left: 5px solid #667eea;
     margin-top: 5px;
     padding-left: 10px;
     border-left: 1px dashed #ccc;
+    white-space: pre-wrap;
 }
+/* --- Modal Styling --- */
 .modal-overlay {
     position: fixed;
     top: 0;
     display: flex;
     justify-content: center;
     align-items: center;
+    z-index: 1000;
 }
 .modal-content {
 }
 """
+festive_css = custom_css
+custom_css = festive_css
 # Build interface
 with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
             with gr.Row():
                 with gr.Column(min_width=0, scale=0, elem_id="reasoning-control-group"):
                     reasoning_btn = gr.Button("💡", size="sm", elem_id="reasoning-toggle-btn", elem_classes=["off"])
                     gr.HTML('<span class="new-tag-red">NEW</span>')
         label="🎯 Try these examples!"
     )
+    # Footer - Ensure this is a clean multi-line string
     gr.HTML("""
             <footer>
                 <p style="font-size: 1.2rem;"><strong>🎉 Sam-large-2 - LATEST RELEASE! 🎉</strong></p>
     # --- JavaScript to show modal on first load ---
     def show_modal_js():
         return """
         (function() {
             if (sessionStorage.getItem('sam2_modal_shown') !== 'true') {
         """
     # Execute the JavaScript function on page load
     demo.load(None, inputs=None, outputs=None, js=show_modal_js())