""" SuperCoder - Unified Application All-in-one file containing Gradio UI, API server, tunnel support, and AI logic. """ import os import sys import time import uuid import argparse import subprocess import traceback import requests import json from pathlib import Path from typing import Optional, List, Dict, Any, Generator, Tuple from collections import defaultdict from functools import partial from multiprocessing import Process import gradio as gr from fastapi import FastAPI, HTTPException from fastapi.middleware.cors import CORSMiddleware from pydantic import BaseModel import uvicorn # Import config (only external dependency) from config import * # ============================================================================ # SERVER MANAGER - llama.cpp server lifecycle # ============================================================================ _server_process = None _server_info = {} def check_server_health() -> bool: try: # Check if Ollama is responding response = requests.get(f"{LLAMA_SERVER_URL}/api/tags", timeout=2) return response.status_code == 200 and len(response.json().get("models", [])) > 0 except: return False def start_llama_server() -> bool: global _server_process, _server_info if _server_process and check_server_health(): return True print(f"\nšŸš€ Starting llama.cpp server on {LLAMA_SERVER_URL}") try: cmd = [ LLAMA_SERVER_PATH, "-hf", LLAMA_MODEL, "-c", str(MODEL_CONTEXT_WINDOW), "-t", str(MODEL_THREADS), "-ngl", str(MODEL_GPU_LAYERS), "--host", LLAMA_SERVER_HOST, "--port", str(LLAMA_SERVER_PORT) ] _server_process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) _server_info = {'pid': _server_process.pid, 'url': LLAMA_SERVER_URL} # Wait for ready for _ in range(SERVER_STARTUP_TIMEOUT * 2): if check_server_health(): print(f"āœ… Server ready (PID: {_server_process.pid})") return True time.sleep(0.5) return False except Exception as e: print(f"āŒ Server start failed: {e}") return False def stop_llama_server(): global _server_process if _server_process: _server_process.terminate() _server_process.wait() _server_process = None def get_llm(): return True if check_server_health() else None def get_model_info(): return _server_info.copy() # ============================================================================ # SESSION MANAGER - Chat history # ============================================================================ SESSION_STORE = {} SESSION_METADATA = defaultdict(dict) def get_session_id(request: gr.Request) -> str: return request.session_hash def get_history(session_id: str, create_if_missing: bool = False) -> List[Dict]: if session_id not in SESSION_STORE and create_if_missing: SESSION_STORE[session_id] = [] return SESSION_STORE.get(session_id, []) def add_to_history(session_id: str, role: str, text: str): history = get_history(session_id, create_if_missing=True) history.append({"role": role, "text": text, "timestamp": time.time()}) def clear_history(session_id: str): if session_id in SESSION_STORE: SESSION_STORE[session_id] = [] def convert_history_to_gradio_messages(history: List[Dict]) -> List[Dict]: return [{"role": msg["role"], "content": msg["text"]} for msg in history] def calculate_safe_max_tokens(history: List[Dict], requested: int, max_context: int) -> int: history_chars = sum(len(msg["text"]) for msg in history) estimated_tokens = history_chars // 4 available = max_context - estimated_tokens - SYSTEM_OVERHEAD_TOKENS return max(min(requested, available, SAFE_MAX_TOKENS_CAP), MIN_TOKENS) def get_recent_history(session_id: str, max_messages: int = 10) -> List[Dict]: history = get_history(session_id) return history[-max_messages:] if len(history) > max_messages else history def update_session_activity(session_id: str): SESSION_METADATA[session_id]['last_activity'] = time.time() # ============================================================================ # GENERATION - AI response generation # ============================================================================ def generate_response_stream(session_id: str, user_message: str, max_tokens: int, temperature: float, stream: bool = True) -> Generator[str, None, None]: if not get_llm(): yield "āš ļø Server not running" return update_session_activity(session_id) recent_history = get_recent_history(session_id, max_messages=6) safe_tokens = calculate_safe_max_tokens(recent_history, max_tokens, MODEL_CONTEXT_WINDOW) messages = [{"role": "system", "content": SYSTEM_PROMPT}] for msg in recent_history: messages.append({"role": msg["role"], "content": msg["text"]}) messages.append({"role": "user", "content": user_message}) try: payload = { "messages": messages, "max_tokens": safe_tokens, "temperature": max(0.01, temperature), "top_p": DEFAULT_TOP_P, "stream": stream } if stream: response = requests.post(f"{LLAMA_SERVER_URL}/v1/chat/completions", json=payload, stream=True, timeout=300) full_response = "" for line in response.iter_lines(): if line: line_text = line.decode('utf-8') if line_text.startswith('data: '): line_text = line_text[6:] if line_text.strip() == '[DONE]': break try: chunk = json.loads(line_text) content = chunk.get("choices", [{}])[0].get("delta", {}).get("content", "") if content: full_response += content yield full_response.strip() except: continue else: # Use Ollama API format instead of OpenAI format ollama_payload = { "model": LLAMA_MODEL, "messages": messages, "stream": False } response = requests.post(f"{LLAMA_SERVER_URL}/api/chat", json=ollama_payload, timeout=300) yield response.json()["message"]["content"].strip() except Exception as e: yield f"āš ļø Error: {str(e)}" # ============================================================================ # GRADIO UI COMPONENTS # ============================================================================ def create_gradio_interface(error_msg: Optional[str] = None): with gr.Blocks(title=APP_TITLE, theme=gr.themes.Soft(primary_hue=PRIMARY_HUE)) as demo: gr.Markdown(f"# šŸ¤– {APP_TITLE}\n### {APP_DESCRIPTION}\n---") if error_msg: gr.Markdown(f"āš ļø {error_msg}") with gr.Row(): with gr.Column(scale=3): chatbot = gr.Chatbot(label="šŸ’¬ Conversation", height=CHAT_HEIGHT, type="messages", show_copy_button=True) with gr.Row(): txt_input = gr.Textbox(placeholder="Ask me about code...", show_label=False, scale=5, lines=2) send_btn = gr.Button("Send šŸš€", scale=1, variant="primary") with gr.Column(scale=1): gr.Markdown("### āš™ļø Settings") temp_slider = gr.Slider(0.0, 1.0, value=DEFAULT_TEMPERATURE, step=0.05, label="šŸŒ”ļø Temperature") tokens_slider = gr.Slider(MIN_TOKENS, SAFE_MAX_TOKENS_CAP, value=DEFAULT_MAX_TOKENS, step=128, label="šŸ“ Max Tokens") stream_checkbox = gr.Checkbox(label="⚔ Stream", value=True) clear_btn = gr.Button("šŸ—‘ļø Clear", variant="stop", size="sm") session_state = gr.State() # Event handlers def handle_message(session_id, msg, temp, tokens, stream, request: gr.Request): session_id = session_id or get_session_id(request) if not msg.strip(): return session_id, convert_history_to_gradio_messages(get_history(session_id)), "" add_to_history(session_id, "user", msg) yield session_id, convert_history_to_gradio_messages(get_history(session_id)), "" full_response = "" for partial in generate_response_stream(session_id, msg, tokens, temp, stream): full_response = partial temp_hist = get_history(session_id).copy() temp_hist.append({"role": "assistant", "text": full_response}) yield session_id, convert_history_to_gradio_messages(temp_hist), "" add_to_history(session_id, "assistant", full_response) yield session_id, convert_history_to_gradio_messages(get_history(session_id)), "" def handle_clear(session_id, request: gr.Request): session_id = session_id or get_session_id(request) clear_history(session_id) return session_id, [], "" txt_input.submit(handle_message, [session_state, txt_input, temp_slider, tokens_slider, stream_checkbox], [session_state, chatbot, txt_input]) send_btn.click(handle_message, [session_state, txt_input, temp_slider, tokens_slider, stream_checkbox], [session_state, chatbot, txt_input]) clear_btn.click(handle_clear, [session_state], [session_state, chatbot, txt_input]) return demo # ============================================================================ # FASTAPI SERVER # ============================================================================ api_app = FastAPI(title="SuperCoder API") api_app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"]) api_sessions = {} class ChatMessage(BaseModel): role: str content: str class ChatRequest(BaseModel): messages: List[ChatMessage] temperature: Optional[float] = 0.1 max_tokens: Optional[int] = 512 class ChatResponse(BaseModel): response: str session_id: str @api_app.get("/health") async def health(): return {"status": "ok" if get_llm() else "model_not_loaded"} @api_app.post("/api/chat", response_model=ChatResponse) async def chat(request: ChatRequest): if not get_llm(): raise HTTPException(503, "Model not loaded") session_id = str(uuid.uuid4()) api_sessions[session_id] = [] user_message = request.messages[-1].content api_sessions[session_id].append({"role": "user", "text": user_message}) full_response = "" for partial in generate_response_stream(session_id, user_message, request.max_tokens, request.temperature, False): full_response = partial api_sessions[session_id].append({"role": "assistant", "text": full_response}) return ChatResponse(response=full_response, session_id=session_id) def run_api_server(): uvicorn.run(api_app, host="0.0.0.0", port=8000, log_level="info") # ============================================================================ # TUNNEL SUPPORT # ============================================================================ def start_ngrok_tunnel(port: int = 8000) -> Optional[str]: try: subprocess.run(["which", "ngrok"], capture_output=True, check=True) subprocess.Popen(["ngrok", "http", str(port)], stdout=subprocess.PIPE) time.sleep(3) response = requests.get("http://127.0.0.1:4040/api/tunnels", timeout=5) tunnels = response.json() if tunnels.get("tunnels"): url = tunnels["tunnels"][0]["public_url"] print(f"āœ… Tunnel: {url}") return url except: print("āŒ ngrok not found. Install: brew install ngrok") return None def start_cloudflare_tunnel(port: int = 8000) -> Optional[str]: try: subprocess.run(["which", "cloudflared"], capture_output=True, check=True) proc = subprocess.Popen(["cloudflared", "tunnel", "--url", f"http://localhost:{port}"], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) time.sleep(3) for _ in range(30): line = proc.stdout.readline() if "trycloudflare.com" in line: import re urls = re.findall(r'https://[^\s]+\.trycloudflare\.com', line) if urls: print(f"āœ… Tunnel: {urls[0]}") return urls[0] time.sleep(1) except: print("āŒ cloudflared not found. Install: brew install cloudflared") return None # ============================================================================ # MAIN LAUNCHER # ============================================================================ def main(): parser = argparse.ArgumentParser(description="SuperCoder - All-in-One AI Coding Assistant") parser.add_argument("--mode", choices=["gradio", "api", "both"], default="gradio", help="Run mode: gradio (UI), api (server), or both") parser.add_argument("--tunnel", choices=["ngrok", "cloudflare"], help="Start tunnel for public access") parser.add_argument("--no-server", action="store_true", help="Don't start llama.cpp server (assume already running)") args = parser.parse_args() print("╔════════════════════════════════════════════════╗") print("ā•‘ SuperCoder - Unified Launcher ā•‘") print("ā•šā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•") # Start llama.cpp server if not args.no_server: success = start_llama_server() error_msg = None if success else "Failed to start llama.cpp server" else: error_msg = None # Run selected mode if args.mode == "gradio": print(f"\nšŸ“Œ Mode: Gradio UI\n🌐 Access: http://localhost:{SERVER_PORT}\n") demo = create_gradio_interface(error_msg) demo.launch(server_name=SERVER_NAME, server_port=SERVER_PORT) elif args.mode == "api": print(f"\nšŸ“Œ Mode: API Server\nšŸ“” API: http://localhost:8000/api/chat\n") if args.tunnel: api_proc = Process(target=run_api_server) api_proc.start() time.sleep(3) if args.tunnel == "ngrok": start_ngrok_tunnel(8000) else: start_cloudflare_tunnel(8000) try: api_proc.join() except KeyboardInterrupt: api_proc.terminate() else: run_api_server() elif args.mode == "both": print(f"\nšŸ“Œ Mode: Both Gradio + API\nšŸŽØ UI: http://localhost:{SERVER_PORT}\nšŸ“” API: http://localhost:8000\n") gradio_proc = Process(target=lambda: create_gradio_interface(error_msg).launch( server_name=SERVER_NAME, server_port=SERVER_PORT)) api_proc = Process(target=run_api_server) gradio_proc.start() api_proc.start() if args.tunnel: time.sleep(3) if args.tunnel == "ngrok": start_ngrok_tunnel(8000) else: start_cloudflare_tunnel(8000) try: gradio_proc.join() api_proc.join() except KeyboardInterrupt: gradio_proc.terminate() api_proc.terminate() if __name__ == "__main__": try: main() except KeyboardInterrupt: print("\nšŸ‘‹ Shutting down...") stop_llama_server()