|
|
""" |
|
|
SuperCoder - Unified Application |
|
|
All-in-one file containing Gradio UI, API server, tunnel support, and AI logic. |
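
Typical invocations (the file name "supercoder.py" is assumed for illustration):

    python supercoder.py --mode gradio                # local Gradio UI
    python supercoder.py --mode api --tunnel ngrok    # REST API behind a public tunnel
    python supercoder.py --mode both --no-server      # UI + API, reusing a running llama.cpp server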
|
|
""" |
|
|
import os |
|
|
import sys |
|
|
import time |
|
|
import uuid |
|
|
import argparse |
|
|
import subprocess |
|
|
import traceback |
|
|
import requests |
|
|
import json |
|
|
from pathlib import Path |
|
|
from typing import Optional, List, Dict, Any, Generator, Tuple |
|
|
from collections import defaultdict |
|
|
from functools import partial |
|
|
from multiprocessing import Process |
|
|
|
|
|
import gradio as gr |
|
|
from fastapi import FastAPI, HTTPException |
|
|
from fastapi.middleware.cors import CORSMiddleware |
|
|
from pydantic import BaseModel |
|
|
import uvicorn |
|
|
|
|
|
|
|
|
from config import * |
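# Configuration constants (LLAMA_SERVER_*, LLAMA_MODEL, MODEL_*, SERVER_*, APP_*,
# DEFAULT_*, SYSTEM_PROMPT, token limits, and UI settings) are defined in config.py.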
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_server_process = None |
|
|
_server_info = {} |
|
|
|
|
|
def check_server_health() -> bool: |
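    """Return True if the model server answers /api/tags and lists at least one model."""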
|
|
try: |
|
|
|
|
|
response = requests.get(f"{LLAMA_SERVER_URL}/api/tags", timeout=2) |
|
|
return response.status_code == 200 and len(response.json().get("models", [])) > 0 |
|
|
    except Exception:
        return False
|
|
|
|
|
def start_llama_server() -> bool: |
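    """Launch llama.cpp's server binary as a subprocess and poll until it is healthy.

    Returns True once the server responds, or False if the launch fails or the
    server does not come up within roughly SERVER_STARTUP_TIMEOUT seconds.
    """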
|
|
global _server_process, _server_info |
|
|
|
|
|
if _server_process and check_server_health(): |
|
|
return True |
|
|
|
|
|
print(f"\nπ Starting llama.cpp server on {LLAMA_SERVER_URL}") |
|
|
|
|
|
try: |
|
|
cmd = [ |
|
|
LLAMA_SERVER_PATH, "-hf", LLAMA_MODEL, |
|
|
"-c", str(MODEL_CONTEXT_WINDOW), |
|
|
"-t", str(MODEL_THREADS), |
|
|
"-ngl", str(MODEL_GPU_LAYERS), |
|
|
"--host", LLAMA_SERVER_HOST, "--port", str(LLAMA_SERVER_PORT) |
|
|
] |
|
|
|
|
|
_server_process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) |
|
|
_server_info = {'pid': _server_process.pid, 'url': LLAMA_SERVER_URL} |
|
|
|
|
|
|
|
|
for _ in range(SERVER_STARTUP_TIMEOUT * 2): |
|
|
if check_server_health(): |
|
|
print(f"β
Server ready (PID: {_server_process.pid})") |
|
|
return True |
|
|
time.sleep(0.5) |
|
|
|
|
|
return False |
|
|
except Exception as e: |
|
|
print(f"β Server start failed: {e}") |
|
|
return False |
|
|
|
|
|
def stop_llama_server(): |
|
|
global _server_process |
|
|
if _server_process: |
|
|
_server_process.terminate() |
|
|
_server_process.wait() |
|
|
_server_process = None |
|
|
|
|
|
def get_llm(): |
|
|
return True if check_server_health() else None |
|
|
|
|
|
def get_model_info(): |
|
|
return _server_info.copy() |
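

# --- In-memory session state for the Gradio UI ---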
|
|
|
|
|
|
|
|
|
|
|
|
|
|
SESSION_STORE = {} |
|
|
SESSION_METADATA = defaultdict(dict) |
|
|
|
|
|
def get_session_id(request: gr.Request) -> str: |
|
|
return request.session_hash |
|
|
|
|
|
def get_history(session_id: str, create_if_missing: bool = False) -> List[Dict]: |
|
|
if session_id not in SESSION_STORE and create_if_missing: |
|
|
SESSION_STORE[session_id] = [] |
|
|
return SESSION_STORE.get(session_id, []) |
|
|
|
|
|
def add_to_history(session_id: str, role: str, text: str): |
|
|
history = get_history(session_id, create_if_missing=True) |
|
|
history.append({"role": role, "text": text, "timestamp": time.time()}) |
|
|
|
|
|
def clear_history(session_id: str): |
|
|
if session_id in SESSION_STORE: |
|
|
SESSION_STORE[session_id] = [] |
|
|
|
|
|
def convert_history_to_gradio_messages(history: List[Dict]) -> List[Dict]: |
|
|
return [{"role": msg["role"], "content": msg["text"]} for msg in history] |
|
|
|
|
|
def calculate_safe_max_tokens(history: List[Dict], requested: int, max_context: int) -> int: |
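    """Clamp `requested` completion tokens to what still fits in the context window.

    History length is estimated at roughly four characters per token, and
    SYSTEM_OVERHEAD_TOKENS are reserved for the system prompt.
    """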
|
|
history_chars = sum(len(msg["text"]) for msg in history) |
|
|
estimated_tokens = history_chars // 4 |
|
|
available = max_context - estimated_tokens - SYSTEM_OVERHEAD_TOKENS |
|
|
return max(min(requested, available, SAFE_MAX_TOKENS_CAP), MIN_TOKENS) |
|
|
|
|
|
def get_recent_history(session_id: str, max_messages: int = 10) -> List[Dict]: |
|
|
history = get_history(session_id) |
|
|
return history[-max_messages:] if len(history) > max_messages else history |
|
|
|
|
|
def update_session_activity(session_id: str): |
|
|
SESSION_METADATA[session_id]['last_activity'] = time.time() |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def generate_response_stream(session_id: str, user_message: str, max_tokens: int, |
|
|
temperature: float, stream: bool = True) -> Generator[str, None, None]: |
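    """Yield the assistant reply for `user_message`, built from recent session history.

    With stream=True, partial completions from the OpenAI-compatible
    /v1/chat/completions endpoint are yielded as they arrive; with stream=False,
    a single Ollama-style /api/chat request is made and the full reply is yielded once.
    """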
|
|
if not get_llm(): |
|
|
yield "β οΈ Server not running" |
|
|
return |
|
|
|
|
|
update_session_activity(session_id) |
|
|
recent_history = get_recent_history(session_id, max_messages=6) |
|
|
safe_tokens = calculate_safe_max_tokens(recent_history, max_tokens, MODEL_CONTEXT_WINDOW) |
|
|
|
|
|
messages = [{"role": "system", "content": SYSTEM_PROMPT}] |
|
|
for msg in recent_history: |
|
|
messages.append({"role": msg["role"], "content": msg["text"]}) |
|
|
messages.append({"role": "user", "content": user_message}) |
|
|
|
|
|
try: |
|
|
payload = { |
|
|
"messages": messages, "max_tokens": safe_tokens, |
|
|
"temperature": max(0.01, temperature), |
|
|
"top_p": DEFAULT_TOP_P, "stream": stream |
|
|
} |
|
|
|
|
|
if stream: |
|
|
response = requests.post(f"{LLAMA_SERVER_URL}/v1/chat/completions", |
|
|
json=payload, stream=True, timeout=300) |
|
|
full_response = "" |
|
|
for line in response.iter_lines(): |
|
|
if line: |
|
|
line_text = line.decode('utf-8') |
|
|
if line_text.startswith('data: '): |
|
|
line_text = line_text[6:] |
|
|
if line_text.strip() == '[DONE]': |
|
|
break |
|
|
try: |
|
|
chunk = json.loads(line_text) |
|
|
content = chunk.get("choices", [{}])[0].get("delta", {}).get("content", "") |
|
|
if content: |
|
|
full_response += content |
|
|
yield full_response.strip() |
|
|
                        except Exception:
                            continue
|
|
else: |
|
|
|
|
|
ollama_payload = { |
|
|
"model": LLAMA_MODEL, |
|
|
"messages": messages, |
|
|
"stream": False |
|
|
} |
|
|
response = requests.post(f"{LLAMA_SERVER_URL}/api/chat", |
|
|
json=ollama_payload, timeout=300) |
|
|
yield response.json()["message"]["content"].strip() |
|
|
|
|
|
except Exception as e: |
|
|
yield f"β οΈ Error: {str(e)}" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_gradio_interface(error_msg: Optional[str] = None): |
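    """Build the Gradio Blocks UI: chat panel, generation controls, and session wiring."""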
|
|
with gr.Blocks(title=APP_TITLE, theme=gr.themes.Soft(primary_hue=PRIMARY_HUE)) as demo: |
|
|
gr.Markdown(f"# π€ {APP_TITLE}\n### {APP_DESCRIPTION}\n---") |
|
|
|
|
|
if error_msg: |
|
|
gr.Markdown(f"β οΈ {error_msg}") |
|
|
|
|
|
with gr.Row(): |
|
|
with gr.Column(scale=3): |
|
|
chatbot = gr.Chatbot(label="π¬ Conversation", height=CHAT_HEIGHT, |
|
|
type="messages", show_copy_button=True) |
|
|
with gr.Row(): |
|
|
txt_input = gr.Textbox(placeholder="Ask me about code...", |
|
|
show_label=False, scale=5, lines=2) |
|
|
send_btn = gr.Button("Send π", scale=1, variant="primary") |
|
|
|
|
|
with gr.Column(scale=1): |
|
|
gr.Markdown("### βοΈ Settings") |
|
|
temp_slider = gr.Slider(0.0, 1.0, value=DEFAULT_TEMPERATURE, step=0.05, |
|
|
label="π‘οΈ Temperature") |
|
|
tokens_slider = gr.Slider(MIN_TOKENS, SAFE_MAX_TOKENS_CAP, |
|
|
                                          value=DEFAULT_MAX_TOKENS, step=128, label="📏 Max Tokens")
|
|
stream_checkbox = gr.Checkbox(label="β‘ Stream", value=True) |
|
|
clear_btn = gr.Button("ποΈ Clear", variant="stop", size="sm") |
|
|
|
|
|
session_state = gr.State() |
|
|
|
|
|
|
|
|
def handle_message(session_id, msg, temp, tokens, stream, request: gr.Request): |
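            """Stream the assistant reply into the chatbot while keeping per-session history."""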
|
|
session_id = session_id or get_session_id(request) |
|
|
            if not msg.strip():
                yield session_id, convert_history_to_gradio_messages(get_history(session_id)), ""
                return
|
|
|
|
|
add_to_history(session_id, "user", msg) |
|
|
yield session_id, convert_history_to_gradio_messages(get_history(session_id)), "" |
|
|
|
|
|
full_response = "" |
|
|
for partial in generate_response_stream(session_id, msg, tokens, temp, stream): |
|
|
full_response = partial |
|
|
temp_hist = get_history(session_id).copy() |
|
|
temp_hist.append({"role": "assistant", "text": full_response}) |
|
|
yield session_id, convert_history_to_gradio_messages(temp_hist), "" |
|
|
|
|
|
add_to_history(session_id, "assistant", full_response) |
|
|
yield session_id, convert_history_to_gradio_messages(get_history(session_id)), "" |
|
|
|
|
|
def handle_clear(session_id, request: gr.Request): |
|
|
session_id = session_id or get_session_id(request) |
|
|
clear_history(session_id) |
|
|
return session_id, [], "" |
|
|
|
|
|
txt_input.submit(handle_message, [session_state, txt_input, temp_slider, tokens_slider, stream_checkbox], |
|
|
[session_state, chatbot, txt_input]) |
|
|
send_btn.click(handle_message, [session_state, txt_input, temp_slider, tokens_slider, stream_checkbox], |
|
|
[session_state, chatbot, txt_input]) |
|
|
clear_btn.click(handle_clear, [session_state], [session_state, chatbot, txt_input]) |
|
|
|
|
|
return demo |
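

# --- REST API (FastAPI) ---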
|
|
|
|
|
|
|
|
|
|
|
|
|
|
api_app = FastAPI(title="SuperCoder API") |
|
|
api_app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"]) |
|
|
|
|
|
api_sessions = {} |
|
|
|
|
|
class ChatMessage(BaseModel): |
|
|
role: str |
|
|
content: str |
|
|
|
|
|
class ChatRequest(BaseModel): |
|
|
messages: List[ChatMessage] |
|
|
temperature: Optional[float] = 0.1 |
|
|
max_tokens: Optional[int] = 512 |
|
|
|
|
|
class ChatResponse(BaseModel): |
|
|
response: str |
|
|
session_id: str |
|
|
|
|
|
@api_app.get("/health") |
|
|
async def health(): |
|
|
return {"status": "ok" if get_llm() else "model_not_loaded"} |
|
|
|
|
|
@api_app.post("/api/chat", response_model=ChatResponse) |
|
|
async def chat(request: ChatRequest): |
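    """Stateless chat endpoint: each request gets a fresh session and a full, non-streamed reply."""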
|
|
if not get_llm(): |
|
|
raise HTTPException(503, "Model not loaded") |
|
|
|
|
|
session_id = str(uuid.uuid4()) |
|
|
api_sessions[session_id] = [] |
|
|
|
|
|
user_message = request.messages[-1].content |
|
|
api_sessions[session_id].append({"role": "user", "text": user_message}) |
|
|
|
|
|
full_response = "" |
|
|
for partial in generate_response_stream(session_id, user_message, request.max_tokens, |
|
|
request.temperature, False): |
|
|
full_response = partial |
|
|
|
|
|
api_sessions[session_id].append({"role": "assistant", "text": full_response}) |
|
|
return ChatResponse(response=full_response, session_id=session_id) |
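

# Illustrative request against the endpoint above (assumes the API server is running
# locally on port 8000, as configured in run_api_server below):
#
#   curl -X POST http://localhost:8000/api/chat \
#        -H "Content-Type: application/json" \
#        -d '{"messages": [{"role": "user", "content": "Write a hello-world function in Python"}],
#            "temperature": 0.1, "max_tokens": 256}'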
|
|
|
|
|
def run_api_server(): |
|
|
uvicorn.run(api_app, host="0.0.0.0", port=8000, log_level="info") |
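

# --- Public tunnel helpers (ngrok / cloudflared) ---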
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def start_ngrok_tunnel(port: int = 8000) -> Optional[str]: |
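    """Start an ngrok HTTP tunnel to `port` and return its public URL, or None on failure."""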
|
|
try: |
|
|
subprocess.run(["which", "ngrok"], capture_output=True, check=True) |
|
|
subprocess.Popen(["ngrok", "http", str(port)], stdout=subprocess.PIPE) |
|
|
time.sleep(3) |
|
|
|
|
|
response = requests.get("http://127.0.0.1:4040/api/tunnels", timeout=5) |
|
|
tunnels = response.json() |
|
|
if tunnels.get("tunnels"): |
|
|
url = tunnels["tunnels"][0]["public_url"] |
|
|
print(f"β
Tunnel: {url}") |
|
|
return url |
|
|
    except Exception:
        print("❌ ngrok not found. Install: brew install ngrok")
|
|
return None |
|
|
|
|
|
def start_cloudflare_tunnel(port: int = 8000) -> Optional[str]: |
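    """Start a cloudflared quick tunnel to `port` and return its trycloudflare.com URL, or None."""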
|
|
try: |
|
|
subprocess.run(["which", "cloudflared"], capture_output=True, check=True) |
|
|
proc = subprocess.Popen(["cloudflared", "tunnel", "--url", f"http://localhost:{port}"], |
|
|
stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) |
|
|
time.sleep(3) |
|
|
|
|
|
for _ in range(30): |
|
|
line = proc.stdout.readline() |
|
|
if "trycloudflare.com" in line: |
|
|
import re |
|
|
urls = re.findall(r'https://[^\s]+\.trycloudflare\.com', line) |
|
|
if urls: |
|
|
print(f"β
Tunnel: {urls[0]}") |
|
|
return urls[0] |
|
|
time.sleep(1) |
|
|
    except Exception:
        print("❌ cloudflared not found. Install: brew install cloudflared")
|
|
return None |
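

def run_gradio_ui(error_msg: Optional[str] = None):
    """Module-level Gradio launcher; unlike a lambda, it is picklable as a Process target."""
    create_gradio_interface(error_msg).launch(server_name=SERVER_NAME, server_port=SERVER_PORT)


# --- CLI entry point ---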
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main(): |
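    """Parse CLI arguments, optionally start llama.cpp, and run the selected mode."""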
|
|
parser = argparse.ArgumentParser(description="SuperCoder - All-in-One AI Coding Assistant") |
|
|
parser.add_argument("--mode", choices=["gradio", "api", "both"], default="gradio", |
|
|
help="Run mode: gradio (UI), api (server), or both") |
|
|
parser.add_argument("--tunnel", choices=["ngrok", "cloudflare"], |
|
|
help="Start tunnel for public access") |
|
|
parser.add_argument("--no-server", action="store_true", |
|
|
help="Don't start llama.cpp server (assume already running)") |
|
|
|
|
|
args = parser.parse_args() |
|
|
|
|
|
print("ββββββββββββββββββββββββββββββββββββββββββββββββββ") |
|
|
print("β SuperCoder - Unified Launcher β") |
|
|
print("ββββββββββββββββββββββββββββββββββββββββββββββββββ") |
|
|
|
|
|
|
|
|
if not args.no_server: |
|
|
success = start_llama_server() |
|
|
error_msg = None if success else "Failed to start llama.cpp server" |
|
|
else: |
|
|
error_msg = None |
|
|
|
|
|
|
|
|
if args.mode == "gradio": |
|
|
print(f"\nπ Mode: Gradio UI\nπ Access: http://localhost:{SERVER_PORT}\n") |
|
|
demo = create_gradio_interface(error_msg) |
|
|
demo.launch(server_name=SERVER_NAME, server_port=SERVER_PORT) |
|
|
|
|
|
elif args.mode == "api": |
|
|
print(f"\nπ Mode: API Server\nπ‘ API: http://localhost:8000/api/chat\n") |
|
|
|
|
|
if args.tunnel: |
|
|
api_proc = Process(target=run_api_server) |
|
|
api_proc.start() |
|
|
time.sleep(3) |
|
|
|
|
|
if args.tunnel == "ngrok": |
|
|
start_ngrok_tunnel(8000) |
|
|
else: |
|
|
start_cloudflare_tunnel(8000) |
|
|
|
|
|
try: |
|
|
api_proc.join() |
|
|
except KeyboardInterrupt: |
|
|
api_proc.terminate() |
|
|
else: |
|
|
run_api_server() |
|
|
|
|
|
elif args.mode == "both": |
|
|
print(f"\nπ Mode: Both Gradio + API\nπ¨ UI: http://localhost:{SERVER_PORT}\nπ‘ API: http://localhost:8000\n") |
|
|
|
|
|
        # Use a module-level target so the child process can be started under the "spawn" method (macOS/Windows).
        gradio_proc = Process(target=run_gradio_ui, args=(error_msg,))
|
|
api_proc = Process(target=run_api_server) |
|
|
|
|
|
gradio_proc.start() |
|
|
api_proc.start() |
|
|
|
|
|
if args.tunnel: |
|
|
time.sleep(3) |
|
|
if args.tunnel == "ngrok": |
|
|
start_ngrok_tunnel(8000) |
|
|
else: |
|
|
start_cloudflare_tunnel(8000) |
|
|
|
|
|
try: |
|
|
gradio_proc.join() |
|
|
api_proc.join() |
|
|
except KeyboardInterrupt: |
|
|
gradio_proc.terminate() |
|
|
api_proc.terminate() |
|
|
|
|
|
if __name__ == "__main__": |
|
|
try: |
|
|
main() |
|
|
except KeyboardInterrupt: |
|
|
print("\nπ Shutting down...") |
|
|
stop_llama_server() |
|
|
|
|
|
|