Spaces:
Runtime error
Runtime error
Commit
·
016e4dd
1
Parent(s):
a97500b
Update app.py
Browse files
app.py
CHANGED
|
@@ -80,48 +80,51 @@ def flag_last_response(state, model_selector, request: gr.Request):
|
|
| 80 |
vote_last_response(state, "flag", model_selector, request)
|
| 81 |
return ("",) + (disable_btn,) * 3
|
| 82 |
|
| 83 |
-
def regenerate(state, image_process_mode, seg_process_mode):
|
| 84 |
state.messages[-1][-1] = None
|
| 85 |
prev_human_msg = state.messages[-2]
|
| 86 |
if type(prev_human_msg[1]) in (tuple, list):
|
| 87 |
-
prev_human_msg[1] = (*prev_human_msg[1][:2], image_process_mode, prev_human_msg[1][3], seg_process_mode,
|
| 88 |
state.skip_next = False
|
| 89 |
-
return (state, state.to_gradio_chatbot(), "", None, None) + (disable_btn,) * 5
|
| 90 |
|
| 91 |
|
| 92 |
def clear_history(request: gr.Request):
|
| 93 |
state = default_conversation.copy()
|
| 94 |
-
return (state, state.to_gradio_chatbot(), "", None, None) + (disable_btn,) * 5
|
| 95 |
|
| 96 |
|
| 97 |
def add_text(state, text, image, image_process_mode, seg, seg_process_mode, depth, depth_process_mode, request: gr.Request):
|
| 98 |
logger.info(f"add_text. len: {len(text)}")
|
| 99 |
if len(text) <= 0 and image is None:
|
| 100 |
state.skip_next = True
|
| 101 |
-
return (state, state.to_gradio_chatbot(), "", None, None) + (no_change_btn,) * 5
|
| 102 |
if args.moderate:
|
| 103 |
flagged = violates_moderation(text)
|
| 104 |
if flagged:
|
| 105 |
state.skip_next = True
|
| 106 |
-
return (state, state.to_gradio_chatbot(), moderation_msg, None, None) + (
|
| 107 |
no_change_btn,) * 5
|
| 108 |
|
| 109 |
-
text = text[:
|
| 110 |
if image is not None:
|
| 111 |
-
text = text[:
|
| 112 |
if '<image>' not in text:
|
| 113 |
text = '<image>\n' + text
|
| 114 |
if seg is not None:
|
| 115 |
if '<seg>' not in text:
|
| 116 |
text = '<seg>\n' + text
|
|
|
|
|
|
|
|
|
|
| 117 |
|
| 118 |
-
text = (text, image, image_process_mode, seg, seg_process_mode,
|
| 119 |
if len(state.get_images(return_pil=True)) > 0:
|
| 120 |
state = default_conversation.copy()
|
| 121 |
state.append_message(state.roles[0], text)
|
| 122 |
state.append_message(state.roles[1], None)
|
| 123 |
state.skip_next = False
|
| 124 |
-
return (state, state.to_gradio_chatbot(), "", None, None) + (disable_btn,) * 5
|
| 125 |
|
| 126 |
|
| 127 |
def http_bot(state, model_selector, temperature, top_p, max_new_tokens, request: gr.Request):
|
|
@@ -145,24 +148,6 @@ def http_bot(state, model_selector, temperature, top_p, max_new_tokens, request:
|
|
| 145 |
# Construct prompt
|
| 146 |
prompt = state.get_prompt()
|
| 147 |
|
| 148 |
-
all_images = state.get_images(return_pil=True)
|
| 149 |
-
all_image_hash = [hashlib.md5(image.tobytes()).hexdigest() for image in all_images]
|
| 150 |
-
for image, hash in zip(all_images, all_image_hash):
|
| 151 |
-
t = datetime.datetime.now()
|
| 152 |
-
filename = os.path.join(LOGDIR, "serve_images", f"{t.year}-{t.month:02d}-{t.day:02d}", f"{hash}.jpg")
|
| 153 |
-
if not os.path.isfile(filename):
|
| 154 |
-
os.makedirs(os.path.dirname(filename), exist_ok=True)
|
| 155 |
-
image.save(filename)
|
| 156 |
-
|
| 157 |
-
all_segs = state.get_segs(return_pil=True)
|
| 158 |
-
all_seg_hash = [hashlib.md5(seg.tobytes()).hexdigest() for seg in all_segs]
|
| 159 |
-
for seg, hash in zip(all_segs, all_seg_hash):
|
| 160 |
-
t = datetime.datetime.now()
|
| 161 |
-
filename = os.path.join(LOGDIR, "serve_segs", f"{t.year}-{t.month:02d}-{t.day:02d}", f"{hash}.jpg")
|
| 162 |
-
if not os.path.isfile(filename):
|
| 163 |
-
os.makedirs(os.path.dirname(filename), exist_ok=True)
|
| 164 |
-
seg.save(filename)
|
| 165 |
-
|
| 166 |
# Make requests
|
| 167 |
pload = {
|
| 168 |
"model": model_name,
|
|
@@ -171,13 +156,15 @@ def http_bot(state, model_selector, temperature, top_p, max_new_tokens, request:
|
|
| 171 |
"top_p": float(top_p),
|
| 172 |
"max_new_tokens": min(int(max_new_tokens), 1536),
|
| 173 |
"stop": state.sep if state.sep_style in [SeparatorStyle.SINGLE, SeparatorStyle.MPT] else state.sep2,
|
| 174 |
-
"images": f'List of {len(state.get_images())}
|
| 175 |
-
"segs": f'List of {len(state.get_segs())}
|
|
|
|
| 176 |
}
|
| 177 |
logger.info(f"==== request ====\n{pload}")
|
| 178 |
|
| 179 |
pload['images'] = state.get_images()
|
| 180 |
pload['segs'] = state.get_segs()
|
|
|
|
| 181 |
|
| 182 |
state.messages[-1][-1] = "▌"
|
| 183 |
yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5
|
|
@@ -207,24 +194,8 @@ def http_bot(state, model_selector, temperature, top_p, max_new_tokens, request:
|
|
| 207 |
|
| 208 |
state.messages[-1][-1] = state.messages[-1][-1][:-1]
|
| 209 |
yield (state, state.to_gradio_chatbot()) + (enable_btn,) * 5
|
| 210 |
-
|
| 211 |
-
finish_tstamp = time.time()
|
| 212 |
logger.info(f"{output}")
|
| 213 |
|
| 214 |
-
with open(get_conv_log_filename(), "a") as fout:
|
| 215 |
-
data = {
|
| 216 |
-
"tstamp": round(finish_tstamp, 4),
|
| 217 |
-
"type": "chat",
|
| 218 |
-
"model": model_name,
|
| 219 |
-
"start": round(start_tstamp, 4),
|
| 220 |
-
"finish": round(start_tstamp, 4),
|
| 221 |
-
"state": state.dict(),
|
| 222 |
-
"images": all_image_hash,
|
| 223 |
-
"segs": all_seg_hash,
|
| 224 |
-
"ip": request.client.host,
|
| 225 |
-
}
|
| 226 |
-
fout.write(json.dumps(data) + "\n")
|
| 227 |
-
|
| 228 |
|
| 229 |
title = "<h1 style='margin-bottom: -10px; text-align: center'>VCoder: Versatile Vision Encoders for Multimodal Large Language Models</h1>"
|
| 230 |
# style='
|
|
@@ -284,6 +255,12 @@ def build_demo(embed_mode):
|
|
| 284 |
["Crop", "Resize", "Pad", "Default"],
|
| 285 |
value="Default",
|
| 286 |
label="Preprocess for non-square Seg Map", visible=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 287 |
|
| 288 |
with gr.Accordion("Parameters", open=False) as parameter_row:
|
| 289 |
temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.8, step=0.1, interactive=True, label="Temperature",)
|
|
@@ -307,13 +284,8 @@ def build_demo(embed_mode):
|
|
| 307 |
|
| 308 |
cur_dir = os.path.dirname(os.path.abspath(__file__))
|
| 309 |
gr.Examples(examples=[
|
| 310 |
-
[f"{cur_dir}/examples/
|
| 311 |
-
|
| 312 |
-
[f"{cur_dir}/examples/friends.jpg", f"{cur_dir}/examples/friends_pan.png", "Can you count the number of people in the image?", "0.8", "0.9"],
|
| 313 |
-
[f"{cur_dir}/examples/friends.jpg", f"{cur_dir}/examples/friends_pan.png", "What is happening in the image?", "0.8", "0.9"],
|
| 314 |
-
[f"{cur_dir}/examples/suits.jpg", f"{cur_dir}/examples/suits_pan.png", "What objects can be seen in the image?", "0.5", "0.5"],
|
| 315 |
-
[f"{cur_dir}/examples/suits.jpg", f"{cur_dir}/examples/suits_ins.png", "What objects can be seen in the image?", "0.5", "0.5"],
|
| 316 |
-
], inputs=[imagebox, segbox, textbox, temperature, top_p])
|
| 317 |
|
| 318 |
if not embed_mode:
|
| 319 |
gr.Markdown(tos_markdown)
|
|
@@ -327,16 +299,16 @@ def build_demo(embed_mode):
|
|
| 327 |
[state, model_selector], [textbox, upvote_btn, downvote_btn, flag_btn])
|
| 328 |
flag_btn.click(flag_last_response,
|
| 329 |
[state, model_selector], [textbox, upvote_btn, downvote_btn, flag_btn])
|
| 330 |
-
regenerate_btn.click(regenerate, [state, image_process_mode, seg_process_mode],
|
| 331 |
-
[state, chatbot, textbox, imagebox, segbox] + btn_list).then(
|
| 332 |
http_bot, [state, model_selector, temperature, top_p, max_output_tokens],
|
| 333 |
[state, chatbot] + btn_list)
|
| 334 |
-
clear_btn.click(clear_history, None, [state, chatbot, textbox, imagebox, segbox] + btn_list)
|
| 335 |
|
| 336 |
-
textbox.submit(add_text, [state, textbox, imagebox, image_process_mode, segbox, seg_process_mode], [state, chatbot, textbox, imagebox, segbox] + btn_list
|
| 337 |
).then(http_bot, [state, model_selector, temperature, top_p, max_output_tokens],
|
| 338 |
[state, chatbot] + btn_list)
|
| 339 |
-
submit_btn.click(add_text, [state, textbox, imagebox, image_process_mode, segbox, seg_process_mode], [state, chatbot, textbox, imagebox, segbox] + btn_list
|
| 340 |
).then(http_bot, [state, model_selector, temperature, top_p, max_output_tokens],
|
| 341 |
[state, chatbot] + btn_list)
|
| 342 |
|
|
|
|
| 80 |
vote_last_response(state, "flag", model_selector, request)
|
| 81 |
return ("",) + (disable_btn,) * 3
|
| 82 |
|
| 83 |
+
def regenerate(state, image_process_mode, seg_process_mode, depth_process_mode):
    """Prepare the conversation so the last assistant reply is generated again.

    The last message's content is reset to ``None``. If the preceding message
    carries a tuple/list payload, it is rebuilt with the currently selected
    preprocessing modes; the entries at indexes 0, 1, 3 and 5 are kept as-is.
    This matches the 7-item layout that ``add_text`` stores:
    ``(text, image, image_process_mode, seg, seg_process_mode, depth,
    depth_process_mode)``.

    Returns:
        A tuple of 11 values, one per output wired to the regenerate button:
        state, chatbot contents, an empty textbox value, ``None`` for each of
        the image, seg and depth boxes, then ``disable_btn`` five times.
    """
    # Drop the previous answer; it will be filled in again by the next step.
    state.messages[-1][-1] = None

    prev_human_msg = state.messages[-2]
    if isinstance(prev_human_msg[1], (tuple, list)):
        payload = prev_human_msg[1]
        prev_human_msg[1] = (
            *payload[:2],
            image_process_mode,
            payload[3],
            seg_process_mode,
            payload[5],
            depth_process_mode,
        )

    state.skip_next = False

    # Exactly three ``None`` values (imagebox, segbox, depthbox). The previous
    # version returned four, one more than the event handler declares.
    return (state, state.to_gradio_chatbot(), "", None, None, None) + (disable_btn,) * 5
|
| 90 |
|
| 91 |
|
| 92 |
def clear_history(request: gr.Request):
    """Start a fresh conversation and blank the input widgets.

    Args:
        request: Gradio request object; not used by this function.

    Returns:
        A tuple of 11 values, one per output wired to the clear button:
        a copy of ``default_conversation``, its chatbot contents, an empty
        textbox value, ``None`` for each of the image, seg and depth boxes,
        then ``disable_btn`` five times.
    """
    state = default_conversation.copy()
    # Exactly three ``None`` values (imagebox, segbox, depthbox). The previous
    # version returned four, one more than the event handler declares.
    return (state, state.to_gradio_chatbot(), "", None, None, None) + (disable_btn,) * 5
|
| 95 |
|
| 96 |
|
| 97 |
def add_text(state, text, image, image_process_mode, seg, seg_process_mode, depth, depth_process_mode, request: gr.Request):
    """Append the user's turn (text plus optional image/seg/depth) to the conversation.

    Every return path yields 11 values, one per output wired to the submit
    handlers: state, chatbot contents, the textbox value, ``None`` for each of
    the image, seg and depth boxes, then five button updates.

    Flow:
      * Empty text with no image: mark the turn to be skipped, change nothing.
      * ``args.moderate`` set and the text is flagged: mark the turn to be
        skipped and put ``moderation_msg`` in the textbox.
      * Otherwise: truncate the text, prepend any missing ``<image>``,
        ``<seg>`` and ``<depth>`` placeholders, and append the user message
        followed by an empty assistant message.

    Args:
        state: current conversation object.
        text: raw textbox content.
        image, seg, depth: uploaded inputs, or ``None`` when absent.
        image_process_mode, seg_process_mode, depth_process_mode: selected
            preprocessing modes, stored next to each input.
        request: Gradio request object; not used by this function.
    """
    logger.info(f"add_text. len: {len(text)}")

    # Three ``None`` values clear imagebox, segbox and depthbox. The previous
    # version returned four, one more than the event handlers declare.
    cleared_boxes = (None, None, None)

    if not text and image is None:
        state.skip_next = True
        return (state, state.to_gradio_chatbot(), "") + cleared_boxes + (no_change_btn,) * 5

    if args.moderate:
        flagged = violates_moderation(text)
        if flagged:
            state.skip_next = True
            return (state, state.to_gradio_chatbot(), moderation_msg) + cleared_boxes + (
                no_change_btn,) * 5

    text = text[:1200]  # Hard cut-off
    if image is not None:
        text = text[:864]  # Hard cut-off for images
        if '<image>' not in text:
            text = '<image>\n' + text
        # The seg and depth placeholders are only considered when an image is present.
        if seg is not None:
            if '<seg>' not in text:
                text = '<seg>\n' + text
            if depth is not None:
                if '<depth>' not in text:
                    text = '<depth>\n' + text

        text = (text, image, image_process_mode, seg, seg_process_mode, depth, depth_process_mode)
        # If the conversation already holds an image, start over from a fresh copy.
        if len(state.get_images(return_pil=True)) > 0:
            state = default_conversation.copy()

    state.append_message(state.roles[0], text)
    state.append_message(state.roles[1], None)
    state.skip_next = False
    return (state, state.to_gradio_chatbot(), "") + cleared_boxes + (disable_btn,) * 5
|
| 128 |
|
| 129 |
|
| 130 |
def http_bot(state, model_selector, temperature, top_p, max_new_tokens, request: gr.Request):
|
|
|
|
| 148 |
# Construct prompt
|
| 149 |
prompt = state.get_prompt()
|
| 150 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
# Make requests
|
| 152 |
pload = {
|
| 153 |
"model": model_name,
|
|
|
|
| 156 |
"top_p": float(top_p),
|
| 157 |
"max_new_tokens": min(int(max_new_tokens), 1536),
|
| 158 |
"stop": state.sep if state.sep_style in [SeparatorStyle.SINGLE, SeparatorStyle.MPT] else state.sep2,
|
| 159 |
+
"images": f'List of {len(state.get_images())}',
|
| 160 |
+
"segs": f'List of {len(state.get_segs())}',
|
| 161 |
+
"depths": f'List of {len(state.get_depths())}',
|
| 162 |
}
|
| 163 |
logger.info(f"==== request ====\n{pload}")
|
| 164 |
|
| 165 |
pload['images'] = state.get_images()
|
| 166 |
pload['segs'] = state.get_segs()
|
| 167 |
+
pload['depths'] = state.get_depths()
|
| 168 |
|
| 169 |
state.messages[-1][-1] = "▌"
|
| 170 |
yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5
|
|
|
|
| 194 |
|
| 195 |
state.messages[-1][-1] = state.messages[-1][-1][:-1]
|
| 196 |
yield (state, state.to_gradio_chatbot()) + (enable_btn,) * 5
|
|
|
|
|
|
|
| 197 |
logger.info(f"{output}")
|
| 198 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
|
| 200 |
title = "<h1 style='margin-bottom: -10px; text-align: center'>VCoder: Versatile Vision Encoders for Multimodal Large Language Models</h1>"
|
| 201 |
# style='
|
|
|
|
| 255 |
["Crop", "Resize", "Pad", "Default"],
|
| 256 |
value="Default",
|
| 257 |
label="Preprocess for non-square Seg Map", visible=False)
|
| 258 |
+
|
| 259 |
+
depthbox = gr.Image(type="pil", label="Depth Map")
|
| 260 |
+
depth_process_mode = gr.Radio(
|
| 261 |
+
["Crop", "Resize", "Pad", "Default"],
|
| 262 |
+
value="Default",
|
| 263 |
+
label="Preprocess for non-square Depth Map", visible=False)
|
| 264 |
|
| 265 |
with gr.Accordion("Parameters", open=False) as parameter_row:
|
| 266 |
temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.8, step=0.1, interactive=True, label="Temperature",)
|
|
|
|
| 284 |
|
| 285 |
cur_dir = os.path.dirname(os.path.abspath(__file__))
|
| 286 |
gr.Examples(examples=[
|
| 287 |
+
[f"{cur_dir}/examples/suits.jpg", f"{cur_dir}/examples/suits_pan.png", f"{cur_dir}/examples/suits_depth.jpeg", "Can you describe the depth order of the objects in this image, from closest to farthest?", "0.5", "0.5"],
|
| 288 |
+
], inputs=[imagebox, segbox, depthbox, textbox, temperature, top_p])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 289 |
|
| 290 |
if not embed_mode:
|
| 291 |
gr.Markdown(tos_markdown)
|
|
|
|
| 299 |
[state, model_selector], [textbox, upvote_btn, downvote_btn, flag_btn])
|
| 300 |
flag_btn.click(flag_last_response,
|
| 301 |
[state, model_selector], [textbox, upvote_btn, downvote_btn, flag_btn])
|
| 302 |
+
regenerate_btn.click(regenerate, [state, image_process_mode, seg_process_mode, depth_process_mode],
|
| 303 |
+
[state, chatbot, textbox, imagebox, segbox, depthbox] + btn_list).then(
|
| 304 |
http_bot, [state, model_selector, temperature, top_p, max_output_tokens],
|
| 305 |
[state, chatbot] + btn_list)
|
| 306 |
+
clear_btn.click(clear_history, None, [state, chatbot, textbox, imagebox, segbox, depthbox] + btn_list)
|
| 307 |
|
| 308 |
+
textbox.submit(add_text, [state, textbox, imagebox, image_process_mode, segbox, seg_process_mode, depthbox, depth_process_mode], [state, chatbot, textbox, imagebox, segbox, depthbox] + btn_list
|
| 309 |
).then(http_bot, [state, model_selector, temperature, top_p, max_output_tokens],
|
| 310 |
[state, chatbot] + btn_list)
|
| 311 |
+
submit_btn.click(add_text, [state, textbox, imagebox, image_process_mode, segbox, seg_process_mode, depthbox, depth_process_mode], [state, chatbot, textbox, imagebox, segbox, depthbox] + btn_list
|
| 312 |
).then(http_bot, [state, model_selector, temperature, top_p, max_output_tokens],
|
| 313 |
[state, chatbot] + btn_list)
|
| 314 |
|