import streamlit as st
import pandas as pd
from PIL import Image
import base64
from io import BytesIO
import numpy as np
# ─── Page config ──────────────────────────────────────────────────────────────
st.set_page_config(page_title="ManyICLBench Leaderboard", layout="wide")

logo_image = Image.open("src/manyicl_logo.png")

def encode_image(image):
    buffered = BytesIO()
    image.save(buffered, format="PNG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")

img_data = encode_image(logo_image)
st.markdown(
    f"""
<div class="logo-container" style="display:flex; justify-content: center; align-items: center; gap: 20px;">
    <img src="data:image/png;base64,{img_data}" style="width:50%; max-width:700px;"/>
</div>
""",
    unsafe_allow_html=True
)
st.markdown(
    '''
<div class="header">
    <br/>
    <p style="font-size:22px;">
        ManyICLBench: Benchmarking Large Language Models' Long Context Capabilities with Many-Shot In-Context Learning
    </p>
    <p style="font-size:20px;">
        📄 <a href="https://arxiv.org/abs/2411.07130">Paper</a> | 💻 <a href="https://github.com/launchnlp/ManyICLBench">GitHub</a> | 🤗 <a href="https://huggingface.co/datasets/launch/ManyICLBench/">Dataset</a> |
        ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 12 | Updated: <strong>June 2025</strong>
    </p>
</div>
''',
    unsafe_allow_html=True
)
# ─── Load data ────────────────────────────────────────────────────────────────
def load_data(path):
    df = pd.read_csv(path)
    if 'Task' in df.columns:  # Rename Task to Models for consistency
        df = df.rename(columns={'Task': 'Models'})
    score_cols = ['1000', '2000', '4000', '8000', '16000', '32000', '64000', '128000']
    # Keep existing avg and avg.L columns
    # Compute rank per column (1 = best)
    for col in score_cols + ['avg', 'avg.L']:
        df[f"{col}_rank"] = df[col].rank(ascending=False, method="min").astype(int)
    return df
# Add evaluation metrics explanation
st.markdown("## 📊 Evaluation Metrics")
st.markdown("""
- **Per-length Performance**: performance at each context length (1K to 128K tokens)
- **avg**: average performance across all context lengths
- **avg.L**: average performance on longer contexts (>32K tokens)

Higher scores indicate better performance; all metrics are reported as percentages (0-100).
Red indicates a performance improvement relative to the 1K baseline, and blue indicates a performance drop; darker shades correspond to larger changes.
""")
def display_table(df, cols):
    # Precompute max values for avg and avg.L
    max_avg = df['avg'].max()
    max_avg_l = df['avg.L'].max()

    # Build raw HTML table
    html = "<table style='border-collapse:collapse; width:100%; font-size:14px;'>"

    # Format header labels
    html += "<tr>"
    for col in cols:
        style = "padding:6px;"
        label = ""
        if col in ['1000', '2000', '4000', '8000', '16000', '32000', '64000', '128000']:
            # Convert to K format
            val = int(col) // 1000
            label = f"{val}K"
        else:
            label = col.title()  # Capitalize first letter
        if col in ["Model", "Models"]:
            style += " width: 15%;"
        html += f"<th style='{style}'>{label}</th>"
    html += "</tr>"

    # Rows
    for _, row in df.iterrows():
        html += "<tr>"
        for col in cols:
            val = row[col]
            if col in ["Model", "Models"]:
                html += f"<td style='padding:6px; text-align:left; width: 15%;'>{val}</td>"
            else:
                # Format value
                val_str = f"{val:.1f}" if isinstance(val, (float, np.float64)) else val
                # Determine if this column should be colored
                if col in ['1000', 'avg', 'avg.L']:
                    # No coloring for these columns, but bold the max values
                    bold = ""
                    if (col == 'avg' and val == max_avg) or \
                       (col == 'avg.L' and val == max_avg_l):
                        bold = "font-weight:bold;"
                    style = f"padding:6px; border: 1px solid #444; {bold}"
                else:
                    # Calculate relative improvement over the 1K baseline
                    baseline = float(row['1000'])
                    if baseline != 0:
                        relative_change = float(val) / baseline - 1  # -1 to center at 0
                        # Clamp the change to a reasonable range for color scaling
                        clamped_change = max(min(relative_change, 1.5), -0.5)
                        # Normalize to the 0-1 range, where 0.5 is the neutral point (no change)
                        if clamped_change < 0:
                            # Map [-0.5, 0) to [0, 0.5)
                            norm = clamped_change + 0.5
                        else:
                            # Map [0, 1.5] to [0.5, 1.0]
                            norm = 0.5 + (clamped_change / 3.0)
                        # Color interpolation:
                        #   norm = 0   -> blue  (100, 149, 237)
                        #   norm = 0.5 -> white (255, 255, 255)
                        #   norm = 1   -> red   (220, 20, 60)
                        if norm < 0.5:
                            # Interpolate from blue to white
                            factor = norm * 2  # 0 to 1
                            r = int(100 + (255 - 100) * factor)
                            g = int(149 + (255 - 149) * factor)
                            b = int(237 + (255 - 237) * factor)
                        else:
                            # Interpolate from white to red
                            factor = (norm - 0.5) * 2  # 0 to 1
                            r = int(255 - (255 - 220) * factor)
                            g = int(255 - (255 - 20) * factor)
                            b = int(255 - (255 - 60) * factor)
                        style = f"background-color:rgba({r},{g},{b},0.8); padding:6px; border: 1px solid #444;"
                    else:
                        style = "padding:6px; border: 1px solid #444;"
                html += f"<td style='{style}'>{val_str}</td>"
        html += "</tr>"
    html += "</table>"

    st.markdown(html, unsafe_allow_html=True)
# Display SSL (retrieval) table
st.markdown("## SSL Tasks")
st.markdown("Similar-Sample Learning (SSL) tasks require models to learn from a small set of similar demonstrations, and therefore evaluate models' ability to retrieve similar samples.")
df = load_data("src/Retrieval_full_200.csv")
cols = ["Models", "1000", "2000", "4000", "8000", "16000", "32000", "64000", "128000", "avg", "avg.L"]
display_table(df, cols)

# Display ASL (global context understanding) table
st.markdown("## ASL Tasks")
st.markdown("All-Sample Learning (ASL) tasks require models to learn from all of the demonstrations, and therefore evaluate models' ability to understand the global context.")
df = load_data("src/Global Context Understanding_full_200.csv")
cols = ["Models", "1000", "2000", "4000", "8000", "16000", "32000", "64000", "128000", "avg", "avg.L"]
display_table(df, cols)
| st.markdown("## π Abstract") | |
| st.write( | |
| """ | |
| Many-shot in-context learning (ICL) has emerged as a unique setup to both utilize and test the ability of large language models to handle long context.This paper delves into long-context language model (LCLM) evaluation through many-shot ICL. We first ask: what types of ICL tasks benefit from additional demonstrations, and how effective are they in evaluating LCLMs? | |
| We find that classification and summarization tasks show performance improvements with additional demonstrations, while translation and reasoning tasks do not exhibit clear trends. | |
| Next, we investigate the extent to which different tasks necessitate retrieval versus global context understanding. | |
| We develop metrics to categorize ICL tasks into two groups: (i) similar-sample learning (**SSL**): tasks where retrieval of the most similar examples is sufficient for good performance, and (ii) all-sample learning (**ASL**): tasks that necessitate a deeper comprehension of all examples in the prompt. | |
| Lastly, we introduce a new many-shot ICL benchmark built on existing ICL tasks, ManyICLBench, to characterize model's ability on both fronts and benchmark 12 LCLMs using ManyICLBench. We find that while state-of-the-art models demonstrate good performance up to 64k tokens in SSL tasks, many models experience significant performance drops at only 16k tokens in ASL tasks. | |
| """ | |
| ) | |
| st.markdown("## Dataset Details") | |
| st.markdown(""" | |
| | **Dataset** | **Task Category** | **Avg. Tokens / Shot** | **Max # of Shots** | **# of Tasks** | | |
| | :--- | :--- | :--- | :--- | :--- | | |
| | BANKING77 | Intent Classification | 13.13 | 5386 | 1 | | |
| | GoEmotions | Emotion Classification | 15.85 | 5480 | 1 | | |
| | DialogRE | Relation Classification | 233.27 | 395 | 1 | | |
| | TREC | Question Classification | 11.25 | 6272 | 1 | | |
| | CLINC150 | Intent Classification | 8.95 | 7252 | 1 | | |
| | MATH | Math reasoning | [185.52, 407.90] | [286, 653] | 4 | | |
| | GSM8K | Math reasoning | 55.78 | 784 | 1 | | |
| | BBH | Reasoning | [48.27, 243.01] | [406, 2660] | 4 | | |
| | GPQA | MQ - Science | [183.55, 367.02] | [314, 580] | 1 | | |
| | ARC | MQ - Science | [61.54, 61.54] | [1997, 2301] | 2 | | |
| | XLSUM | New Summarization | 621.32 | 220 | 1 | | |
| GPT-4o tokenizer is used to calculate # of tokens. Max # of shots is the number of shots can be fitted into the 128k context window. For datasets that have multiple subtasks, we list the range for each value. | |
| **ASL Tasks**: banking77, dialogRE, TREC, CLINC150, and BBH_geometric_shapes | |
| **SSL Tasks**: GSM8K, MATH tasks, GSM8K, XLSUM, GPQA_cot, ARC_challenge, BBH-dyck_languages, BBH-salient_translation_error_detection, and BBH-word_sorting. | |
| """) | |
st.markdown('## 🤖 Submit Your Model')
st.write(
    """
🚀 You can submit your model through the following link: [https://forms.gle/eWjzPDusDJSbXCCT7](https://forms.gle/eWjzPDusDJSbXCCT7)
"""
)
| st.markdown("## π Citation") | |
| st.write(""" | |
| ```bibtex | |
| @article{zou2025manyshotincontextlearninglongcontext, | |
| title={On Many-Shot In-Context Learning for Long-Context Evaluation}, | |
| author={Kaijian Zou and Muhammad Khalifa and Lu Wang}, | |
| journal={arXiv preprint arXiv:2411.07130}, | |
| year={2025} | |
| } | |
| ``` | |
| """) | |