import streamlit as st
import pandas as pd
from PIL import Image
import base64
from io import BytesIO
import numpy as np
# ─── Page config ──────────────────────────────────────────────────────────────
st.set_page_config(page_title="ManyICLBench Leaderboard", layout="wide")

logo_image = Image.open("src/manyicl_logo.png")

def encode_image(image):
    buffered = BytesIO()
    image.save(buffered, format="PNG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")

img_data = encode_image(logo_image)
st.markdown(
    f"""
<div class="logo-container" style="display:flex; justify-content: center; align-items: center; gap: 20px;">
    <img src="data:image/png;base64,{img_data}" style="width:50%; max-width:700px;"/>
</div>
""",
    unsafe_allow_html=True
)
st.markdown(
    '''
<div class="header">
    <br/>
    <p style="font-size:22px;">
        ManyICLBench: Benchmarking Large Language Models' Long Context Capabilities with Many-Shot In-Context Learning
    </p>
    <p style="font-size:20px;">
        📄 <a href="https://arxiv.org/abs/2411.07130">Paper</a> | 💻 <a href="https://github.com/launchnlp/ManyICLBench">GitHub</a> | 🤗 <a href="https://huggingface.co/datasets/launch/ManyICLBench/">Dataset</a> |
        ⚙️ <strong>Version</strong>: <strong>V1</strong> | <strong># Models</strong>: 12 | Updated: <strong>June 2025</strong>
    </p>
</div>
''',
    unsafe_allow_html=True
)
# ─── Load data ────────────────────────────────────────────────────────────────
def load_data(path):
    df = pd.read_csv(path)
    if 'Task' in df.columns:  # Rename Task to Models for consistency
        df = df.rename(columns={'Task': 'Models'})
    score_cols = ['1000', '2000', '4000', '8000', '16000', '32000', '64000', '128000']
    # Keep existing avg and avg.L columns
    # Compute rank per column (1 = best)
    for col in score_cols + ['avg', 'avg.L']:
        df[f"{col}_rank"] = df[col].rank(ascending=False, method="min").astype(int)
    return df
# Add evaluation metrics explanation
st.markdown("## 📊 Evaluation Metrics")
st.markdown("""
- **Per-length Performance**: performance at each context length (1K to 128K tokens)
- **avg**: average performance across all context lengths
- **avg.L**: average performance on longer contexts (>32K tokens)

Higher scores indicate better performance; all metrics are reported as percentages (0-100).
Red indicates a performance improvement relative to the 1K baseline, and blue indicates a performance drop; darker shades correspond to larger changes.
""")
def display_table(df, cols):
    # Precompute max values for avg and avg.L
    max_avg = df['avg'].max()
    max_avg_l = df['avg.L'].max()

    # Build raw HTML table
    html = "<table style='border-collapse:collapse; width:100%; font-size:14px;'>"

    # Format header labels
    html += "<tr>"
    for col in cols:
        style = "padding:6px;"
        label = ""
        if col in ['1000', '2000', '4000', '8000', '16000', '32000', '64000', '128000']:
            # Convert to K format
            val = int(col) // 1000
            label = f"{val}K"
        else:
            label = col.title()  # Capitalize first letter
        if col in ["Model", "Models"]:
            style += " width: 15%;"
        html += f"<th style='{style}'>{label}</th>"
    html += "</tr>"

    # Rows
    for _, row in df.iterrows():
        html += "<tr>"
        for col in cols:
            val = row[col]
            if col in ["Model", "Models"]:
                html += f"<td style='padding:6px; text-align:left; width: 15%;'>{val}</td>"
            else:
                # Format value
                val_str = f"{val:.1f}" if isinstance(val, (float, np.float64)) else val
                # Determine if this column should be colored
                if col in ['1000', 'avg', 'avg.L']:
                    # No coloring for these columns, but bold the max values
                    bold = ""
                    if (col == 'avg' and val == max_avg) or \
                       (col == 'avg.L' and val == max_avg_l):
                        bold = "font-weight:bold;"
                    style = f"padding:6px; border: 1px solid #444; {bold}"
                else:
                    # Calculate relative improvement over the 1K baseline
                    baseline = float(row['1000'])
                    if baseline != 0:
                        relative_change = float(val) / baseline - 1  # -1 to center at 0
                        # Clamp the change to a reasonable range for color scaling
                        clamped_change = max(min(relative_change, 1.5), -0.5)
                        # Normalize to the 0-1 range, where 0.5 is the neutral point (no change)
                        if clamped_change < 0:
                            # Map [-0.5, 0) to [0, 0.5)
                            norm = clamped_change + 0.5
                        else:
                            # Map [0, 1.5] to [0.5, 1.0]
                            norm = 0.5 + (clamped_change / 3.0)
                        # Color interpolation:
                        #   norm = 0   -> blue  (100, 149, 237)
                        #   norm = 0.5 -> white (255, 255, 255)
                        #   norm = 1   -> red   (220, 20, 60)
                        if norm < 0.5:
                            # Interpolate from blue to white
                            factor = norm * 2  # 0 to 1
                            r = int(100 + (255 - 100) * factor)
                            g = int(149 + (255 - 149) * factor)
                            b = int(237 + (255 - 237) * factor)
                        else:
                            # Interpolate from white to red
                            factor = (norm - 0.5) * 2  # 0 to 1
                            r = int(255 - (255 - 220) * factor)
                            g = int(255 - (255 - 20) * factor)
                            b = int(255 - (255 - 60) * factor)
                        style = f"background-color:rgba({r},{g},{b},0.8); padding:6px; border: 1px solid #444;"
                    else:
                        style = "padding:6px; border: 1px solid #444;"
                html += f"<td style='{style}'>{val_str}</td>"
        html += "</tr>"
    html += "</table>"

    st.markdown(html, unsafe_allow_html=True)
# Display SSL (retrieval) table
st.markdown("## SSL Tasks")
st.markdown("Similar-Sample Learning (SSL) tasks require models to learn from a small set of similar demonstrations, and therefore evaluate models' ability to retrieve similar samples.")
df = load_data("src/Retrieval_full_200.csv")
cols = ["Models", "1000", "2000", "4000", "8000", "16000", "32000", "64000", "128000", "avg", "avg.L"]
display_table(df, cols)

# Display ASL (global context understanding) table
st.markdown("## ASL Tasks")
st.markdown("All-Sample Learning (ASL) tasks require models to learn from all of the demonstrations, and therefore evaluate models' ability to understand the global context.")
df = load_data("src/Global Context Understanding_full_200.csv")
cols = ["Models", "1000", "2000", "4000", "8000", "16000", "32000", "64000", "128000", "avg", "avg.L"]
display_table(df, cols)
| st.markdown("## π Abstract") | |
| st.write( | |
| """ | |
| Many-shot in-context learning (ICL) has emerged as a unique setup to both utilize and test the ability of large language models to handle long context.This paper delves into long-context language model (LCLM) evaluation through many-shot ICL. We first ask: what types of ICL tasks benefit from additional demonstrations, and how effective are they in evaluating LCLMs? | |
| We find that classification and summarization tasks show performance improvements with additional demonstrations, while translation and reasoning tasks do not exhibit clear trends. | |
| Next, we investigate the extent to which different tasks necessitate retrieval versus global context understanding. | |
| We develop metrics to categorize ICL tasks into two groups: (i) similar-sample learning (**SSL**): tasks where retrieval of the most similar examples is sufficient for good performance, and (ii) all-sample learning (**ASL**): tasks that necessitate a deeper comprehension of all examples in the prompt. | |
| Lastly, we introduce a new many-shot ICL benchmark built on existing ICL tasks, ManyICLBench, to characterize model's ability on both fronts and benchmark 12 LCLMs using ManyICLBench. We find that while state-of-the-art models demonstrate good performance up to 64k tokens in SSL tasks, many models experience significant performance drops at only 16k tokens in ASL tasks. | |
| """ | |
| ) | |
| st.markdown("## Dataset Details") | |
| st.markdown(""" | |
| | **Dataset** | **Task Category** | **Avg. Tokens / Shot** | **Max # of Shots** | **# of Tasks** | | |
| | :--- | :--- | :--- | :--- | :--- | | |
| | BANKING77 | Intent Classification | 13.13 | 5386 | 1 | | |
| | GoEmotions | Emotion Classification | 15.85 | 5480 | 1 | | |
| | DialogRE | Relation Classification | 233.27 | 395 | 1 | | |
| | TREC | Question Classification | 11.25 | 6272 | 1 | | |
| | CLINC150 | Intent Classification | 8.95 | 7252 | 1 | | |
| | MATH | Math reasoning | [185.52, 407.90] | [286, 653] | 4 | | |
| | GSM8K | Math reasoning | 55.78 | 784 | 1 | | |
| | BBH | Reasoning | [48.27, 243.01] | [406, 2660] | 4 | | |
| | GPQA | MQ - Science | [183.55, 367.02] | [314, 580] | 1 | | |
| | ARC | MQ - Science | [61.54, 61.54] | [1997, 2301] | 2 | | |
| | XLSUM | New Summarization | 621.32 | 220 | 1 | | |
| GPT-4o tokenizer is used to calculate # of tokens. Max # of shots is the number of shots can be fitted into the 128k context window. For datasets that have multiple subtasks, we list the range for each value. | |
| **ASL Tasks**: banking77, dialogRE, TREC, CLINC150, and BBH_geometric_shapes | |
| **SSL Tasks**: GSM8K, MATH tasks, GSM8K, XLSUM, GPQA_cot, ARC_challenge, BBH-dyck_languages, BBH-salient_translation_error_detection, and BBH-word_sorting. | |
| """) | |
st.markdown('## 🤖 Submit Your Model')
st.write(
    """
🚀 You can submit your model through the following link: [https://forms.gle/eWjzPDusDJSbXCCT7](https://forms.gle/eWjzPDusDJSbXCCT7)
"""
)
| st.markdown("## π Citation") | |
| st.write(""" | |
| ```bibtex | |
| @article{zou2025manyshotincontextlearninglongcontext, | |
| title={On Many-Shot In-Context Learning for Long-Context Evaluation}, | |
| author={Kaijian Zou and Muhammad Khalifa and Lu Wang}, | |
| journal={arXiv preprint arXiv:2411.07130}, | |
| year={2025} | |
| } | |
| ``` | |
| """) | |