{
  "asOf": "2026-06-29",
  "rows": [
    {
      "name": "SWE-bench",
      "slug": "swe-bench",
      "aliases": [
        "SWE-bench Full"
      ],
      "category": "coding-agentic",
      "measures": "Whether a system can resolve a real GitHub issue by generating a patch that passes the repository's hidden tests.",
      "maker": "Princeton and Stanford (Jimenez, Yang, Yao et al.)",
      "year": 2023,
      "format": "2,294 real GitHub issue and merged-PR pairs across 12 popular Python repos (also subsetted into Lite: 300, Verified: 500)",
      "metric": "% resolved (pass@1)",
      "state": "saturated",
      "sotaScore": "",
      "sotaModel": "",
      "sotaDate": "",
      "leaderboardUrl": "https://www.swebench.com/",
      "sourceUrl": "https://arxiv.org/abs/2310.06770",
      "note": "The original 2,294-task Full set is rarely reported now: the community moved to the human-validated Verified subset and then to harder, contamination-resistant variants. Its paper launched the whole SWE-bench family.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "SWE-bench Verified",
      "slug": "swe-bench-verified",
      "aliases": [],
      "category": "coding-agentic",
      "measures": "The same real-GitHub-issue resolution task as SWE-bench, restricted to a human-validated subset where the issue is solvable and the tests are not broken.",
      "maker": "OpenAI (with the SWE-bench authors)",
      "year": 2024,
      "format": "500 human-validated task instances drawn from SWE-bench Full (Python)",
      "metric": "% resolved (pass@1)",
      "state": "saturated",
      "sotaScore": "~95%",
      "sotaModel": "Claude Fable 5",
      "sotaDate": "2026-06",
      "leaderboardUrl": "https://www.swebench.com/",
      "sourceUrl": "https://openai.com/index/introducing-swe-bench-verified/",
      "note": "OpenAI announced in Feb 2026 that it stopped reporting Verified after finding broken tests and training-data exposure, and now recommends SWE-bench Pro. Treat scores near the ceiling as a contamination signal, not clean capability.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "SWE-bench Pro",
      "slug": "swe-bench-pro",
      "aliases": [],
      "category": "coding-agentic",
      "measures": "Whether agents can solve long-horizon, enterprise-grade software-engineering tasks under standardized scaffolding, designed to resist contamination.",
      "maker": "Scale AI (Scale Labs)",
      "year": 2025,
      "format": "1,865 problems across 41 maintained repos, with public, held-out, and commercial (proprietary) splits; Python, Go, TypeScript and JavaScript",
      "metric": "% resolved (pass@1) under standardized agent scaffolding",
      "state": "active",
      "sotaScore": "59.1% (public set)",
      "sotaModel": "GPT-5.4 (xHigh)",
      "sotaDate": "2026-06",
      "leaderboardUrl": "https://labs.scale.com/leaderboard/swe_bench_pro_public",
      "sourceUrl": "https://arxiv.org/abs/2509.16941",
      "note": "Held-out and commercial splits plus standardized scaffolding sharply lower scores: models near 80% on Verified land in the 45 to 60% range here. OpenAI now recommends reporting Pro instead of Verified.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "SWE-bench Multimodal",
      "slug": "swe-bench-multimodal",
      "aliases": [
        "SWE-bench M"
      ],
      "category": "coding-agentic",
      "measures": "Whether coding agents can resolve real GitHub issues in visual, user-facing JavaScript software where the bug or feature involves the UI.",
      "maker": "Stanford and Princeton (Yang, Jimenez et al.)",
      "year": 2024,
      "format": "517 test instances with visual elements, drawn from 17 user-facing JavaScript repos (web UI, data viz, mapping)",
      "metric": "% resolved (pass@1)",
      "state": "active",
      "sotaScore": "",
      "sotaModel": "",
      "sotaDate": "",
      "leaderboardUrl": "https://www.swebench.com/multimodal.html",
      "sourceUrl": "https://arxiv.org/abs/2410.03859",
      "note": "Tests whether SWE agents generalize from Python to visual JS domains. At release even strong systems resolved only about 12%, since most agents lack image-handling pipelines.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "DeepSWE",
      "slug": "deepswe",
      "aliases": [
        "DeepSWE v1.1"
      ],
      "category": "coding-agentic",
      "measures": "Whether frontier coding agents can complete original, long-horizon engineering tasks written from scratch, with no upstream PR to memorize.",
      "maker": "Datacurve",
      "year": 2026,
      "format": "113 original long-horizon tasks across 91 active repos and 5 languages (TypeScript, Go, Python, JavaScript, Rust), graded by program-based verifiers in isolated environments",
      "metric": "pass@1 (committed code graded in a clean environment)",
      "state": "active",
      "sotaScore": "~70%",
      "sotaModel": "Claude Fable 5 (max)",
      "sotaDate": "2026-06",
      "leaderboardUrl": "https://deepswe.datacurve.ai/",
      "sourceUrl": "https://github.com/datacurve-ai/deep-swe",
      "note": "Tasks are scratch-written with no upstream references, so models cannot memorize solutions, which spreads model scores apart again. See our deep dive on DeepSWE vs FrontierCode for how completion and quality diverge.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "FrontierCode",
      "slug": "frontiercode",
      "aliases": [],
      "category": "coding-agentic",
      "measures": "Whether a coding agent produces a mergeable, production-quality pull request, not just one that passes tests, judged on correctness, regression safety, scope, tests and style.",
      "maker": "Cognition (with 20+ open-source maintainers)",
      "year": 2026,
      "format": "150 maintainer-authored tasks across 36 flagship repos, nested into Diamond (50 hardest), Main (100) and Extended (150); 40+ expert hours per task",
      "metric": "Pass rate on blocker criteria plus a weighted six-dimension quality rubric",
      "state": "active",
      "sotaScore": "13.4% (Diamond)",
      "sotaModel": "Claude Opus 4.8",
      "sotaDate": "2026-06",
      "leaderboardUrl": "",
      "sourceUrl": "https://cognition.com/blog/frontier-code",
      "note": "Announced via Cognition's vendor blog (June 2026), not peer-reviewed; tasks are kept private to resist contamination. It asks would a maintainer merge this, so even the best model clears only about one in eight of the hardest tasks.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "Terminal-Bench",
      "slug": "terminal-bench",
      "aliases": [
        "Terminal-Bench 2.0",
        "T-Bench"
      ],
      "category": "coding-agentic",
      "measures": "Whether an AI agent can complete hard, realistic command-line tasks (build, configure, train, debug, secure) end to end inside a real terminal.",
      "maker": "Stanford and the Laude Institute",
      "year": 2026,
      "format": "89 human-verified containerized tasks (v2.0) spanning software engineering, sysadmin, data science, ML and security",
      "metric": "Pass/fail, graded by verification scripts in the agent's Docker environment (pass@1)",
      "state": "active",
      "sotaScore": "~82%",
      "sotaModel": "Codex (GPT-5.5)",
      "sotaDate": "2026-06",
      "leaderboardUrl": "https://www.tbench.ai/",
      "sourceUrl": "https://arxiv.org/abs/2601.11868",
      "note": "Grades strictly pass/fail by executing verification scripts in a real terminal, so an agent must actually reach the end state, which is hard to fake. Scores reflect the scaffold and model together.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "SWE-Lancer",
      "slug": "swe-lancer",
      "aliases": [],
      "category": "coding-agentic",
      "measures": "Whether frontier models can complete real paid freelance software jobs, both coding and technical-management tasks, well enough to earn the payouts.",
      "maker": "OpenAI (Miserendino, Patwardhan et al.)",
      "year": 2025,
      "format": "1,400+ real Upwork freelance tasks worth $1M in payouts, from $50 bug fixes to $32k features, validated by engineer-written end-to-end tests",
      "metric": "Dollars earned (and % of tasks resolved)",
      "state": "active",
      "sotaScore": "",
      "sotaModel": "",
      "sotaDate": "",
      "leaderboardUrl": "",
      "sourceUrl": "https://arxiv.org/abs/2502.12115",
      "note": "Grounds capability in real economic value rather than synthetic accuracy, and uses end-to-end tests to resist gaming. At release, frontier models earned only a fraction of the $1M.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "Aider Polyglot",
      "slug": "aider-polyglot",
      "aliases": [
        "Aider Polyglot Benchmark"
      ],
      "category": "coding-agentic",
      "measures": "How well a model writes and correctly edits code across many languages, including applying diffs in the right format and self-correcting after test failures.",
      "maker": "Aider (Paul Gauthier)",
      "year": 2024,
      "format": "225 of the hardest Exercism exercises across C++, Go, Java, JavaScript, Python and Rust; two attempts per problem",
      "metric": "Percent correct after the second attempt, plus percent using the correct edit format",
      "state": "active",
      "sotaScore": "",
      "sotaModel": "",
      "sotaDate": "",
      "leaderboardUrl": "https://aider.chat/docs/leaderboards/",
      "sourceUrl": "https://aider.chat/2024/12/21/polyglot.html",
      "note": "Replaced Aider's near-saturated single-language edit benchmark with a deliberately harder polyglot set, and rewards real edit-format compliance, not just code that looks right.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "LiveCodeBench",
      "slug": "livecodebench",
      "aliases": [],
      "category": "coding-agentic",
      "measures": "Code generation and related skills (self-repair, execution, test-output prediction) on fresh competitive-programming problems, designed to be contamination-free.",
      "maker": "UC Berkeley, MIT and Cornell (Jain, Han et al.)",
      "year": 2024,
      "format": "Continuously harvested LeetCode, AtCoder and Codeforces problems (1,000+), each time-stamped so a model is scored only on problems released after its training cutoff",
      "metric": "pass@1",
      "state": "active",
      "sotaScore": "",
      "sotaModel": "",
      "sotaDate": "",
      "leaderboardUrl": "https://livecodebench.github.io/leaderboard.html",
      "sourceUrl": "https://arxiv.org/abs/2403.07974",
      "note": "Its core defense is time-stamping: by scoring only on post-cutoff problems it neutralizes contamination, the main weakness of static code benchmarks.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "BigCodeBench",
      "slug": "bigcodebench",
      "aliases": [
        "BigCodeBench-Complete",
        "BigCodeBench-Instruct"
      ],
      "category": "coding-agentic",
      "measures": "Whether models can write code that correctly invokes multiple function calls from diverse real libraries to satisfy complex, practical instructions.",
      "maker": "BigCode project (Zhuo et al.)",
      "year": 2024,
      "format": "1,140 tasks calling functions from 139 libraries across 7 domains, with about 99% branch coverage; Complete (completion) and Instruct (instruction) splits",
      "metric": "pass@1 against rigorous per-task test suites",
      "state": "active",
      "sotaScore": "",
      "sotaModel": "",
      "sotaDate": "",
      "leaderboardUrl": "https://huggingface.co/spaces/bigcode/bigcodebench-leaderboard",
      "sourceUrl": "https://arxiv.org/abs/2406.15877",
      "note": "Targets practical, library-heavy programming rather than self-contained puzzles, so it is harder to saturate than HumanEval or MBPP; high test coverage limits lucky passes.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "RepoBench",
      "slug": "repobench",
      "aliases": [],
      "category": "coding-agentic",
      "measures": "Repository-level code auto-completion: retrieving relevant cross-file context, predicting the next line, and the combined retrieval-plus-completion pipeline.",
      "maker": "Liu, Xu and McAuley (UC San Diego)",
      "year": 2023,
      "format": "Multi-file tasks from Python and Java repos, split into retrieval (RepoBench-R), completion (RepoBench-C) and pipeline (RepoBench-P)",
      "metric": "Retrieval accuracy and exact-match / edit similarity for next-line completion",
      "state": "active",
      "sotaScore": "",
      "sotaModel": "",
      "sotaDate": "",
      "leaderboardUrl": "",
      "sourceUrl": "https://arxiv.org/abs/2306.03091",
      "note": "Isolates cross-file, repo-level context handling rather than single-function generation. Reported per paper rather than via a central live leaderboard.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "Multi-SWE-bench",
      "slug": "multi-swe-bench",
      "aliases": [],
      "category": "coding-agentic",
      "measures": "Cross-language issue resolution: whether agents can resolve real GitHub issues with a passing patch across many languages beyond Python.",
      "maker": "ByteDance (ByteDance Seed)",
      "year": 2025,
      "format": "1,632 human-annotated issue-resolving instances across 7 languages (Java, TypeScript, JavaScript, Go, Rust, C, C++)",
      "metric": "% resolved (pass@1)",
      "state": "active",
      "sotaScore": "",
      "sotaModel": "",
      "sotaDate": "",
      "leaderboardUrl": "https://multi-swe-bench.github.io/",
      "sourceUrl": "https://arxiv.org/abs/2504.02605",
      "note": "Extends the SWE-bench paradigm out of Python-only into 7 languages, exposing that issue-resolution skill does not transfer evenly across languages.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "HumanEval",
      "slug": "humaneval",
      "aliases": [],
      "category": "coding-agentic",
      "measures": "Whether a model can synthesize a single correct Python function from a docstring so that it passes the provided unit tests.",
      "maker": "OpenAI (Chen et al.)",
      "year": 2021,
      "format": "164 hand-written Python problems, each with a signature, docstring and hidden unit tests",
      "metric": "pass@k (primarily pass@1)",
      "state": "saturated",
      "sotaScore": "~99%",
      "sotaModel": "Frontier models broadly",
      "sotaDate": "2025-04",
      "leaderboardUrl": "",
      "sourceUrl": "https://arxiv.org/abs/2107.03374",
      "note": "Effectively solved: frontier models sit near 99% pass@1, so it no longer separates strong models and is widely suspected of contamination. Kept only as a historical baseline.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "MBPP",
      "slug": "mbpp",
      "aliases": [
        "Mostly Basic Python Problems"
      ],
      "category": "coding-agentic",
      "measures": "Whether a model can generate short, entry-level Python functions from a natural-language prompt that pass the provided tests.",
      "maker": "Google Research (Austin, Odena et al.)",
      "year": 2021,
      "format": "974 crowd-sourced entry-level Python problems, each with a prompt and about 3 tests",
      "metric": "pass@1",
      "state": "saturated",
      "sotaScore": "~95%+",
      "sotaModel": "Frontier models broadly",
      "sotaDate": "2026-06",
      "leaderboardUrl": "",
      "sourceUrl": "https://arxiv.org/abs/2108.07732",
      "note": "Saturated and partly memorized: many test items appear on open-access sites, so researchers moved to MBPP+ with stronger tests. Kept as a historical baseline.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "GAIA",
      "slug": "gaia",
      "aliases": [
        "General AI Assistants benchmark"
      ],
      "category": "agentic-tooluse",
      "measures": "Whether an AI assistant can answer real-world questions that require multi-step reasoning, multiple modalities, web browsing and general tool use.",
      "maker": "Meta AI and Hugging Face (Mialon, Fourrier et al.)",
      "year": 2023,
      "format": "466 real-world questions across 3 difficulty levels (165 public validation, about 300 held-out test), each needing tools or browsing and a single answer",
      "metric": "Exact-match accuracy against an unambiguous answer",
      "state": "active",
      "sotaScore": "~75%",
      "sotaModel": "HAL agent (Claude Sonnet 4.5)",
      "sotaDate": "2026-06",
      "leaderboardUrl": "https://hal.cs.princeton.edu/gaia",
      "sourceUrl": "https://arxiv.org/abs/2311.12983",
      "note": "Held-out test answers are private and submission-graded, which limits contamination. The Princeton HAL board now reframes GAIA around agent reliability and cost, not just raw accuracy.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "tau-bench",
      "slug": "tau-bench",
      "aliases": [
        "τ-bench",
        "tau2-bench",
        "τ2-bench"
      ],
      "category": "agentic-tooluse",
      "measures": "Whether a tool-using agent can reliably complete customer-service tasks over multi-turn conversations with a simulated user while obeying domain policies.",
      "maker": "Sierra (Yao, Shinn, Narasimhan et al.)",
      "year": 2024,
      "format": "165 tasks in v1 (115 retail, 50 airline) as dynamic dialogues with a simulated user plus domain APIs; later versions add telecom and banking",
      "metric": "pass^k: the probability an agent succeeds across all k independent trials (reliability, not just average success)",
      "state": "active",
      "sotaScore": "",
      "sotaModel": "",
      "sotaDate": "",
      "leaderboardUrl": "",
      "sourceUrl": "https://arxiv.org/abs/2406.12045",
      "note": "Designed to expose unreliability: even strong function-calling models reach a pass^1 of well under 70% on retail and about 35 to 46% on airline, and pass^8 is far lower.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "AgentBench",
      "slug": "agentbench",
      "aliases": [],
      "category": "agentic-tooluse",
      "measures": "How well an LLM acts as an autonomous agent in multi-turn, open-ended decision-making across diverse interactive environments.",
      "maker": "Tsinghua University (THUDM; Liu et al.)",
      "year": 2023,
      "format": "8 interactive environments (operating system, database, knowledge graph, card game, puzzles, household, web shopping, web browsing)",
      "metric": "Per-environment success aggregated into an overall score",
      "state": "active",
      "sotaScore": "",
      "sotaModel": "",
      "sotaDate": "",
      "leaderboardUrl": "https://llmbench.ai/agent",
      "sourceUrl": "https://arxiv.org/abs/2308.03688",
      "note": "One of the earliest broad agentic suites; it showed a large gap between top commercial and open-source models. Now somewhat dated relative to newer computer-use and web-agent benchmarks.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "WebArena",
      "slug": "webarena",
      "aliases": [],
      "category": "agentic-tooluse",
      "measures": "Whether an autonomous agent can complete long-horizon, realistic web tasks (navigation, forms, multi-step workflows) in fully functional self-hosted websites.",
      "maker": "Carnegie Mellon University (Zhou, Xu et al.)",
      "year": 2023,
      "format": "812 long-horizon tasks across self-hosted sites: e-commerce, a social forum, GitLab, a CMS, plus a map and a wiki",
      "metric": "Functional success rate via execution-based reward checking the end state",
      "state": "active",
      "sotaScore": "",
      "sotaModel": "",
      "sotaDate": "",
      "leaderboardUrl": "https://webarena.dev/",
      "sourceUrl": "https://arxiv.org/abs/2307.13854",
      "note": "Paper baseline: the best GPT-4 agent reached only 14.4% versus 78.2% for humans, setting the headroom that drove later web-agent work. Self-hosting limits contamination, but harnesses vary, so cross-paper numbers are not always comparable.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "VisualWebArena",
      "slug": "visualwebarena",
      "aliases": [
        "VWA"
      ],
      "category": "agentic-tooluse",
      "measures": "Whether a multimodal agent can complete visually grounded web tasks that require interpreting images and page layout, not just text.",
      "maker": "Carnegie Mellon University (Koh et al.)",
      "year": 2024,
      "format": "910 visually grounded tasks across three self-hosted environments (Classifieds, Shopping, Reddit)",
      "metric": "Functional success rate via execution-based evaluation",
      "state": "active",
      "sotaScore": "",
      "sotaModel": "",
      "sotaDate": "",
      "leaderboardUrl": "https://jykoh.com/vwa",
      "sourceUrl": "https://arxiv.org/abs/2401.13649",
      "note": "Multimodal extension of WebArena. Paper baseline: the best agent reached about 16% versus about 89% for humans, showing weak visual grounding in web agents.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "OSWorld",
      "slug": "osworld",
      "aliases": [
        "OSWorld-Verified"
      ],
      "category": "agentic-tooluse",
      "measures": "Whether a multimodal agent can operate a real computer (desktop apps, file I/O, multi-app workflows) to complete open-ended tasks in a live virtual machine.",
      "maker": "XLANG Lab, University of Hong Kong (Xie et al.)",
      "year": 2024,
      "format": "369 real computer-use tasks spanning web and desktop apps, file I/O and cross-application workflows, with an OSWorld-Verified track",
      "metric": "Execution-based success rate via per-task verification scripts that inspect machine state",
      "state": "active",
      "sotaScore": "",
      "sotaModel": "",
      "sotaDate": "",
      "leaderboardUrl": "https://os-world.github.io/",
      "sourceUrl": "https://arxiv.org/abs/2404.07972",
      "note": "The leading live computer-use benchmark. Execution-based scoring in a real VM makes contamination hard but introduces environment-drift and flakiness concerns.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "BrowseComp",
      "slug": "browsecomp",
      "aliases": [
        "Browsing Competition"
      ],
      "category": "agentic-tooluse",
      "measures": "Whether a browsing agent can persistently navigate the open web to locate a single hard-to-find, entangled fact.",
      "maker": "OpenAI (Wei, Sun et al.)",
      "year": 2025,
      "format": "1,266 short-answer questions, each with a single answer that is hard to find but easy to verify",
      "metric": "Accuracy via model-graded semantic equivalence to the reference answer",
      "state": "active",
      "sotaScore": "51.5%",
      "sotaModel": "OpenAI Deep Research (launch paper)",
      "sotaDate": "2025-04",
      "leaderboardUrl": "",
      "sourceUrl": "https://arxiv.org/abs/2504.12516",
      "note": "The dataset is canary-stringed and not posted in plaintext to limit leakage, and questions are verified unsolvable by earlier models. The 51.5% figure is the paper's own best result, likely surpassed since but not via a maintained board.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "MLE-bench",
      "slug": "mle-bench",
      "aliases": [],
      "category": "agentic-tooluse",
      "measures": "Whether an AI agent can do end-to-end machine-learning engineering (data prep, training, experimentation, submission) at the level of human Kaggle competitors.",
      "maker": "OpenAI (Chan et al.)",
      "year": 2024,
      "format": "75 ML-engineering Kaggle competitions; the agent produces submissions scored against each competition's real leaderboard",
      "metric": "Medal rate (fraction of competitions reaching bronze/silver/gold thresholds)",
      "state": "active",
      "sotaScore": "16.9% (paper baseline)",
      "sotaModel": "o1-preview with AIDE scaffolding",
      "sotaDate": "2024-10",
      "leaderboardUrl": "https://github.com/openai/mle-bench",
      "sourceUrl": "https://arxiv.org/abs/2410.07095",
      "note": "Uses real Kaggle competitions, so contamination is a stated risk the paper checks for. The official board paused new submissions in 2026 pending a fairer process, so 16.9% is the paper baseline, not a current frontier figure.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "ARC-AGI-1",
      "slug": "arc-agi-1",
      "aliases": [
        "ARC-AGI",
        "Abstraction and Reasoning Corpus"
      ],
      "category": "reasoning",
      "measures": "Whether a system can infer the abstract rule of a novel visual grid puzzle from a few examples and apply it to a new input.",
      "maker": "Francois Chollet (ARC Prize Foundation)",
      "year": 2019,
      "format": "1,000 grid-based reasoning tasks across public, semi-private and private eval sets, each giving a few input-output examples and a test input",
      "metric": "pass@2 exact-grid-match accuracy",
      "state": "active",
      "sotaScore": "87.5% (high compute)",
      "sotaModel": "OpenAI o3-preview",
      "sotaDate": "2024-12",
      "leaderboardUrl": "https://arcprize.org/leaderboard",
      "sourceUrl": "https://arcprize.org/arc-agi/1",
      "note": "From Chollet's 2019 paper On the Measure of Intelligence; it resisted AI until late 2024, when o3-preview cleared the 85% target at very high compute cost. Now largely beaten, which is why ARC-AGI-2 was built.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "ARC-AGI-2",
      "slug": "arc-agi-2",
      "aliases": [
        "ARC-AGI v2"
      ],
      "category": "reasoning",
      "measures": "The same fluid-intelligence test as ARC-AGI-1, but with harder, contamination-resistant tasks that stay easy for humans yet very hard for AI.",
      "maker": "ARC Prize Foundation (Chollet et al.)",
      "year": 2025,
      "format": "1,360 grid tasks (1,000 training, 120 public, 120 semi-private, 120 private eval), each solvable pass@2 by at least two humans",
      "metric": "pass@2 exact-grid-match accuracy, reported with a cost-per-task efficiency metric",
      "state": "active",
      "sotaScore": "54% (verified)",
      "sotaModel": "Poetiq (Gemini-based solver)",
      "sotaDate": "2025-12",
      "leaderboardUrl": "https://arcprize.org/leaderboard",
      "sourceUrl": "https://arcprize.org/arc-agi/2",
      "note": "Far from solved: the verified semi-private SOTA is 54% versus an 85% target. It uses unpublished semi-private and private eval sets to resist contamination, so only ARC-Prize-verified numbers should be trusted over self-reported ones.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "GPQA Diamond",
      "slug": "gpqa-diamond",
      "aliases": [
        "GPQA-Diamond"
      ],
      "category": "reasoning",
      "measures": "Graduate and PhD-level multiple-choice scientific reasoning in biology, physics and chemistry, on questions designed to be unanswerable by quick web search.",
      "maker": "Rein et al. (NYU, Cohere, Anthropic)",
      "year": 2023,
      "format": "198 expert-written four-option questions (the hardest, highest-agreement subset of the 448-question GPQA set)",
      "metric": "Multiple-choice accuracy (random baseline 25%, PhD-expert baseline about 70%)",
      "state": "saturated",
      "sotaScore": "~94%",
      "sotaModel": "Gemini 3.1 Pro Preview",
      "sotaDate": "2026-02",
      "leaderboardUrl": "https://epoch.ai/benchmarks/gpqa-diamond",
      "sourceUrl": "https://arxiv.org/abs/2311.12022",
      "note": "The Google-proof design tests reasoning over retrieval. By 2026 top models exceed the human-expert baseline and sit in the low-to-mid 90s, so it is effectively saturated. Only 198 items, so a few questions swing the score.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "Humanity's Last Exam",
      "slug": "humanitys-last-exam",
      "aliases": [
        "HLE"
      ],
      "category": "reasoning",
      "measures": "Frontier, closed-ended expert knowledge and reasoning across more than 100 academic disciplines at the limit of human expertise.",
      "maker": "Center for AI Safety (CAIS) and Scale AI",
      "year": 2025,
      "format": "2,500 public expert-level questions (text and multimodal) across 100+ subjects, mostly short-answer and multiple-choice, plus a private held-out set",
      "metric": "Accuracy (exact match / multiple-choice), often reported with a calibration metric",
      "state": "active",
      "sotaScore": "53.3%",
      "sotaModel": "Claude Fable 5 (Max Effort)",
      "sotaDate": "2026-06",
      "leaderboardUrl": "https://artificialanalysis.ai/evaluations/humanitys-last-exam",
      "sourceUrl": "https://arxiv.org/abs/2501.14249",
      "note": "Released Jan 2025 by CAIS and Scale AI and published in Nature in Jan 2026. Designed as a very hard multi-domain expert exam where early-2025 models scored single digits; still unsaturated at about 53%, with a private holdout to detect overfitting.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "BIG-Bench Hard",
      "slug": "big-bench-hard",
      "aliases": [
        "BBH"
      ],
      "category": "reasoning",
      "measures": "A suite of multi-step reasoning tasks (logic, arithmetic, algorithmic, commonsense) on which pre-2022 models trailed average human raters.",
      "maker": "Suzgun et al. (Google Research and Stanford)",
      "year": 2022,
      "format": "23 tasks drawn from BIG-Bench (about 6,500 examples) spanning logical, arithmetic, commonsense and algorithmic reasoning",
      "metric": "Per-task accuracy averaged across the 23 tasks",
      "state": "saturated",
      "sotaScore": "",
      "sotaModel": "",
      "sotaDate": "",
      "leaderboardUrl": "",
      "sourceUrl": "https://github.com/suzgunmirac/BIG-Bench-Hard",
      "note": "The canonical chain-of-thought demonstrator and a standard reasoning suite. Frontier models now score near the ceiling, so it is effectively saturated and superseded by BIG-Bench Extra Hard.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "MuSR",
      "slug": "musr",
      "aliases": [
        "Multistep Soft Reasoning"
      ],
      "category": "reasoning",
      "measures": "Multistep commonsense reasoning embedded in long natural-language narratives such as murder mysteries, object placement and team allocation.",
      "maker": "Sprague, Ye, Durrett et al. (UT Austin)",
      "year": 2023,
      "format": "Algorithmically generated long narratives (about 1,000 words) across three domains, with multiple-choice reasoning questions",
      "metric": "Multiple-choice accuracy",
      "state": "active",
      "sotaScore": "",
      "sotaModel": "",
      "sotaDate": "",
      "leaderboardUrl": "https://llm-stats.com/benchmarks/musr",
      "sourceUrl": "https://arxiv.org/abs/2310.16049",
      "note": "Its neurosymbolic synthetic-to-natural generation makes it cheap to regenerate and harder to memorize than fixed sets. Used as a component of the Hugging Face Open LLM Leaderboard v2.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "FrontierMath",
      "slug": "frontiermath",
      "aliases": [],
      "category": "math",
      "measures": "Research-level original mathematics requiring hours to days of expert effort, across number theory, analysis, algebraic geometry and more.",
      "maker": "Epoch AI",
      "year": 2024,
      "format": "338 original, unpublished problems (after the June 2026 v2 correction): 295 in Tiers 1 to 3 plus 43 exceptionally hard Tier 4 problems, each with a verifiable answer",
      "metric": "Accuracy (fraction with a correct, automatically verifiable final answer)",
      "state": "active",
      "sotaScore": "52.4%",
      "sotaModel": "GPT-5.5 Pro",
      "sotaDate": "2026-05",
      "leaderboardUrl": "https://epoch.ai/benchmarks/frontiermath",
      "sourceUrl": "https://epoch.ai/frontiermath",
      "note": "Vetted by expert mathematicians and kept largely held back to prevent contamination. Epoch discloses it was funded by OpenAI, who has exclusive access to a subset, a notable conflict of interest. A v2 update corrected errors in 42% of problems, so pre-v2 scores are not comparable.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "AIME 2025",
      "slug": "aime-2025",
      "aliases": [
        "AIME (LLM eval)"
      ],
      "category": "math",
      "measures": "Olympiad-track competition mathematics at the level of the American Invitational Mathematics Examination, used as a high-difficulty LLM eval.",
      "maker": "Mathematical Association of America; adopted as an LLM eval by the community",
      "year": 2025,
      "format": "30 problems (AIME I and II 2025), each with an integer answer from 000 to 999",
      "metric": "Exact-match accuracy, usually pass@1 averaged over samples",
      "state": "saturated",
      "sotaScore": "100%",
      "sotaModel": "Multiple frontier reasoning models",
      "sotaDate": "2026-06",
      "leaderboardUrl": "https://matharena.ai/",
      "sourceUrl": "https://matharena.ai/",
      "note": "Fresh contest problems mitigate contamination only until they circulate online. By mid-2026 several top reasoning models hit a perfect 30/30, so it is saturated, and the integer-answer format allows lucky guesses on individual items.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "MATH",
      "slug": "math",
      "aliases": [
        "Hendrycks MATH",
        "MATH-500"
      ],
      "category": "math",
      "measures": "Step-by-step solving of high-school competition mathematics across algebra, geometry, number theory, probability and precalculus.",
      "maker": "Hendrycks et al. (UC Berkeley)",
      "year": 2021,
      "format": "12,500 competition problems with worked solutions; the 500-item MATH-500 subset is the common modern eval",
      "metric": "Exact-match accuracy on the final boxed answer",
      "state": "saturated",
      "sotaScore": "~99% (MATH-500)",
      "sotaModel": "GPT-5",
      "sotaDate": "2026-04",
      "leaderboardUrl": "https://llm-stats.com/benchmarks/math-500",
      "sourceUrl": "https://arxiv.org/abs/2103.03874",
      "note": "Once a frontier benchmark, now largely saturated, with top models near 99% on MATH-500, so it serves mainly as a smoke test. As a fixed public set it carries real contamination risk.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "GSM8K",
      "slug": "gsm8k",
      "aliases": [
        "Grade School Math 8K"
      ],
      "category": "math",
      "measures": "Multi-step grade-school arithmetic word-problem reasoning.",
      "maker": "OpenAI (Cobbe et al.)",
      "year": 2021,
      "format": "8,500 grade-school word problems (7,500 train, 1,000 test), each solvable in a few elementary steps",
      "metric": "Exact-match accuracy on the final numeric answer",
      "state": "saturated",
      "sotaScore": "~99.6%",
      "sotaModel": "Frontier models broadly",
      "sotaDate": "2026-05",
      "leaderboardUrl": "https://llm-stats.com/benchmarks/gsm8k",
      "sourceUrl": "https://arxiv.org/abs/2110.14168",
      "note": "Fully saturated and now used mainly as a smoke test. The GSM-Symbolic and GSM1k follow-ups showed some apparent gains reflect contamination and template memorization rather than robustness.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "Omni-MATH",
      "slug": "omni-math",
      "aliases": [],
      "category": "math",
      "measures": "Olympiad-level mathematical reasoning across a broad range of subdomains and difficulty levels.",
      "maker": "Gao, Song, Cai et al. (Peking University and collaborators)",
      "year": 2024,
      "format": "4,428 human-annotated competition problems across 33+ subdomains and 10+ difficulty levels",
      "metric": "Accuracy, scored with an LLM-based verifier (Omni-Judge)",
      "state": "active",
      "sotaScore": "",
      "sotaModel": "",
      "sotaDate": "",
      "leaderboardUrl": "https://omni-math.github.io/",
      "sourceUrl": "https://arxiv.org/abs/2410.07985",
      "note": "At release even o1-preview scored only about 53 to 61%, so it was meaningfully unsaturated. As a published static set it carries contamination risk for newer models.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "MathArena",
      "slug": "matharena",
      "aliases": [
        "MathArena.ai"
      ],
      "category": "math",
      "measures": "Mathematical reasoning and proof-writing on freshly released competition problems, evaluated before they can enter training data.",
      "maker": "ETH Zurich (SRI Lab)",
      "year": 2025,
      "format": "A rolling set of recent competitions (AIME, HMMT, USAMO, IMO, Putnam and others), evaluated as each contest is released; final-answer problems auto-graded, proofs expert-graded",
      "metric": "Per-competition accuracy and an aggregate expected-performance score",
      "state": "active",
      "sotaScore": "81.1% (aggregate)",
      "sotaModel": "GPT-5.5 (xhigh)",
      "sotaDate": "2026-04",
      "leaderboardUrl": "https://matharena.ai/",
      "sourceUrl": "https://arxiv.org/abs/2505.23281",
      "note": "Its core value is contamination resistance, testing models only on problems published after their training cutoff. Final-answer competitions are largely solved by top models, while proof-based ones (e.g. IMO 2025) remain far from saturated.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "MMLU",
      "slug": "mmlu",
      "aliases": [
        "Massive Multitask Language Understanding"
      ],
      "category": "knowledge",
      "measures": "Broad academic and professional knowledge across 57 subjects via four-choice multiple-choice questions.",
      "maker": "Hendrycks et al. (UC Berkeley and collaborators)",
      "year": 2021,
      "format": "About 15,900 four-option questions across 57 subjects (STEM, humanities, social sciences, professional exams)",
      "metric": "Accuracy",
      "state": "saturated",
      "sotaScore": "~93%",
      "sotaModel": "Qwen3.7 Max",
      "sotaDate": "2026-06",
      "leaderboardUrl": "https://llm-stats.com/benchmarks/mmlu",
      "sourceUrl": "https://arxiv.org/abs/2009.03300",
      "note": "Saturated, with all frontier models above 90%, so rank differences are mostly noise. It is demonstrably contaminated and contains ground-truth errors, which is why MMLU-Pro and MMLU-Redux were created.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "MMLU-Pro",
      "slug": "mmlu-pro",
      "aliases": [],
      "category": "knowledge",
      "measures": "Harder multi-task reasoning and knowledge designed to de-saturate MMLU and reward deliberate reasoning over recall.",
      "maker": "TIGER-Lab (Wang et al., University of Waterloo)",
      "year": 2024,
      "format": "About 12,000 questions across 14 disciplines, expanded from 4 to 10 answer options to cut the guessing baseline",
      "metric": "Accuracy",
      "state": "active",
      "sotaScore": "~90%",
      "sotaModel": "Gemini 3 Pro Preview",
      "sotaDate": "2026-06",
      "leaderboardUrl": "https://huggingface.co/spaces/TIGER-Lab/MMLU-Pro",
      "sourceUrl": "https://arxiv.org/abs/2406.01574",
      "note": "Built to replace saturated MMLU: 10 options and reasoning-heavy items drop scores 16 to 33 points and separate frontier models better. By mid-2026 the top tier is compressing again, so it too is approaching saturation.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "MMLU-Redux",
      "slug": "mmlu-redux",
      "aliases": [
        "Are We Done with MMLU?"
      ],
      "category": "knowledge",
      "measures": "A re-annotated, error-corrected subset of MMLU used to measure true knowledge accuracy without the original's label noise.",
      "maker": "Gema et al. (University of Edinburgh and collaborators)",
      "year": 2024,
      "format": "A manually re-annotated subset (3,000 questions across 30 subjects, later expanded to 5,700 across all 57) using a defined error taxonomy",
      "metric": "Accuracy on cleaned labels",
      "state": "active",
      "sotaScore": "",
      "sotaModel": "",
      "sotaDate": "",
      "leaderboardUrl": "https://llm-stats.com/benchmarks/mmlu-redux",
      "sourceUrl": "https://arxiv.org/abs/2406.04127",
      "note": "A diagnostic dataset, not a race: the authors found about 6.5% of MMLU questions contain ground-truth errors, and cleaning the labels materially reorders model rankings.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "SimpleQA",
      "slug": "simpleqa",
      "aliases": [],
      "category": "knowledge",
      "measures": "Short-form parametric factuality: whether a model answers single-answer fact-seeking questions correctly and abstains when unsure.",
      "maker": "OpenAI (Wei, Karina et al.)",
      "year": 2024,
      "format": "4,326 short fact-seeking questions, each with a single indisputable answer, adversarially collected against GPT-4",
      "metric": "Accuracy, plus correct-given-attempted and an F-score balancing attempts against accuracy",
      "state": "active",
      "sotaScore": "",
      "sotaModel": "",
      "sotaDate": "",
      "leaderboardUrl": "https://llm-stats.com/benchmarks/simpleqa",
      "sourceUrl": "https://arxiv.org/abs/2411.04368",
      "note": "A hard factuality test where pure parametric models historically score well under 50%. Aggregator numbers near 0.97 conflate browsing or correct-given-attempted with raw accuracy, so a clean SOTA is hard to assert.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "RULER",
      "slug": "ruler",
      "aliases": [],
      "category": "long-context",
      "measures": "The real effective context length of a model by testing retrieval, multi-hop tracing, aggregation and QA at increasing sequence lengths.",
      "maker": "NVIDIA (Hsieh, Sun et al.)",
      "year": 2024,
      "format": "13 synthetic tasks across 4 categories (retrieval, multi-hop tracing, aggregation, QA), evaluated from 4K to 128K+ tokens",
      "metric": "Weighted-average accuracy across tasks and lengths; effective length is the longest length still above threshold",
      "state": "active",
      "sotaScore": "",
      "sotaModel": "",
      "sotaDate": "",
      "leaderboardUrl": "https://github.com/NVIDIA/RULER",
      "sourceUrl": "https://arxiv.org/abs/2404.06654",
      "note": "Fully synthetic and regenerable, so contamination is minimal and it is hard to game. It exposes that claimed context windows are often far longer than the effective one.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "MRCR",
      "slug": "mrcr",
      "aliases": [
        "Multi-Round Co-reference Resolution",
        "OpenAI-MRCR"
      ],
      "category": "long-context",
      "measures": "Whether a model can distinguish and retrieve the correct one among multiple near-identical requests buried in a long multi-turn conversation.",
      "maker": "Google DeepMind (Michelangelo); open-source variant by OpenAI",
      "year": 2024,
      "format": "Synthetic multi-turn conversations embedding 2, 4 or 8 duplicate requests among distractors, across context bins from 4K up to 1M tokens",
      "metric": "Similarity of the model output to the target instance, gated by a required answer-prefix",
      "state": "active",
      "sotaScore": "",
      "sotaModel": "",
      "sotaDate": "",
      "leaderboardUrl": "",
      "sourceUrl": "https://arxiv.org/abs/2409.12640",
      "note": "Primarily a vendor-reported eval (Google and OpenAI publish their own numbers), with no neutral live leaderboard. Synthetic generation limits contamination; difficulty scales sharply with needle count and context length.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "NoLiMa",
      "slug": "nolima",
      "aliases": [
        "No Literal Matching"
      ],
      "category": "long-context",
      "measures": "Long-context retrieval and reasoning when the question and the target fact share minimal literal word overlap, forcing latent association rather than keyword matching.",
      "maker": "Adobe Research and LMU Munich (Modarressi et al.)",
      "year": 2025,
      "format": "An associative needle-in-a-haystack set where needles and questions are lexically disjoint, evaluated from short contexts up to 32K (and 128K in extended runs)",
      "metric": "Accuracy at each length, relative to the model's short-context baseline",
      "state": "active",
      "sotaScore": "",
      "sotaModel": "",
      "sotaDate": "",
      "leaderboardUrl": "https://github.com/adobe-research/NoLiMa",
      "sourceUrl": "https://arxiv.org/abs/2502.05167",
      "note": "Designed to defeat the literal-match shortcut that makes vanilla needle tests look saturated. 11 of 13 models fell below half their short-context baseline by 32K, exposing real long-context weakness.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "Needle-in-a-Haystack",
      "slug": "needle-in-a-haystack",
      "aliases": [
        "NIAH"
      ],
      "category": "long-context",
      "measures": "Whether a model can recall a single planted fact (the needle) inserted at varying depths within a long context (the haystack).",
      "maker": "Greg Kamradt (independent)",
      "year": 2023,
      "format": "A planted sentence is inserted at many depths across many context lengths; the model is asked to retrieve it, producing a depth-by-length recall heatmap",
      "metric": "Retrieval accuracy at each depth and length cell",
      "state": "saturated",
      "sotaScore": "",
      "sotaModel": "",
      "sotaDate": "",
      "leaderboardUrl": "",
      "sourceUrl": "https://github.com/gkamradt/LLMTest_NeedleInAHaystack",
      "note": "The original informal long-context test, widely used by the labs. Now largely saturated on the literal version, which is exactly why harder successors (RULER, NoLiMa, MRCR) were built.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "LongBench",
      "slug": "longbench",
      "aliases": [
        "LongBench v2"
      ],
      "category": "long-context",
      "measures": "Comprehensive long-context understanding across realistic tasks (QA, summarization, few-shot, code, synthetic) in English and Chinese.",
      "maker": "Tsinghua University (THUDM; Bai et al.)",
      "year": 2023,
      "format": "v1: 21 datasets across 6 task types, bilingual. v2 (Dec 2024): 503 hard multiple-choice questions with contexts from 8K to 2M words",
      "metric": "v1: per-task automatic metrics. v2: multiple-choice accuracy",
      "state": "active",
      "sotaScore": "57.7% (v2, with reasoning)",
      "sotaModel": "o1-preview",
      "sotaDate": "2024-12",
      "leaderboardUrl": "https://longbench2.github.io/",
      "sourceUrl": "https://arxiv.org/abs/2412.15204",
      "note": "v1 is now partly saturated; v2 is the harder current standard (best direct-answer model about 50%, o1-preview with reasoning 57.7% versus 53.7% for humans). v2 emphasizes reasoning to limit contamination.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "MMMU",
      "slug": "mmmu",
      "aliases": [
        "Massive Multi-discipline Multimodal Understanding"
      ],
      "category": "multimodal",
      "measures": "College-level multimodal understanding and reasoning over images, diagrams, charts and text across many disciplines.",
      "maker": "MMMU team (Yue et al.)",
      "year": 2023,
      "format": "About 11,500 questions across 6 disciplines and 30 subjects, mixing multiple-choice and open-ended items with 30+ image types",
      "metric": "Accuracy",
      "state": "active",
      "sotaScore": "~86%",
      "sotaModel": "Qwen3.6 Plus",
      "sotaDate": "2026-06",
      "leaderboardUrl": "https://mmmu-benchmark.github.io/",
      "sourceUrl": "https://arxiv.org/abs/2311.16502",
      "note": "The standard expert-level multimodal QA benchmark; GPT-4V scored about 56% at release, leaving headroom that frontier models have since closed.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "MMMU-Pro",
      "slug": "mmmu-pro",
      "aliases": [],
      "category": "multimodal",
      "measures": "A harder, contamination-resistant version of MMMU that forces genuine visual reasoning rather than text-only shortcuts.",
      "maker": "MMMU team (Yue et al.)",
      "year": 2024,
      "format": "Filters out text-only-answerable questions, expands options to up to 10, and adds a setting where the question is embedded inside a screenshot image",
      "metric": "Accuracy",
      "state": "active",
      "sotaScore": "~84%",
      "sotaModel": "Gemini 3.5 Flash",
      "sotaDate": "2026-06",
      "leaderboardUrl": "https://mmmu-benchmark.github.io/",
      "sourceUrl": "https://arxiv.org/abs/2409.02813",
      "note": "Scores drop 16 to 27 points versus MMMU because the augmented options and image-embedded questions remove text shortcuts and probe true vision-language reasoning.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "MathVista",
      "slug": "mathvista",
      "aliases": [],
      "category": "multimodal",
      "measures": "Mathematical and quantitative reasoning grounded in visual contexts such as figures, charts, geometry and scientific diagrams.",
      "maker": "Lu et al. (UCLA, University of Washington, Microsoft Research)",
      "year": 2023,
      "format": "6,141 examples from 28 existing multimodal datasets plus 3 new ones, mixing multiple-choice and free-form numeric answers",
      "metric": "Accuracy",
      "state": "active",
      "sotaScore": "~91% (testmini)",
      "sotaModel": "Seed 2.1 Pro",
      "sotaDate": "2026-06",
      "leaderboardUrl": "https://mathvista.github.io/",
      "sourceUrl": "https://arxiv.org/abs/2310.02255",
      "note": "The standard visual-math benchmark; GPT-4V scored about 50% at release versus 60% for humans, but frontier multimodal models now exceed 90% on the testmini split, so it is nearing saturation.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "Video-MME",
      "slug": "video-mme",
      "aliases": [
        "Video Multi-Modal Evaluation"
      ],
      "category": "multimodal",
      "measures": "Comprehensive video understanding by multimodal LLMs across short, medium and long clips.",
      "maker": "MME-Benchmarks team (Fu et al.)",
      "year": 2024,
      "format": "900 manually annotated videos (254 total hours, 11 seconds to 1 hour) across 6 domains, yielding 2,700 multiple-choice QA pairs",
      "metric": "Accuracy (tested with and without subtitles)",
      "state": "active",
      "sotaScore": "~89%",
      "sotaModel": "Seed 2.1 Pro",
      "sotaDate": "2026-06",
      "leaderboardUrl": "https://github.com/MME-Benchmarks/Video-MME",
      "sourceUrl": "https://arxiv.org/abs/2405.21075",
      "note": "A leading video-understanding benchmark; subtitle access materially raises scores, so the no-subtitle setting is the cleaner capability signal.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "LMArena",
      "slug": "lmarena",
      "aliases": [
        "Chatbot Arena",
        "LMSYS Chatbot Arena"
      ],
      "category": "preference-holistic",
      "measures": "Crowdsourced human preference between two anonymized model responses, aggregated into a relative ranking, not an objective capability.",
      "maker": "LMArena (formerly LMSYS; Zheng, Chiang et al.)",
      "year": 2023,
      "format": "Open-ended head-to-head battles: users submit a prompt and vote on the better of two blind responses; tens of millions of votes",
      "metric": "Elo / Bradley-Terry pairwise rating (an Arena Score)",
      "state": "active",
      "sotaScore": "~1510 Elo",
      "sotaModel": "Claude Opus 4.8",
      "sotaDate": "2026-06",
      "leaderboardUrl": "https://lmarena.ai/leaderboard",
      "sourceUrl": "https://arxiv.org/abs/2403.04132",
      "note": "Measures human preference and is heavily style-influenced (length, formatting, tone), not raw capability. The 2025 paper The Leaderboard Illusion argues private testing and uneven deprecation bias the ratings toward large labs.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "MT-Bench",
      "slug": "mt-bench",
      "aliases": [
        "Multi-Turn Benchmark"
      ],
      "category": "preference-holistic",
      "measures": "Instruction-following and conversational quality on multi-turn prompts, scored automatically by a strong LLM judge.",
      "maker": "LMSYS (Zheng et al., UC Berkeley)",
      "year": 2023,
      "format": "80 curated two-turn questions across 8 categories, with answers rated by an LLM judge",
      "metric": "LLM-as-judge score (1 to 10 scale, averaged)",
      "state": "saturated",
      "sotaScore": "",
      "sotaModel": "",
      "sotaDate": "",
      "leaderboardUrl": "https://llm-stats.com/benchmarks/mt-bench",
      "sourceUrl": "https://arxiv.org/abs/2306.05685",
      "note": "Introduced LLM-as-judge alongside Chatbot Arena. Now legacy and saturated: frontier models cluster near the 9 to 10 ceiling, and it carries known judge biases (position, verbosity).",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "Artificial Analysis Intelligence Index",
      "slug": "artificial-analysis-intelligence-index",
      "aliases": [
        "AA Intelligence Index",
        "AAII"
      ],
      "category": "preference-holistic",
      "measures": "A composite index of overall model intelligence aggregating performance across reasoning, coding, knowledge, science and agentic tasks.",
      "maker": "Artificial Analysis (independent)",
      "year": 2024,
      "format": "A weighted aggregate of multiple sub-evaluations; the current version combines 9 evals including Humanity's Last Exam, GPQA Diamond, Terminal-Bench and SciCode",
      "metric": "Composite index score (0 to 100 aggregate)",
      "state": "active",
      "sotaScore": "~60 (index)",
      "sotaModel": "Claude Fable 5",
      "sotaDate": "2026-06",
      "leaderboardUrl": "https://artificialanalysis.ai/evaluations/artificial-analysis-intelligence-index",
      "sourceUrl": "https://artificialanalysis.ai/methodology/intelligence-benchmarking",
      "note": "A vendor-maintained composite, not a single test: absolute values shift when the component set is revised, so it is most useful as a one-number capability proxy rather than a precise score.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "HELM",
      "slug": "helm",
      "aliases": [
        "Holistic Evaluation of Language Models",
        "Stanford HELM"
      ],
      "category": "preference-holistic",
      "measures": "Multi-metric holistic evaluation across many scenarios, reporting accuracy alongside calibration, robustness, fairness, bias, toxicity and efficiency.",
      "maker": "Stanford CRFM (Liang, Bommasani et al.)",
      "year": 2022,
      "format": "An open-source framework running models over dozens of scenarios with 7 metric categories, now spanning many sub-leaderboards (Lite, Classic, MMLU, VHELM, MedHELM)",
      "metric": "Multi-metric (per-metric scores across scenarios; no single headline number)",
      "state": "active",
      "sotaScore": "",
      "sotaModel": "",
      "sotaDate": "",
      "leaderboardUrl": "https://crfm.stanford.edu/helm/",
      "sourceUrl": "https://arxiv.org/abs/2211.09110",
      "note": "A framework, not one score: its point is transparency and breadth across many axes rather than a single ranking, so specific results live on the per-leaderboard pages.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "TruthfulQA",
      "slug": "truthfulqa",
      "aliases": [],
      "category": "safety-factuality",
      "measures": "Whether a model avoids repeating common human misconceptions when answering questions, rather than imitating popular falsehoods.",
      "maker": "Lin, Hilton, Evans (Oxford and OpenAI)",
      "year": 2021,
      "format": "817 questions across 38 categories, designed so a naive imitator gives a false answer; generation and multiple-choice formats",
      "metric": "% truthful (and % truthful-and-informative)",
      "state": "active",
      "sotaScore": "",
      "sotaModel": "",
      "sotaDate": "",
      "leaderboardUrl": "",
      "sourceUrl": "https://arxiv.org/abs/2109.07958",
      "note": "At release the best model was truthful on 58% of questions versus 94% for humans, and larger models were often less truthful. Still cited but aging and partly saturated by RLHF-tuned models, with no single live leaderboard.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "HaluEval",
      "slug": "halueval",
      "aliases": [
        "Hallucination Evaluation Benchmark"
      ],
      "category": "safety-factuality",
      "measures": "A model's ability to recognize hallucinated content across question answering, knowledge-grounded dialogue and summarization.",
      "maker": "Li et al. (Renmin University of China)",
      "year": 2023,
      "format": "35,000 samples: 5,000 human-annotated general responses plus 30,000 task-specific generated examples",
      "metric": "Hallucination-recognition accuracy (faithful vs hallucinated)",
      "state": "active",
      "sotaScore": "",
      "sotaModel": "",
      "sotaDate": "",
      "leaderboardUrl": "",
      "sourceUrl": "https://arxiv.org/abs/2305.11747",
      "note": "Tests whether a model can detect hallucinations rather than how often it hallucinates, and found ChatGPT fabricated unverifiable content in about 19.5% of queries. Distributed as a dataset with no official live leaderboard.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    },
    {
      "name": "Vectara Hallucination Leaderboard",
      "slug": "vectara-hallucination-leaderboard",
      "aliases": [
        "HHEM Leaderboard",
        "Hughes Hallucination Evaluation Model"
      ],
      "category": "safety-factuality",
      "measures": "How often a model introduces unsupported content when summarizing a provided source document, i.e. faithfulness in closed-book summarization.",
      "maker": "Vectara (Hughes et al.)",
      "year": 2023,
      "format": "Each model summarizes short source documents using only the given text, over 7,700 articles; summaries are scored by the HHEM detector model",
      "metric": "Hallucination rate (% of summaries judged unfaithful; lower is better)",
      "state": "active",
      "sotaScore": "1.8% (lower is better)",
      "sotaModel": "antgroup/finix-s1-32b",
      "sotaDate": "2026-05",
      "leaderboardUrl": "https://github.com/vectara/hallucination-leaderboard",
      "sourceUrl": "https://github.com/vectara/hallucination-leaderboard",
      "note": "Measures grounded summarization faithfulness, not open-domain factuality: a low rate means the summary stays inside the source. Scores depend on Vectara's own detector model, and the board updates continuously.",
      "verifiedDate": "2026-06-29",
      "status": "representative"
    }
  ]
}