diff --git a/src/helm/benchmark/static_build/assets/index-15f51007.js b/src/helm/benchmark/static_build/assets/index-15f51007.js
deleted file mode 100644
index 7dd3636a226..00000000000
--- a/src/helm/benchmark/static_build/assets/index-15f51007.js
+++ /dev/null
@@ -1,10 +0,0 @@
-import{r as i,a as Ze,L as b,O as es,d as ss,u as fe,f as Z,H as ts,h as ns,i as U,R as rs}from"./react-d4a0b69b.js";import{g as O,b as z,m as Y,s as pe,a as as,d as ie,y as ls,c as ce,e as ee,l as se}from"./tremor-54a99cc4.js";import"./recharts-6d337683.js";(function(){const t=document.createElement("link").relList;if(t&&t.supports&&t.supports("modulepreload"))return;for(const a of document.querySelectorAll('link[rel="modulepreload"]'))r(a);new MutationObserver(a=>{for(const l of a)if(l.type==="childList")for(const c of l.addedNodes)c.tagName==="LINK"&&c.rel==="modulepreload"&&r(c)}).observe(document,{childList:!0,subtree:!0});function n(a){const l={};return a.integrity&&(l.integrity=a.integrity),a.referrerPolicy&&(l.referrerPolicy=a.referrerPolicy),a.crossOrigin==="use-credentials"?l.credentials="include":a.crossOrigin==="anonymous"?l.credentials="omit":l.credentials="same-origin",l}function r(a){if(a.ep)return;a.ep=!0;const l=n(a);fetch(a.href,l)}})();var ge={exports:{}},$={};/**
- * @license React
- * react-jsx-runtime.production.min.js
- *
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */var is=i,cs=Symbol.for("react.element"),os=Symbol.for("react.fragment"),ds=Object.prototype.hasOwnProperty,ms=is.__SECRET_INTERNALS_DO_NOT_USE_OR_YOU_WILL_BE_FIRED.ReactCurrentOwner,hs={key:!0,ref:!0,__self:!0,__source:!0};function je(s,t,n){var r,a={},l=null,c=null;n!==void 0&&(l=""+n),t.key!==void 0&&(l=""+t.key),t.ref!==void 0&&(c=t.ref);for(r in t)ds.call(t,r)&&!hs.hasOwnProperty(r)&&(a[r]=t[r]);if(s&&s.defaultProps)for(r in t=s.defaultProps,t)a[r]===void 0&&(a[r]=t[r]);return{$$typeof:cs,type:s,key:l,ref:c,props:a,_owner:ms.current}}$.Fragment=os;$.jsx=je;$.jsxs=je;ge.exports=$;var e=ge.exports,X={},oe=Ze;X.createRoot=oe.createRoot,X.hydrateRoot=oe.hydrateRoot;function xs({title:s,titleId:t,...n},r){return i.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?i.createElement("title",{id:t},s):null,i.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M3.75 6.75h16.5M3.75 12h16.5m-16.5 5.25h16.5"}))}const us=i.forwardRef(xs),be=us;function fs({title:s,titleId:t,...n},r){return i.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?i.createElement("title",{id:t},s):null,i.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M9 12.75L11.25 15 15 9.75M21 12a9 9 0 11-18 0 9 9 0 0118 0z"}))}const ps=i.forwardRef(fs),gs=ps;function js({title:s,titleId:t,...n},r){return i.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?i.createElement("title",{id:t},s):null,i.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M9.75 9.75l4.5 4.5m0-4.5l-4.5 4.5M21 12a9 9 0 11-18 0 9 9 0 0118 0z"}))}const 
bs=i.forwardRef(js),ws=bs,we=""+new URL("crfm-logo-74391ab8.png",import.meta.url).href,ve=""+new URL("helm-logo-simple-2ed5400b.png",import.meta.url).href;function vs({title:s,titleId:t,...n},r){return i.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?i.createElement("title",{id:t},s):null,i.createElement("path",{fillRule:"evenodd",d:"M12 2.25a.75.75 0 01.75.75v11.69l3.22-3.22a.75.75 0 111.06 1.06l-4.5 4.5a.75.75 0 01-1.06 0l-4.5-4.5a.75.75 0 111.06-1.06l3.22 3.22V3a.75.75 0 01.75-.75zm-9 13.5a.75.75 0 01.75.75v2.25a1.5 1.5 0 001.5 1.5h13.5a1.5 1.5 0 001.5-1.5V16.5a.75.75 0 011.5 0v2.25a3 3 0 01-3 3H5.25a3 3 0 01-3-3V16.5a.75.75 0 01.75-.75z",clipRule:"evenodd"}))}const ys=i.forwardRef(vs),Ns=ys;function As({title:s,titleId:t,...n},r){return i.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?i.createElement("title",{id:t},s):null,i.createElement("path",{fillRule:"evenodd",d:"M15.75 2.25H21a.75.75 0 01.75.75v5.25a.75.75 0 01-1.5 0V4.81L8.03 17.03a.75.75 0 01-1.06-1.06L19.19 3.75h-3.44a.75.75 0 010-1.5zm-10.5 4.5a1.5 1.5 0 00-1.5 1.5v10.5a1.5 1.5 0 001.5 1.5h10.5a1.5 1.5 0 001.5-1.5V10.5a.75.75 0 011.5 0v8.25a3 3 0 01-3 3H5.25a3 3 0 01-3-3V8.25a3 3 0 013-3h8.25a.75.75 0 010 1.5H5.25z",clipRule:"evenodd"}))}const Es=i.forwardRef(As),Ms=Es;function Rs({title:s,titleId:t,...n},r){return i.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?i.createElement("title",{id:t},s):null,i.createElement("path",{fillRule:"evenodd",d:"M15 3.75a.75.75 0 01.75-.75h4.5a.75.75 0 01.75.75v4.5a.75.75 0 01-1.5 0V5.56l-3.97 3.97a.75.75 0 11-1.06-1.06l3.97-3.97h-2.69a.75.75 0 01-.75-.75zm-12 0A.75.75 0 013.75 3h4.5a.75.75 0 010 1.5H5.56l3.97 3.97a.75.75 0 
01-1.06 1.06L4.5 5.56v2.69a.75.75 0 01-1.5 0v-4.5zm11.47 11.78a.75.75 0 111.06-1.06l3.97 3.97v-2.69a.75.75 0 011.5 0v4.5a.75.75 0 01-.75.75h-4.5a.75.75 0 010-1.5h2.69l-3.97-3.97zm-4.94-1.06a.75.75 0 010 1.06L5.56 19.5h2.69a.75.75 0 010 1.5h-4.5a.75.75 0 01-.75-.75v-4.5a.75.75 0 011.5 0v2.69l3.97-3.97a.75.75 0 011.06 0z",clipRule:"evenodd"}))}const Is=i.forwardRef(Rs),Ss=Is;function Ls({title:s,titleId:t,...n},r){return i.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?i.createElement("title",{id:t},s):null,i.createElement("path",{fillRule:"evenodd",d:"M12.53 16.28a.75.75 0 01-1.06 0l-7.5-7.5a.75.75 0 011.06-1.06L12 14.69l6.97-6.97a.75.75 0 111.06 1.06l-7.5 7.5z",clipRule:"evenodd"}))}const Cs=i.forwardRef(Ls),ye=Cs;function ks({title:s,titleId:t,...n},r){return i.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?i.createElement("title",{id:t},s):null,i.createElement("path",{fillRule:"evenodd",d:"M11.47 4.72a.75.75 0 011.06 0l3.75 3.75a.75.75 0 01-1.06 1.06L12 6.31 8.78 9.53a.75.75 0 01-1.06-1.06l3.75-3.75zm-3.75 9.75a.75.75 0 011.06 0L12 17.69l3.22-3.22a.75.75 0 111.06 1.06l-3.75 3.75a.75.75 0 01-1.06 0l-3.75-3.75a.75.75 0 010-1.06z",clipRule:"evenodd"}))}const Ts=i.forwardRef(ks),Ps=Ts;function Bs({title:s,titleId:t,...n},r){return i.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?i.createElement("title",{id:t},s):null,i.createElement("path",{fillRule:"evenodd",d:"M10.5 3.75a6.75 6.75 0 100 13.5 6.75 6.75 0 000-13.5zM2.25 10.5a8.25 8.25 0 1114.59 5.28l4.69 4.69a.75.75 0 11-1.06 1.06l-4.69-4.69A8.25 8.25 0 012.25 10.5z",clipRule:"evenodd"}))}const Ds=i.forwardRef(Bs),Hs=Ds;function te(s,t){return 
t?t==="home"?"https://crfm.stanford.edu/helm/":s?`https://crfm.stanford.edu/helm/${t}/${s}/`:`https://crfm.stanford.edu/helm/${t}/latest/`:"#"}function Ne(){const[s,t]=i.useState([]),[n,r]=i.useState();return i.useEffect(()=>{if(n&&n.title&&n.title!=="All Leaderboards"){const a=n.title==="Lite"||n.title==="Classic"?"HELM "+n.title:n.title;document.title=a+" - Holistic Evaluation of Language Models (HELM)"}},[n]),i.useEffect(()=>{fetch("https://raw.githubusercontent.com/stanford-crfm/helm/main/helm-frontend/project_metadata.json").then(a=>a.json()).then(a=>{if(t(a),window.PROJECT_ID){const l=a.find(c=>c.id===window.PROJECT_ID);r(l)}else{const l=a.find(c=>c.id==="lite");r(l)}}).catch(a=>{console.error("Error fetching JSON:",a)})},[]),n===void 0||n.title===void 0?null:e.jsxs("div",{className:"dropdown z-50",children:[e.jsxs("div",{tabIndex:0,role:"button",className:"btn normal-case bg-white font-bold p-2 border-0 text-lg block whitespace-nowrap z-40","aria-haspopup":"true","aria-controls":"menu",children:[n.title," ",e.jsx(ye,{fill:"black",color:"black",className:"text w-4 h-4 inline"})]}),e.jsx("ul",{tabIndex:0,className:"-translate-x-36 dropdown-content z-[1] menu p-1 shadow-lg bg-base-100 rounded-box w-max text-base",role:"menu",children:s.map((a,l)=>e.jsx("li",{className:"z-40",children:e.jsxs("a",{href:te(void 0,a.id),className:"block",role:"menuitem",children:[e.jsx("strong",{className:n.title===a.title?"underline":"",children:a.title}),": ",a.description]})},l))})]})}function R(s){return s.startsWith("http://")||s.startsWith("https://")?s:`${window.BENCHMARK_OUTPUT_BASE_URL.replace(/\/$/,"")}/${s.replace(/^\//,"")}`}function q(){return window.RELEASE?`/releases/${window.RELEASE}`:`/runs/${window.SUITE}`}async function Us(s){try{return await(await fetch(R(`${q()}/summary.json`),{signal:s})).json()}catch(t){return console.log(t),{release:void 0,suites:void 0,suite:void 0,date:""}}}function Os(){const[s,t]=i.useState({release:void 0,suites:void 0,suite:void 
0,date:""}),[n,r]=i.useState();i.useEffect(()=>{fetch("https://raw.githubusercontent.com/stanford-crfm/helm/main/helm-frontend/project_metadata.json").then(m=>m.json()).then(m=>{if(window.PROJECT_ID){const u=m.find(j=>j.id===window.PROJECT_ID);r(u)}else{const u=m.find(j=>j.id==="lite");r(u)}}).catch(m=>{console.error("Error fetching JSON:",m)})},[]),i.useEffect(()=>{const m=new AbortController;async function u(){const j=await Us(m.signal);t(j)}return u(),()=>m.abort()},[]);const a=n!==void 0&&n.releases!==void 0?n.releases:["v1.0.0"],l=s.release||s.suite||null;if(!l)return null;const c=`Release ${l} (${s.date})`;if(a.length<=1)return e.jsx("div",{children:c});const o=a.indexOf(l),d=o<0?e.jsx(O,{color:"blue",children:"preview"}):o===0?e.jsx(O,{color:"blue",children:"latest"}):e.jsx(O,{color:"yellow",children:"stale"});return e.jsxs("div",{className:"dropdown",children:[e.jsxs("div",{tabIndex:0,role:"button",className:"normal-case bg-white border-0 block whitespace-nowrap","aria-haspopup":"true","aria-controls":"menu",children:[c," ",d," ",e.jsx(ye,{fill:"black",color:"black",className:"inline text w-4 h-4"})]}),e.jsx("ul",{tabIndex:0,className:"dropdown-content z-[50] menu p-1 shadow-lg bg-base-100 rounded-box w-max text-base",role:"menu",children:a.map(m=>e.jsx("li",{children:e.jsx("a",{href:te(m,n?n.id:"lite"),className:"block",role:"menuitem",children:m})},m))})]})}function Fs(){return e.jsxs("nav",{className:"navbar h-24 px-8 md:px-12 bg-base-100 max-w[1500]px",children:[e.jsx("div",{children:e.jsxs("div",{className:"dropdown md:hidden mr-4",children:[e.jsx("label",{tabIndex:0,className:"btn btn-ghost hover:bg-transparent btn-lg px-0",children:e.jsx(be,{className:"w-16 h-16"})}),e.jsxs("ul",{tabIndex:0,className:"menu menu-lg dropdown-content mt-3 z-[1] p-2 bg-base-100 
shadow",children:[e.jsx("li",{children:e.jsx(b,{to:"leaderboard",children:"Leaderboard"})}),e.jsx("li",{children:e.jsx(b,{to:"models",children:"Models"})}),e.jsx("li",{children:e.jsx(b,{to:"scenarios",children:"Scenarios"})}),e.jsx("li",{children:e.jsx(b,{to:"runs",className:"whitespace-nowrap",children:"Predictions"})}),e.jsx("li",{children:e.jsx(b,{to:"https://github.com/stanford-crfm/helm",children:"GitHub"})})]})]})}),e.jsxs("div",{className:"flex-1 items-center",children:[e.jsx("a",{href:"https://crfm.stanford.edu/",className:"w-24",children:e.jsx("img",{src:we,className:"object-contain"})}),e.jsx(b,{to:"/",className:"mx-2 w-32",children:e.jsx("img",{src:ve,className:"object-contain"})}),e.jsx(Ne,{})]}),e.jsx("div",{className:"flex-none hidden md:block",children:e.jsxs("ul",{className:"flex flex-row gap-6 px-1",children:[e.jsx("li",{children:e.jsx(b,{to:"leaderboard",children:"Leaderboard"})}),e.jsx("li",{children:e.jsx(b,{to:"models",children:"Models"})}),e.jsx("li",{children:e.jsx(b,{to:"scenarios",children:"Scenarios"})}),e.jsx("li",{children:e.jsx(b,{to:"runs",children:"Predictions"})}),e.jsx("li",{children:e.jsx(b,{to:"https://github.com/stanford-crfm/helm",children:"GitHub"})}),e.jsx("li",{className:"hidden lg:flex",children:e.jsx(Os,{})})]})})]})}function _s(){return e.jsxs("nav",{className:"navbar h-24 px-8 md:px-12 bg-base-100 max-w[1500]px",children:[e.jsx("div",{children:e.jsx("div",{className:"dropdown md:hidden mr-4",children:e.jsx("label",{tabIndex:0,className:"btn btn-ghost hover:bg-transparent btn-lg px-0",children:e.jsx(be,{className:"w-16 h-16"})})})}),e.jsxs("div",{className:"flex-1 items-center",children:[e.jsx(b,{to:"https://crfm.stanford.edu/",className:"w-24",children:e.jsx("img",{src:we,className:"object-contain"})}),e.jsx(b,{to:"/",className:"mx-2 w-32",children:e.jsx("img",{src:ve,className:"object-contain"})}),e.jsx(Ne,{})]})]})}function Vs(){return 
e.jsxs(e.Fragment,{children:[window.PROJECT_ID==="home"?e.jsx(_s,{}):e.jsx(Fs,{}),e.jsx("main",{className:"p-8 pt-0",children:e.jsx("div",{className:"mx-auto max-w-[1500]px",children:e.jsx(es,{})})})]})}async function D(s){try{return await(await fetch(R(`${q()}/schema.json`),{signal:s})).json()}catch(t){return t instanceof Error&&t.name!=="AbortError"&&console.log(t),{adapter:[],metric_groups:[],metrics:[],models:[],perturbations:[],run_groups:[]}}}function zs({href:s,children:t}){return e.jsx("a",{href:s,className:"link link-primary link-hover",target:"_blank",rel:"noreferrer",children:t})}function V({value:s}){return e.jsx("span",{children:e.jsx(ss,{components:{a:zs},children:s})})}function P({title:s,subtitle:t,markdown:n=!1}){return e.jsxs("header",{className:"m-4 ml-0",children:[e.jsx("h1",{className:"text-4xl",children:s}),n&&t!==void 0?e.jsx("h2",{className:"mt-2 text-neutral",children:e.jsx(V,{value:t})}):t!==void 0&&e.jsx("h2",{className:"mt-2 text-neutral",children:t})]})}const qs={open:"green",limited:"yellow",closed:"red"},Ws={open:"Open",limited:"Limited",closed:"Closed"};function Gs({level:s}){return e.jsx(O,{color:qs[s],children:Ws[s]})}function T(){return e.jsx("div",{className:"w-full",children:e.jsx("div",{className:"block mx-auto my-24 loading loading-spinner loading-lg"})})}function Ks(){const[s,t]=i.useState([]);i.useEffect(()=>{const c=new AbortController;async function o(){const d=await D(c.signal);t(d.models)}return o(),()=>c.abort()},[]);const[n,r,a]=s.reduce((c,o)=>{switch(o.access){case"open":c[0]+=1;break;case"limited":c[1]+=1;break;case"closed":c[2]+=1;break}return c},[0,0,0]),l=Object.values(s.reduce((c,o)=>{const d=o.creator_organization;return c[d]===void 0?(c[d]={name:d,models:1},c):(c[d].models+=1,c)},{}));return s.length===0?e.jsx(T,{}):e.jsxs(e.Fragment,{children:[e.jsx(P,{title:"Models"}),e.jsxs("div",{className:"overflow-x-auto 
mt-12",children:[e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Creator"}),e.jsx("th",{children:"Model"}),e.jsx("th",{children:"Description"}),e.jsx("th",{children:"Access"})]})}),e.jsx("tbody",{children:s.map(c=>e.jsxs("tr",{children:[e.jsx("td",{className:"text-lg",children:c.creator_organization}),e.jsxs("td",{children:[e.jsx("span",{className:"text-xl",children:c.display_name}),e.jsx("br",{}),e.jsx("span",{children:c.name})]}),e.jsx("td",{children:e.jsx(V,{value:c.description})}),e.jsx("td",{children:e.jsx(Gs,{level:c.access})})]}))})]}),e.jsx(P,{title:"Analysis"}),e.jsxs("div",{className:"grid md:grid-cols-3 grid-cols-1 gap-8",children:[e.jsxs(z,{className:"flex flex-col justify-between",children:[e.jsx(Y,{children:"Models"}),e.jsx(pe,{className:"text-6xl md:!text-[96px]",children:s.length}),e.jsx(as,{values:[n,r,a],colors:["green","yellow","red"]}),e.jsx(ie,{categories:["Open","Limited","Closed"],colors:["green","yellow","red"]})]}),e.jsxs(z,{className:"md:col-span-2",children:[e.jsx(Y,{children:"Creator Organizations"}),e.jsxs("div",{className:"flex justify-between mt-4",children:[e.jsx(ls,{data:l,category:"models",index:"name",variant:"pie",className:"basis-5/12"}),e.jsx(ie,{categories:l.map(c=>c.name),className:"basis-7/12"})]})]})]})]})]})}function K({to:s,children:t,inTable:n=!1,title:r=""}){return n?e.jsx(b,{className:"link link-hover",to:s,title:r,children:t}):e.jsx(b,{className:"link link-primary link-hover",to:s,children:t})}function Qs(){const[s,t]=i.useState([]);i.useEffect(()=>{const r=new AbortController;async function a(){const l=await D(r.signal);t(l.run_groups.filter(c=>!c.todo&&c.taxonomy&&!c.display_name.includes("CLEVA")))}return a(),()=>r.abort()},[]);const n=Object.values(s.reduce((r,a)=>{var c;const l=((c=a.taxonomy)==null?void 0:c.task)||"Unknown";return r[l]===void 0?(r[l]={name:l,value:1},r):(r[l].value+=1,r)},{}));return 
s.length===0?e.jsx(T,{}):(console.log(s),e.jsxs(e.Fragment,{children:[e.jsx(P,{title:"Scenarios",subtitle:"A scenario represents a use case and consists of a dataset of instances."}),e.jsxs("div",{className:"overflow-x-auto mt-12",children:[e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Scenario"}),e.jsx("th",{children:"Task"}),e.jsx("th",{children:"What"}),e.jsx("th",{children:"Who"}),e.jsx("th",{children:"When"}),e.jsx("th",{children:"Language"}),e.jsx("th",{children:"Description"})]})}),e.jsx("tbody",{children:s.map(r=>{var a,l,c,o,d;return e.jsxs("tr",{children:[e.jsxs("td",{children:[e.jsx(K,{to:`/groups/${r.name}`,children:e.jsx("span",{className:"text-lg",children:r.display_name})}),e.jsx("span",{className:"block",children:r.name})]}),e.jsx("td",{children:((a=r.taxonomy)==null?void 0:a.task)||""}),e.jsx("td",{children:((l=r.taxonomy)==null?void 0:l.what)||""}),e.jsx("td",{children:((c=r.taxonomy)==null?void 0:c.who)||""}),e.jsx("td",{children:((o=r.taxonomy)==null?void 0:o.when)||""}),e.jsx("td",{children:((d=r.taxonomy)==null?void 0:d.language)||""}),e.jsx("td",{children:e.jsx(V,{value:r.description})})]})})})]}),e.jsx(P,{title:"Analysis"}),e.jsxs("div",{className:"grid md:grid-cols-4 gap-8",children:[e.jsxs(z,{className:"flex flex-col",children:[e.jsx(Y,{children:"Total scenarios"}),e.jsx(pe,{className:"mx-auto my-6 md:mt-16 !text-[72px] md:!text-[96px]",children:s.length})]}),e.jsx(z,{className:"col-span-3",children:e.jsxs("div",{className:"grid md:grid-cols-2 gap-x-12",children:[e.jsx(ce,{data:n.slice(0,Math.floor(n.length/2))}),e.jsx(ce,{data:n.slice(Math.ceil(n.length/2))})]})})]})]})]}))}function $s(){return R(`${q()}/groups.json`)}async function Ae(s){try{return await(await fetch($s(),{signal:s})).json()}catch(t){return t instanceof Error&&t.name!=="AbortError"&&console.log(t),[]}}function ne({children:s}){return e.jsx("div",{role:"navigation",className:"tabs flex-nowrap border-b-2 
border-gray-2 overflow-x-auto overflow-y-hidden",children:s})}function Q({active:s=!1,onClick:t=()=>{},size:n="md",children:r}){return e.jsx("div",{onClick:t,className:`whitespace-nowrap text-${n} mb-[-2px] text-md tab tab-bordered${s?" border-2 border-grey-500 rounded":" border-none"}`,children:r})}function Js({title:s,titleId:t,...n},r){return i.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 20 20",fill:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?i.createElement("title",{id:t},s):null,i.createElement("path",{fillRule:"evenodd",d:"M4.25 5.5a.75.75 0 00-.75.75v8.5c0 .414.336.75.75.75h8.5a.75.75 0 00.75-.75v-4a.75.75 0 011.5 0v4A2.25 2.25 0 0112.75 17h-8.5A2.25 2.25 0 012 14.75v-8.5A2.25 2.25 0 014.25 4h5a.75.75 0 010 1.5h-5z",clipRule:"evenodd"}),i.createElement("path",{fillRule:"evenodd",d:"M6.194 12.753a.75.75 0 001.06.053L16.5 4.44v2.81a.75.75 0 001.5 0v-4.5a.75.75 0 00-.75-.75h-4.5a.75.75 0 000 1.5h2.553l-9.056 8.194a.75.75 0 00-.053 1.06z",clipRule:"evenodd"}))}const Ys=i.forwardRef(Js),de=Ys;function _(s){return Number.isNaN(Number(s))?String(s):String(Math.round(Number(s)*1e3)/1e3)}function me({value:s,title:t,hideIcon:n}){if(typeof s.value=="string"&&s.value.includes("⚠")&&(s.value=s.value.replace("⚠","")),s.value===void 0)return"-";if(s.run_spec_names){const r=(()=>{if(s.run_spec_names.length==1)return"/runs/"+s.run_spec_names[0];if(s.run_spec_names.length>1){const a="/runs/?q="+s.run_spec_names.map(c=>`^${c}$`).join("|");return encodeURI(a)}})();return r?e.jsx(K,{to:r,inTable:!0,title:t,children:e.jsxs("div",{className:"flex items-center ",children:[_(s.value),!n&&e.jsx(de,{className:"w-3 h-3 ml-1",stroke:"#cbcbcb",fill:"#cbcbcb"})]})}):t?e.jsx("a",{title:t,children:_(s.value)}):e.jsx(e.Fragment,{children:_(s.value)})}return s.href?e.jsx(K,{to:s.href,inTable:!0,title:t,children:e.jsxs("div",{className:"flex items-center",children:[_(s.value),!n&&e.jsx(de,{className:"w-3 h-3 
ml-1",stroke:"#cbcbcb",fill:"#cbcbcb"})]})}):s.markdown?e.jsx(V,{value:String(s.value)}):t?e.jsx("a",{title:t,children:_(s.value)}):e.jsx(e.Fragment,{children:_(s.value)})}function re({schema:s,groupTable:t,numRowsToDisplay:n,sortColumnIndex:r=1,sortable:a=!0,displayColumnIndexes:l=void 0,miniStyle:c=!1}){const[o,d]=i.useState(1),[m,u]=i.useState(r);function j(h){return h.length>30?h.substring(0,27)+"...":h}const y=h=>{const p=["AIRBench 2024 -","-book"];if(h.value==="Model/adapter")return"Model";if(p.some(x=>h.value.includes(x))){let x=h.value;return p.forEach(v=>{x=x.replace(v,"")}),j(x)}else return j(h.value)},w=h=>{if(s){const p=s.models.find(x=>x.display_name===h);if(p){let x=p.description;return x.includes("/")&&(x=x.replace("/","_")),x}}return""},N=h=>{d(h===m?o*-1:h===0?-1:1),u(h)},f=h=>{if(s){const p=s.models.find(x=>x.display_name===h);if(p){let x=p.name;return x.includes("/")&&(x=x.replace("/","_")),x}}return""},C=()=>{const h=t.header[m].lower_is_better,p=o*(h?1:-1),x=t.rows.slice();return x.sort((v,S)=>{var E,M;const A=(E=v[m])==null?void 0:E.value,I=(M=S[m])==null?void 0:M.value;return A!==void 0&&I===void 0?-1:I!==void 0&&A===void 0?1:typeof A=="number"&&typeof I=="number"?(A-I)*p:typeof A=="string"&&typeof I=="string"?p===1?A.localeCompare(I):I.localeCompare(A):0}),n>0?x.slice(0,n):x};function g(h){const p=h.lastIndexOf(" - ");return p===-1?h:h.substring(0,p)+"*"+h.substring(p+1)}const H=h=>{const x=g(h).split("*")[0].trim();if(s){const v=s.run_groups.find(S=>S.display_name===x||S.short_display_name===x);if(v)return v.name}return""};return e.jsxs("table",{className:c?"table w-full":"rounded-lg shadow-md table",children:[e.jsx("thead",{children:e.jsx("tr",{children:t.header.filter((h,p)=>l===void 0||l.includes(p)).map((h,p)=>e.jsx("th",{className:`${p===m?"bg-gray-100":"bg-white"} ${p===0?"left-0 z-40":""} ${h.description?"underline decoration-dashed decoration-gray-300 ":""} whitespace-nowrap px-4 sticky 
top-0`,title:h.description?h.description:"",children:e.jsxs("div",{className:c?"flex gap-2 items-center":"z-20 flex justify-between items-center min-w-48 w-48 max-w-48 text-wrap",children:[e.jsx("span",{className:"inline-block w-full break-words",children:y(h)}),a?e.jsx("button",{className:"link",onClick:()=>N(p),children:e.jsx(Ps,{className:"w-6 h-6"})}):null]})},`$${p}`))})}),e.jsx("tbody",{children:C().map((h,p)=>e.jsx("tr",{children:h.filter((x,v)=>l===void 0||l.includes(v)).map((x,v)=>e.jsx("td",{className:`${v===0?"z-20 text-lg sticky left-0":"z-0"} ${p%2===0?"bg-gray-50":"bg-white"}`,children:v==1?e.jsx("div",{className:`${x&&x.style&&x.style["font-weight"]&&x.style["font-weight"]==="bold"?"font-bold":""}`,children:e.jsx(me,{value:{...x,href:"/runs/?q="+f(String(h[0].value))},title:`Click value to see all predictions for: ${f(String(h[0].value))}`})}):e.jsx("div",{className:`${x&&x.style&&x.style["font-weight"]&&x.style["font-weight"]==="bold"?"font-bold":""} ${v===0?"underline decoration-dashed decoration-gray-300 z-10":"z-0"}`,children:e.jsx(me,{value:{...x},title:String(h[0].value)===x.value?w(String(h[0].value)):`Click value to see predictions for ${String(h[0].value)} for ${H(y(t.header[v]))}: ${f(String(h[0].value))}`})})},`${v}`))},`$${h[0].value}`))})]})}function Xs(){const[s,t]=i.useState(0),[n,r]=i.useState(),[a,l]=i.useState();return i.useEffect(()=>{const c=new AbortController;async function o(){const d=D(c.signal),m=Ae(c.signal),u=await d;l(u);const j=await m;r(j)}return o(),()=>c.abort()},[]),n===void 0||a===void 0?e.jsx(T,{}):n.length===0?e.jsxs("div",{children:[e.jsx(P,{title:"Results",subtitle:"Groupings of the processes, methods, and metrics involved in evaluating models, particularly in the context of natural language understanding and question answering.",className:"mb-16"}),e.jsx("div",{children:"No groups found."})]}):e.jsxs("div",{children:[e.jsx(P,{title:"Results",subtitle:"Groupings of the processes, methods, and metrics involved in 
evaluating models, particularly in the context of natural language understanding and question answering.",className:"mb-16"}),e.jsxs("div",{children:[n.length>1?e.jsx(ne,{children:n.map((c,o)=>e.jsx(Q,{active:o===s,onClick:()=>t(o),children:c.title},o))}):null,e.jsx(re,{schema:a,groupTable:n[s],numRowsToDisplay:-1,sortColumnIndex:1,sortable:!0},`${s}`)]})]})}async function Ee(s,t){try{return await(await fetch(R(`${q()}/groups/${s}.json`),{signal:t})).json()}catch(n){return n instanceof Error&&n.name!=="AbortError"&&console.log(n),[]}}function Me({schema:s,runGroupName:t,numRowsToDisplay:n=-1}){const[r,a]=i.useState(),[l,c]=i.useState(0);return i.useEffect(()=>{const o=new AbortController;async function d(){const m=await Ee(t,o.signal);a(m)}return d(),()=>o.abort()},[s,t]),r===void 0||r.length===0?e.jsx(T,{}):r.length===0?e.jsx("div",{children:"Group currently has no tables."}):e.jsxs("div",{children:[r.length>1?e.jsx(ne,{children:r.map((o,d)=>e.jsx(Q,{active:d===l,onClick:()=>c(d),children:o.title},d))}):null,e.jsx(re,{schema:s,groupTable:r[l],numRowsToDisplay:n,sortColumnIndex:1},`${t}-${l}`)]})}function Zs(){const{groupName:s}=fe(),[t,n]=i.useState(void 0);i.useEffect(()=>{const l=new AbortController;async function c(){const d=await D(l.signal);n(d)}return c(),()=>l.abort()},[]);const a=(()=>{if(t!==void 0){for(const l of t.run_groups)if(l.name===s)return l}})();return t===void 0?e.jsx(T,{}):a===void 0?e.jsxs("div",{children:['Group "',s,'" not found.']}):e.jsxs(e.Fragment,{children:[e.jsx(P,{title:a.display_name,subtitle:a.description,markdown:!0,className:"mr-8"}),e.jsx(Me,{schema:t,runGroupName:a.name},a.name)]})}async function et(s){try{return await(await fetch(R(`${q()}/run_specs.json`),{signal:s})).json()}catch(t){return t instanceof Error&&t.name!=="AbortError"&&console.log(t),[]}}function ae({currentPage:s,totalPages:t,onNextPage:n,onPrevPage:r,className:a}){let l="join";return a!==void 0&&(l=`join 
${a}`),e.jsxs("div",{className:l,children:[e.jsx("button",{onClick:r,className:"join-item btn",children:"«"}),e.jsxs("button",{className:"join-item btn",children:["Page ",s," of ",t]}),e.jsx("button",{onClick:n,className:"join-item btn",children:"»"})]})}const J=100;function st(){const[s,t]=Z(),[n,r]=i.useState(),[a,l]=i.useState(Number(s.get("page")||1)),[c,o]=i.useState(!0),[d,m]=i.useState(s.get("q")||"");i.useEffect(()=>{const f=new AbortController;async function C(){const g=await et(f.signal);r(g)}return C(),()=>f.abort()},[]);const u=f=>{f.preventDefault();const g=f.target.q.value;m(g),t({q:g,page:"1"})};if(n===void 0)return e.jsx(T,{});const j=c?new RegExp(d):null,y=n.filter(f=>j?j.test(f.name):f.name.includes(d)),w=y.slice((a-1)*J,a*J),N=Math.ceil(y.length/J);return e.jsxs(e.Fragment,{children:[e.jsx(P,{title:"Predictions",subtitle:"All benchmark predictions"}),e.jsxs("form",{className:"flex mb-8",onSubmit:u,children:[e.jsxs("div",{className:"form-control",children:[e.jsx("input",{type:"text",name:"q",placeholder:"Search",className:"input input-bordered",value:d,onChange:f=>m(f.target.value)}),e.jsxs("label",{className:"label",children:[e.jsxs("span",{className:"label-text-alt flex item-center",children:[e.jsx("input",{type:"checkbox",className:"toggle toggle-xs",checked:c,onChange:()=>o(!c)}),e.jsx("span",{className:"ml-2",children:"Regex"})]}),e.jsx("span",{className:"label-text-alt",children:`${y.length} results`})]})]}),e.jsx("div",{className:"form-control ml-4",children:e.jsx("button",{className:"btn",children:e.jsx(Hs,{className:"w-6 h-6"})})})]}),e.jsx("div",{className:"overflow-x-auto",children:e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Run"}),e.jsx("th",{children:"Model"}),e.jsx("th",{children:"Groups"}),e.jsx("th",{children:"Adapter method"}),e.jsx("th",{children:"Subject / 
Task"})]})}),e.jsx("tbody",{children:w.map((f,C)=>e.jsxs("tr",{children:[e.jsx("td",{children:e.jsx(K,{to:`/runs/${f.name}`,children:f.name})}),e.jsx("td",{children:f.adapter_spec.model}),e.jsx("td",{children:f.groups.join(", ")}),e.jsx("td",{children:f.adapter_spec.method}),e.jsx("td",{children:f.scenario_spec.args.subject||f.scenario_spec.args.task||"-"})]},`${f.name}-${C}`))})]})}),N>0?e.jsx(ae,{className:"flex justify-center my-8",onNextPage:()=>{const f=Math.min(a+1,N);l(f),s.set("page",String(f)),t(s)},onPrevPage:()=>{const f=Math.max(a-1,1);l(f),s.set("page",String(f)),t(s)},currentPage:a,totalPages:N}):e.jsx("div",{className:"my-8 text-center",children:"No results"})]})}function B(){return window.SUITE!==void 0?window.SUITE:void 0}async function tt(s,t,n){try{return await(await fetch(R(`/runs/${n||B()}/${s}/scenario.json`),{signal:t})).json()}catch(r){r instanceof Error&&r.name!=="AbortError"&&console.log(r);return}}function Re(s,t){return R(`/runs/${t||B()}/${s}/run_spec.json`)}async function nt(s,t,n){try{return await(await fetch(Re(s,n),{signal:t})).json()}catch(r){r instanceof Error&&r.name!=="AbortError"&&console.log(r);return}}function rt(s,t){return R(`/runs/${t||B()}/${s}/scenario_state.json`)}function Ie(){return window.RELEASE!==void 0?window.RELEASE:void 0}async function at(s){try{return await(await fetch(R(`/releases/${Ie()}/runs_to_run_suites.json`),{signal:s})).json()}catch(t){return t instanceof Error&&t.name!=="AbortError"&&console.log(t),{}}}function lt(s,t){return Ie()?s[t]:window.SUITE}function it(s){const n={quasi_exact_match:!1,toxic_frac:!0,safety_score:!1,exact_match:!1},r=Object.keys(s);for(const a of r)if(s[a]!==void 0&&n[a]!==void 0)return n[a]?s[a]<.5?[a,!0]:[a,!1]:s[a]>=.5?[a,!0]:[a,!1];return["",!1]}function ct(s){const[t,n]=it(s.stats);return t===""?null:n?e.jsx(ot,{value:`${t.replace(/_/g," ")}: ${s.stats[t]}`}):e.jsx(dt,{value:`${t.replace(/_/g," ")}: ${s.stats[t]}`})}function ot({value:s}){return 
/**
 * Green badge (icon `gs`) showing `value`.
 *
 * @param {{value: *}} props
 */
function ot({ value: s }) {
  return e.jsx(O, { icon: gs, color: "green", children: s });
}

/**
 * Red badge (icon `ws`) showing `value`.
 *
 * @param {{value: *}} props
 */
function dt({ value: s }) {
  return e.jsx(O, { icon: ws, color: "red", children: s });
}

/**
 * Scrollable pre-wrapped text box. While hovered it shows a button that opens
 * the same content in a dialog; clicking the dialog closes it again.
 *
 * @param {{value: *}} props - `value` is the content to display.
 */
function F({ value: s }) {
  const [isHovered, setHovered] = i.useState(false);
  const [isExpanded, setExpanded] = i.useState(false);

  const expandButton = isHovered
    ? e.jsx("button", {
        className: "bg-white absolute p-2 leading-none height-fit min-h-none right-1 bottom-1 shadow",
        onClick: () => setExpanded(true),
        children: e.jsx(Ss, { fill: "black", color: "black", className: "text w-4 h-4" }),
      })
    : null;

  const preview = e.jsxs("div", {
    onMouseOver: () => setHovered(true),
    onMouseOut: () => setHovered(false),
    className: "relative",
    children: [
      e.jsx("div", {
        className: "bg-base-200 p-2 block overflow-auto w-full max-h-[36rem] mb-2 whitespace-pre-wrap",
        children: s,
      }),
      expandButton,
    ],
  });

  const dialog = e.jsx("dialog", {
    open: isExpanded,
    className: "modal p-16 bg-opacity-80 bg-white",
    onClick: () => setExpanded(false),
    children: e.jsx("div", {
      className: "modal-box max-w-none p-4 whitespace-pre-wrap bg-base-200",
      children: s,
    }),
  });

  return e.jsxs(e.Fragment, { children: [preview, dialog] });
}

/**
 * Renders one media object according to its `content_type`.
 *
 * - image / audio: an `<img>` / `<audio>` whose source is derived from
 *   `location` (nothing is rendered when `location` is undefined);
 * - `text/plain` with more than one character: the text followed by two breaks;
 * - anything else: an empty `<div>`.
 *
 * @param {{mediaObject: {content_type: string, location?: string, text?: string}}} props
 */
function Se({ mediaObject: s }) {
  const isImage = s.content_type.includes("image");
  if (isImage || s.content_type.includes("audio")) {
    if (s.location === undefined) {
      return null;
    }
    // Strip the local output prefixes before handing the path to `R`
    // (defined outside this block).
    const src = R(s.location.replace("benchmark_output/", "").replace("prod_env/", "../"));
    if (isImage) {
      return e.jsxs("div", { children: [e.jsx("img", { src: src }), e.jsx("br", {})] });
    }
    return e.jsx("div", { children: e.jsx("audio", { controls: true, src: src }) });
  }

  const hasText = s.text && s.content_type === "text/plain" && s.text.length > 1;
  if (hasText) {
    return e.jsxs("div", { children: [s.text, e.jsx("br", {}), e.jsx("br", {})] });
  }
  return e.jsx("div", {});
}

/**
 * Renders every media object of a multimedia object, in order.
 *
 * @param {{multimediaObject: {media_objects: Array}}} props
 */
function Le({ multimediaObject: s }) {
  return e.jsx("div", {
    // Index keys: the list is static and media objects expose no id here.
    children: s.media_objects.map((media, index) => e.jsx(Se, { mediaObject: media }, index)),
  });
}

/**
 * Formats a request parameter for display.
 *
 * @param {*} s - Value to format.
 * @returns {string} `"[]"` for an empty array, `"[a, b]"` for arrays (with
 *   every newline inside an element shown as the two characters `\n`), and
 *   `String(s)` for anything else.
 */
function mt(s) {
  if (!Array.isArray(s)) {
    return String(s);
  }
  if (s.length === 0) {
    return "[]";
  }
  // Global flag: stop sequences such as "\n\n" contain several newlines.
  const parts = s.map((item) => String(item).replace(/\n/g, "\\n"));
  return `[${parts.join(", ")}]`;
}

/**
 * Shows the prompt of a request followed by a list of its other parameters.
 *
 * @param {{request: {request: Object}}} props - `request.request` holds the
 *   prompt (`prompt` or `multimodal_prompt`) and the remaining parameters.
 */
function ht({ request: s }) {
  const params = s.request;
  const prompt = params.prompt;
  const headingClass = "block text text-gray-400";

  let promptSection;
  // Guard on `prompt` itself so a request without a text prompt can still
  // reach the multimodal branch instead of throwing on `.length`.
  if (prompt && prompt.length > 0) {
    promptSection = e.jsxs("div", {
      children: [
        e.jsxs("h3", { className: headingClass, children: ["Prompt (", prompt.length, " Chars)"] }),
        e.jsx(F, { value: prompt }),
      ],
    });
  } else if (params.multimodal_prompt) {
    promptSection = e.jsxs("div", {
      children: [
        e.jsx("h3", { className: headingClass, children: "Prompt" }),
        e.jsx(Le, { multimediaObject: params.multimodal_prompt }),
      ],
    });
  } else {
    promptSection = e.jsx("h3", { className: headingClass, children: "Empty Prompt" });
  }

  const rows = Object.keys(params)
    .filter((name) => name !== "prompt")
    .map((name, index) => {
      const value = params[name];
      // Only null/undefined are "null"; 0, false and "" are real values.
      const shown = value === null || value === undefined ? "null" : e.jsx("span", { children: mt(value) });
      return e.jsxs(se, { children: [e.jsxs("span", { children: [name, ":"] }), shown] }, index + 1);
    });

  return e.jsxs("div", { children: [promptSection, e.jsx(ee, { children: rows })] });
}

/**
 * Renders a list of annotation entries; each entry may carry an `error`, a
 * `text` and/or a `media_object`.
 *
 * @param {Array<{error?: *, text?: *, media_object?: Object}>} s
 */
function xt(s) {
  const labelled = (title, value) =>
    e.jsxs("div", {
      children: [e.jsx("h3", { className: "ml-1", children: title }), e.jsx(F, { value: value }), " "],
    });

  return e.jsx("div", {
    children: s.map((entry, index) =>
      e.jsxs(
        "div",
        {
          children: [
            entry.error && labelled("Error", entry.error),
            entry.text && labelled("Text", entry.text),
            entry.media_object && e.jsx(Se, { mediaObject: entry.media_object }),
          ],
        },
        index,
      ),
    ),
  });
}

/**
 * Renders an annotation object as one titled text box per own enumerable entry.
 *
 * @param {Object} s - Annotation values keyed by name.
 */
function ut(s) {
  return e.jsx("div", {
    children: Object.entries(s).map(([name, value]) =>
      e.jsxs(
        "div",
        {
          children: [
            e.jsx("h3", { className: "ml-1", children: name }),
            e.jsx(F, { value: value === null || value === undefined ? "null" : value.toString() }),
          ],
        },
        name,
      ),
    ),
  });
}

/**
 * Collapsible sections, one per annotator, for the annotations of a prediction.
 * Array annotations go through `xt`, everything else through `ut`.
 *
 * @param {{predictionAnnotations?: Object}} props
 */
function ft({ predictionAnnotations: s }) {
  const sections = s
    ? Object.entries(s).map(([annotator, annotation]) =>
        e.jsxs(
          "details",
          {
            className: "collapse collapse-arrow border rounded-md bg-white my-2",
            children: [
              e.jsx("summary", {
                className: "collapse-title",
                children: e.jsx(e.Fragment, { children: "View " + annotator + " annotations" }),
              }),
              e.jsx("div", {
                className: "collapse-content",
                children: Array.isArray(annotation) ? xt(annotation) : ut(annotation),
              }),
            ],
          },
          annotator,
        ),
      )
    : null;
  return e.jsx("div", { children: sections });
}
/** Tag that marks a reference as a correct answer; its badge is rendered green. */
const gt = "correct";

/**
 * Lists the references of an instance, each followed by one badge per tag.
 *
 * @param {{references: Array<{output: {text: *}, tags: string[]}>}} props
 */
function jt({ references: s }) {
  const referenceClass = "bg-base-200 p-2 block overflow-auto w-full max-h-72 mb-2 whitespace-pre-wrap";

  const entries = s.map((reference, index) => {
    // Keyed by position: React needs keys on array children and the same tag
    // could in principle appear twice on one reference.
    const badges = reference.tags.map((tag, tagIndex) =>
      e.jsx(O, { className: "mx-2", color: tag === gt ? "green" : undefined, children: tag }, tagIndex),
    );
    return e.jsxs("li", { className: referenceClass, children: [reference.output.text, badges] }, index);
  });

  return e.jsxs("span", {
    children: [e.jsx("h3", { children: "References" }), e.jsx("ul", { children: entries })],
  });
}
/**
 * Decrypts one AES-GCM protected string.
 *
 * All four arguments are base64 strings. The ciphertext and the tag are
 * concatenated before being handed to `SubtleCrypto.decrypt`.
 *
 * @param {string} s - Ciphertext.
 * @param {string} t - Raw key.
 * @param {string} n - Initialisation vector.
 * @param {string} r - Authentication tag.
 * @returns {Promise<string>} The decoded plaintext; rejects when the key cannot
 *   be imported or decryption fails.
 */
async function he(s, t, n, r) {
  const fromBase64 = (text) => Uint8Array.from(atob(text), (ch) => ch.charCodeAt(0));
  const key = await window.crypto.subtle.importKey("raw", fromBase64(t), "AES-GCM", true, ["decrypt"]);
  const sealed = new Uint8Array([...fromBase64(s), ...fromBase64(r)]);
  const plain = await window.crypto.subtle.decrypt({ name: "AES-GCM", iv: fromBase64(n) }, key, sealed);
  return new TextDecoder().decode(plain);
}

/** Same contract as `he`; kept as a separate name for existing callers. */
async function vt(s, t, n, r) {
  return he(s, t, n, r);
}

/** Same contract as `he`; kept as a separate name for existing callers. */
async function Nt(s, t, n, r) {
  return he(s, t, n, r);
}

/**
 * Fetches and parses one JSON file of a run.
 * `R` and `B` are defined outside this block.
 *
 * @param {string} runName - Run name.
 * @param {string} [suite] - Suite; `B()` is used when this is falsy.
 * @param {string} fileName - File inside the run directory.
 * @param {AbortSignal} [signal] - Signal forwarded to `fetch`.
 * @returns {Promise<*>} Parsed JSON; rejects on network or parse errors.
 */
async function fetchRunFile(runName, suite, fileName, signal) {
  const response = await fetch(R(`/runs/${suite || B()}/${runName}/${fileName}`), { signal: signal });
  return response.json();
}

/** Logs a failed load, staying quiet for requests that were deliberately aborted. */
function logUnlessAborted(err) {
  if (err instanceof Error && err.name !== "AbortError") {
    console.log(err);
  }
}

/**
 * Loads `instances.json` of a run. For runs whose name contains "gpqa", and
 * only when `r` is truthy, input and reference texts that have an entry in
 * `encryption_data.json` are decrypted in place.
 *
 * @param {string} s - Run name.
 * @param {AbortSignal} [t] - Signal forwarded to `fetch`.
 * @param {string} [n] - Suite; `B()` is used when this is falsy.
 * @param {boolean} [r] - Whether decryption was agreed to.
 * @returns {Promise<Array>} The instances, or `[]` when loading fails. A text
 *   that fails to decrypt is logged and left untouched.
 */
async function wt(s, t, n, r) {
  try {
    const instances = await fetchRunFile(s, n, "instances.json", t);
    if (s.includes("gpqa") && r) {
      const secrets = await fetchRunFile(s, n, "encryption_data.json", t);
      for (const instance of instances) {
        const inputSecret = secrets[instance.input.text];
        if (inputSecret) {
          try {
            instance.input.text = await he(inputSecret.ciphertext, inputSecret.key, inputSecret.iv, inputSecret.tag);
          } catch (err) {
            console.error(`Failed to decrypt input text for instance_id: ${instance.id}`, err);
          }
        }
        for (const reference of instance.references) {
          const referenceSecret = secrets[reference.output.text];
          if (!referenceSecret) {
            continue;
          }
          try {
            reference.output.text = await he(
              referenceSecret.ciphertext,
              referenceSecret.key,
              referenceSecret.iv,
              referenceSecret.tag,
            );
          } catch (err) {
            console.error(`Failed to decrypt reference text for instance_id: ${instance.id}`, err);
          }
        }
      }
    }
    return instances;
  } catch (err) {
    logUnlessAborted(err);
    return [];
  }
}

/**
 * Loads `display_predictions.json` of a run, decrypting `predicted_text` under
 * the same conditions as `wt`.
 *
 * @param {string} s - Run name.
 * @param {AbortSignal} [t] - Signal forwarded to `fetch`.
 * @param {string} [n] - Suite; `B()` is used when this is falsy.
 * @param {boolean} [r] - Whether decryption was agreed to.
 * @returns {Promise<Array>} The predictions, or `[]` when loading fails.
 */
async function yt(s, t, n, r) {
  try {
    const predictions = await fetchRunFile(s, n, "display_predictions.json", t);
    if (s.includes("gpqa") && r) {
      const secrets = await fetchRunFile(s, n, "encryption_data.json", t);
      for (const prediction of predictions) {
        const secret = secrets[prediction.predicted_text];
        if (!secret) {
          continue;
        }
        try {
          prediction.predicted_text = await he(secret.ciphertext, secret.key, secret.iv, secret.tag);
        } catch (err) {
          console.error(`Failed to decrypt predicted_text for instance_id: ${prediction.instance_id}`, err);
        }
      }
    }
    return predictions;
  } catch (err) {
    // Real failures are logged and aborts are not; this check used to be inverted.
    logUnlessAborted(err);
    return [];
  }
}

/**
 * Loads `display_requests.json` of a run, decrypting `request.prompt` under
 * the same conditions as `wt` and `yt`.
 *
 * @param {string} s - Run name.
 * @param {AbortSignal} [t] - Signal forwarded to `fetch`.
 * @param {string} [n] - Suite; `B()` is used when this is falsy.
 * @param {boolean} [r] - Whether decryption was agreed to.
 * @returns {Promise<Array>} The requests, or `[]` when loading fails.
 */
async function At(s, t, n, r) {
  try {
    const requests = await fetchRunFile(s, n, "display_requests.json", t);
    // `includes`, like the sibling loaders: otherwise a run whose name merely
    // contains "gpqa" gets decrypted instances but encrypted prompts.
    if (s.includes("gpqa") && r) {
      const secrets = await fetchRunFile(s, n, "encryption_data.json", t);
      for (const entry of requests) {
        const secret = secrets[entry.request.prompt];
        if (!secret) {
          continue;
        }
        try {
          entry.request.prompt = await he(secret.ciphertext, secret.key, secret.iv, secret.tag);
        } catch (err) {
          console.error(`Failed to decrypt prompt for instance_id: ${entry.instance_id}`, err);
        }
      }
    }
    return requests;
  } catch (err) {
    logUnlessAborted(err);
    return [];
  }
}

/** Page size constant; presumably the number of instances per page -- its use is outside this block. */
const W = 10;
/**
 * Fetches `stats.json` for a run.
 * `R` and `B` are defined outside this block.
 *
 * @param {string} s - Run name.
 * @param {AbortSignal} [t] - Signal forwarded to `fetch`.
 * @param {string} [n] - Suite; `B()` is used when this is falsy.
 * @returns {Promise<Array>} Parsed stats, or `[]` when the request fails
 *   (failures other than aborts are logged).
 */
async function Mt(s, t, n) {
  try {
    const response = await fetch(R(`/runs/${n || B()}/${s}/stats.json`), { signal: t });
    return await response.json();
  } catch (err) {
    if (err instanceof Error && err.name !== "AbortError") {
      console.log(err);
    }
    return [];
  }
}

/**
 * Label of a stat: the metric's display name in bold, followed by the split,
 * sub-split and perturbation the stat was computed on (" original" when there
 * is no perturbation).
 *
 * @param {{stat: {name: Object}, metricFieldMap: Object}} props - When
 *   `metricFieldMap` has an entry for the metric, its `display_name` and
 *   `description` are used for the label and tooltip.
 */
function Rt({ stat: s, metricFieldMap: t }) {
  const id = s.name;
  const splitPart = id.split !== undefined ? ` on ${id.split}` : "";
  const subSplitPart = id.sub_split !== undefined ? `/${id.sub_split}` : "";
  const perturbationPart = id.perturbation !== undefined ? ` with ${id.perturbation.name}` : " original";
  const suffix = `${splitPart}${subSplitPart}${perturbationPart}`;

  const field = t[id.name];
  if (field) {
    return e.jsxs("span", {
      title: field.description,
      children: [e.jsx("strong", { children: field.display_name || id.name }), suffix],
    });
  }
  return e.jsxs("span", { children: [e.jsx("strong", { children: id.name }), suffix] });
}

/** Number of stats shown per page of the metrics table. */
const G = 50;

/** Columns of the metrics table, in display order; each is also the stat field it reads. */
const xe = ["name", "mean", "min", "max", "sum", "sum_squared", "variance", "stddev"];

/**
 * "All metrics" tab: a paginated table of every stat of a run with a text
 * filter on the metric name. Shows the `T` placeholder until stats are loaded
 * (and also while the loaded list is empty).
 *
 * NOTE(review): the filter is applied to the rows of the current page only,
 * as before; filtering across pages would also need page-count handling.
 *
 * @param {{runName: string, suite: string, metricFieldMap: Object}} props
 */
function It({ runName: s, suite: t, metricFieldMap: n }) {
  const [searchParams, setSearchParams] = Z();
  const [stats, setStats] = i.useState();
  const [page, setPage] = i.useState(1);
  const [query, setQuery] = i.useState("");

  i.useEffect(() => {
    const controller = new AbortController();
    async function load() {
      setStats(await Mt(s, controller.signal, t));
    }
    load();
    return () => controller.abort();
  }, [s, t]);

  if (stats === undefined || stats.length === 0) {
    return e.jsx(T, {});
  }

  // Round up so the last, partially filled page is reachable and the count
  // never drops to 0 (which let "next" move to a non-existent page 0).
  const totalPages = Math.ceil(stats.length / G);
  const firstIndex = (page - 1) * G;
  const pageStats = stats.slice(firstIndex, firstIndex + G);
  const needle = query.toLowerCase();
  const visibleStats = pageStats.filter((stat) => !query || stat.name.name.toLowerCase().includes(needle));

  const goToPage = (target) => {
    setPage(target);
    searchParams.set("metricsPage", String(target));
    setSearchParams(searchParams);
  };

  const renderCell = (stat, column) => {
    if (column === "name") {
      return e.jsx("td", { children: e.jsx(Rt, { stat: stat, metricFieldMap: n }) }, column);
    }
    const value = stat[column];
    // A missing figure stays blank instead of repeating the stat name.
    return e.jsx("td", { children: typeof value === "number" ? value : null }, column);
  };

  const searchBox = e.jsx("div", {
    className: "flex justify-start my-4",
    children: e.jsx("input", {
      type: "text",
      className: "input input-bordered w-full max-w-xs",
      placeholder: "Search for a metric",
      onChange: (event) => setQuery(event.target.value),
    }),
  });

  const table = e.jsx("div", {
    className: "overflow-x-auto",
    children: e.jsxs("table", {
      className: "table",
      children: [
        e.jsx("thead", {
          children: e.jsx("tr", { children: xe.map((column) => e.jsx("th", { children: column }, column)) }),
        }),
        e.jsx("tbody", {
          children: visibleStats.map((stat, index) =>
            e.jsx("tr", { children: xe.map((column) => renderCell(stat, column)) }, index),
          ),
        }),
      ],
    }),
  });

  const pager = e.jsx(ae, {
    className: "flex justify-center my-8",
    onNextPage: () => goToPage(Math.min(page + 1, totalPages)),
    onPrevPage: () => goToPage(Math.max(page - 1, 1)),
    currentPage: page,
    totalPages: totalPages,
  });

  return e.jsxs("div", { children: [searchBox, table, pager] });
}
mb-12",children:e.jsxs("div",{children:[e.jsxs("h1",{className:"text-3xl flex items-center",children:[m.name,e.jsx("a",{href:"/#/groups/"+m.name,children:e.jsx(Ms,{className:"w-6 h-6 ml-2"})})]}),e.jsx("h3",{className:"text-xl",children:e.jsx(V,{value:m.description})}),e.jsx("h1",{className:"text-3xl mt-2",children:r.adapter_spec.model}),e.jsx("h3",{className:"text-xl",children:e.jsx(V,{value:(o==null?void 0:o.description)||""})}),e.jsx("div",{className:"mt-2 flex gap-2",children:m.tags.map(p=>e.jsx(O,{size:"xs",color:"gray",children:e.jsx("span",{className:"text text-md",children:p})}))})]})}),e.jsxs(z,{children:[e.jsxs("div",{className:"flex justify-between",children:[e.jsx("h3",{className:"text-lg mb-1",children:"Adapter Specification"}),e.jsxs("div",{className:"flex gap-2",children:[e.jsx(Ns,{className:"w-6 h-6 mr-1 text text-primary"}),e.jsx("a",{className:"link link-primary link-hover",href:Re(r.name,l),download:"true",target:"_blank",children:"Spec JSON"}),e.jsx("a",{className:"link link-primary link-hover",href:rt(r.name,l),download:"true",target:"_blank",children:"Full JSON"})]})]}),e.jsx("div",{children:e.jsx(ee,{className:"grid md:grid-cols-2 lg:grid-cols-3 gap-x-8",children:Object.entries(r.adapter_spec).map(([p,x],v)=>e.jsxs(se,{className:v<3?"!border-0":"",children:[e.jsx("strong",{className:"mr-1",title:j[p]?j[p].description:void 0,children:`${p}: `}),e.jsx("span",{className:"overflow-x-auto",children:x})]}))})})]}),e.jsx("div",{className:"mt-16 mb-8",children:e.jsxs(ne,{children:[e.jsx(Q,{size:"lg",active:t===0,onClick:()=>n(0),children:"Instances + Predictions"}),e.jsx(Q,{size:"lg",active:t===1,onClick:()=>n(1),children:"All metrics"})]})}),t===0&&s.includes("gpqa")&&!g&&e.jsxs("div",{className:"mb-8",children:[e.jsx("hr",{className:"my-4"}),e.jsx("p",{className:"mb-4",children:"The GPQA dataset instances are encrypted by default to comply with the following request:"}),e.jsx("blockquote",{className:"italic border-l-4 border-gray-300 pl-4 
text-gray-700 mb-4",children:"“We ask that you do not reveal examples from this dataset in plain text or images online, to minimize the risk of these instances being included in foundation model training corpora.”"}),e.jsxs("p",{className:"mb-4",children:["If you agree to this condition, please type"," ",e.jsx("strong",{children:'"Yes, I agree"'})," in the box below and then click"," ",e.jsx("strong",{children:"Decrypt"}),"."]}),e.jsxs("div",{className:"flex gap-2 mt-2",children:[e.jsx("input",{type:"text",value:f,onChange:p=>C(p.target.value),className:"input input-bordered",placeholder:'Type "Yes, I agree"'}),e.jsx("button",{onClick:h,className:"btn btn-primary",children:"Decrypt"})]}),e.jsx("hr",{className:"my-4"})]}),t===0?e.jsx(Et,{runName:s,suite:l,metricFieldMap:w,userAgreed:g},g?"instances-agreed":"instances-not-agreed"):e.jsx(It,{runName:s,suite:l,metricFieldMap:w})]})}function Lt(){const[s,t]=i.useState(void 0),[n,r]=i.useState(void 0),[a,l]=i.useState(void 0);if(i.useEffect(()=>{const o=new AbortController;async function d(){const m=D(o.signal),u=Ae(o.signal),j=await m;t(j);const y=await u,w=[];y.forEach(N=>{N.rows.forEach(f=>{w.push({title:String(f[0].value),name:f[0].href.replace("?group=","")})})}),r(w)}return d(),()=>o.abort()},[]),s===void 0||n===void 0)return e.jsx(T,{});if(n.length===0)return e.jsxs(e.Fragment,{children:[e.jsx(P,{title:"HELM Leaderboard",subtitle:"The HELM leaderboard shows how the various models perform across different scenarios and metrics.",markdown:!0}),e.jsx("div",{className:"divider"}),e.jsx("p",{className:"text-center mt-8",children:"Group currently has no results."})]});const c=a!==void 0?a:n[0].name;return e.jsxs(e.Fragment,{children:[e.jsxs("div",{className:"flex flex-row justify-between",children:[e.jsx(P,{title:"HELM Leaderboard",subtitle:"The HELM leaderboard shows how the various models perform across different scenarios and metrics.",markdown:!0}),e.jsxs("div",{className:"w-64 
pt-8",children:[e.jsx("label",{htmlFor:"group",className:"block text-sm font-medium text-gray-700",children:"Select a group:"}),e.jsx("select",{id:"group",name:"group",onChange:o=>l(o.target.value),className:"mt-1 block w-full pl-3 pr-10 py-2 text-base border-gray-300 focus:outline-none focus:ring focus:border-blue-300 rounded-md",children:n.map((o,d)=>e.jsx("option",{value:o.name,children:o.title},d))})]})]}),e.jsx(Me,{schema:s,runGroupName:c},c)]})}const Ct=""+new URL("instruct-flowchart-48854f7c.svg",import.meta.url).href,kt=""+new URL("instruct-graph-0a57d7d2.svg",import.meta.url).href;function Tt(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 font-bold text-center",children:"HELM Instruct: A Multidimensional Instruction Following Evaluation Framework with Absolute Ratings"}),e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center gap-2 sm:gap-8 md:gap-32 my-8",children:[e.jsx("a",{className:"px-10 btn rounded-md",href:"https://crfm.stanford.edu/2024/02/18/helm-instruct.html",children:"Blog Post"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"})]}),e.jsxs("p",{children:["We introduce ",e.jsx("em",{children:"HELM Instruct"}),", a multidimensional evaluation framework for instruction-following LLMs with absolute ratings. The framework takes an instruction, a model, an evaluator, and a criterion to generate a score. In our study, we use HELM Instruct to compare 4 instruction-following models on 7 scenarios based on 4 Human/LM evaluators and 5 criteria. 
Check out the blog post for more details."]}),e.jsxs("div",{className:"grid my-16 grid-cols-1 md:mx-32 md:grid-cols-2 md:gap-2",children:[e.jsx("img",{src:Ct,alt:"Evaluation flowchart",className:"mx-auto block",sizes:"100vw"}),e.jsx("img",{src:kt,alt:"7 scenarios, 4 models, 4 evaluators and 5 criteria",className:"mx-auto block",sizes:"100vw"})]}),e.jsxs("table",{className:"rounded-lg shadow-md table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Model"}),e.jsx("th",{children:"Average"}),e.jsx("th",{children:"Helpfulness"}),e.jsx("th",{children:"Understandability"}),e.jsx("th",{children:"Completeness"}),e.jsx("th",{children:"Conciseness"}),e.jsx("th",{children:"Harmlessness"})]})}),e.jsxs("tbody",{children:[e.jsxs("tr",{children:[e.jsx("td",{children:"openai_gpt-4-0314"}),e.jsx("td",{children:"4.63"}),e.jsx("td",{children:"4.42"}),e.jsx("td",{children:"4.85"}),e.jsx("td",{children:"4.50"}),e.jsx("td",{children:"4.42"}),e.jsx("td",{children:"4.95"})]}),e.jsxs("tr",{children:[e.jsx("td",{children:"openai_gpt-3.5-turbo-0613"}),e.jsx("td",{children:"4.60"}),e.jsx("td",{children:"4.34"}),e.jsx("td",{children:"4.86"}),e.jsx("td",{children:"4.42"}),e.jsx("td",{children:"4.41"}),e.jsx("td",{children:"4.97"})]}),e.jsxs("tr",{children:[e.jsx("td",{children:"anthropic_claude-v1.3"}),e.jsx("td",{children:"4.56"}),e.jsx("td",{children:"4.25"}),e.jsx("td",{children:"4.87"}),e.jsx("td",{children:"4.32"}),e.jsx("td",{children:"4.40"}),e.jsx("td",{children:"4.97"})]}),e.jsxs("tr",{children:[e.jsx("td",{children:"cohere_command-xlarge-beta"}),e.jsx("td",{children:"4.31"}),e.jsx("td",{children:"3.90"}),e.jsx("td",{children:"4.73"}),e.jsx("td",{children:"3.88"}),e.jsx("td",{children:"4.31"}),e.jsx("td",{children:"4.72"})]})]})]})]})}function Ce({models:s}){return e.jsxs("section",{children:[e.jsxs("h3",{className:"text-3xl",children:[s.length," models"]}),e.jsx("ul",{children:s.map((t,n)=>t.todo?e.jsxs("li",{className:"text-slate-300 
/**
 * Landing-page list of scenarios, grouped under their parent run groups.
 *
 * A run group counts as a scenario when it has `metric_groups` and no
 * subgroups. Parent groups are listed only when at least one of their
 * `subgroups` names such a scenario; the heading counts distinct scenarios
 * that appear under some parent.
 *
 * @param {{runGroups: Array<Object>}} props - Run groups; the fields read are
 *   `name`, `display_name`, `metric_groups`, `subgroups` and `todo`.
 */
function ke({ runGroups: s }) {
  const scenariosByName = new Map(
    s
      .filter(
        (group) =>
          group.metric_groups !== undefined && (group.subgroups === undefined || group.subgroups.length === 0),
      )
      .map((group) => [group.name, group]),
  );

  const listedScenarioNames = new Set();
  const sections = [];
  for (const group of s) {
    const scenarios = [];
    for (const childName of group.subgroups ? group.subgroups : []) {
      const scenario = scenariosByName.get(childName);
      if (scenario) {
        scenarios.push(scenario);
        listedScenarioNames.add(scenario.name);
      }
    }
    if (scenarios.length > 0) {
      sections.push([group, scenarios]);
    }
  }

  const renderScenario = (scenario) => {
    if (scenario.todo) {
      // Planned scenarios are greyed out and not linked.
      return e.jsx("li", { className: "ml-4 text-slate-300", children: scenario.display_name }, scenario.name);
    }
    // The key belongs on the element returned from map() -- the link, not the
    // <li> nested inside it.
    return e.jsx(
      b,
      {
        className: "text-black",
        to: "groups/" + scenario.name,
        children: e.jsx("li", { className: "ml-4", children: scenario.display_name }),
      },
      scenario.name,
    );
  };

  return e.jsxs("section", {
    children: [
      e.jsxs("h3", { className: "text-3xl", children: [listedScenarioNames.size, " scenarios"] }),
      e.jsx("ul", {
        children: sections.map(([group, scenarios]) =>
          e.jsxs(
            "li",
            {
              className: "my-3",
              children: [
                e.jsx(b, {
                  className: "text-black",
                  to: "groups/" + group.name,
                  children: e.jsx("h2", { children: group.display_name }),
                }),
                e.jsx("ul", { className: "list-disc list-inside", children: scenarios.map(renderScenario) }),
              ],
            },
            group.name,
          ),
        ),
      }),
    ],
  });
}

/** URL of the hero image, resolved relative to this module. */
const Te = "" + new URL("helmhero-28e90f4d.png", import.meta.url).href;

/**
 * Compact leaderboard: loads the schema and one group table, then renders the
 * table through `re` in its mini style. Shows the `T` placeholder until both
 * are available.
 *
 * `D` and `Ee` are defined outside this block; presumably they fetch the
 * schema and the tables of a run group -- TODO confirm.
 *
 * @param {Object} props
 * @param {string} [props.runGroupName] - Group to show; defaults to the first
 *   entry of the schema's `run_groups`.
 * @param {number} [props.tableIndexToDisplay=0] - Index into the group's tables.
 * @param {number} [props.numRowsToDisplay=10] - Forwarded to `re`.
 * @param {number} [props.sortColumnIndex=1] - Forwarded to `re`.
 */
function L({ runGroupName: s = undefined, tableIndexToDisplay: t = 0, numRowsToDisplay: n = 10, sortColumnIndex: r = 1 }) {
  const [schema, setSchema] = i.useState(undefined);
  const [groupTable, setGroupTable] = i.useState(undefined);

  i.useEffect(() => {
    const controller = new AbortController();
    async function load() {
      const loadedSchema = await D(controller.signal);
      setSchema(loadedSchema);
      const runGroups = loadedSchema.run_groups;
      if (runGroups.length === 0) {
        return;
      }
      const groupName = s || runGroups[0].name;
      const tables = await Ee(groupName, controller.signal);
      setGroupTable(tables[t]);
    }
    load();
    return () => controller.abort();
  }, [s, t]);

  if (schema === undefined || groupTable === undefined) {
    return e.jsx(T, {});
  }

  return e.jsx("div", {
    className: "rounded-2xl overflow-hidden border-2 bg-white p-1 mx-2 my-0 overflow-x-auto",
    style: { overflow: "auto", justifyContent: "space-between" },
    children: e.jsx(re, {
      schema: schema,
      groupTable: groupTable,
      numRowsToDisplay: n,
      sortColumnIndex: r,
      displayColumnIndexes: [0, 1],
      sortable: false,
      miniStyle: true,
    }),
  });
}
sm:py-10 sm:mb-96 md:mb-96 lg:mb-0 xl:mb-0 2xl:mb-0",children:[e.jsx("div",{className:"flex flex-col text-center mb-10 justify-start",children:e.jsx("h1",{className:"text-3xl sm:text-4xl mb-3 sm:mb-4 mx-2 mt-2",children:e.jsx("strong",{children:"A holistic framework for evaluating foundation models."})})}),e.jsxs("div",{className:"flex flex-col md:flex-col lg:flex-row lg:justify-center",children:[e.jsx("div",{className:"w-full lg:w-1/2 flex justify-center mb-4 lg:mb-0 h-full py-10",children:e.jsx("img",{src:Te,alt:"HELM Hero",className:"object-contain w-96"})}),e.jsxs("div",{className:"py-2 rounded-xl bg-gray-100 h-full",children:[e.jsx(L,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(b,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-2 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const Pe=""+new URL("ai21-0eb91ec3.png",import.meta.url).href,Be=""+new URL("aisingapore-6dfc9acf.png",import.meta.url).href,De=""+new URL("aleph-alpha-7ce10034.png",import.meta.url).href,He=""+new URL("anthropic-70d8bc39.png",import.meta.url).href,Ue=""+new URL("bigscience-7f0400c0.png",import.meta.url).href,Oe=""+new URL("cohere-3550c6cb.png",import.meta.url).href,Fe=""+new URL("eleutherai-b9451114.png",import.meta.url).href,_e=""+new 
URL("google-06d997ad.png",import.meta.url).href,le="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAoAAAAEBCAMAAADfF+TxAAAABGdBTUEAALGPC/xhBQAAACBjSFJNAAB6JgAAgIQAAPoAAACA6AAAdTAAAOpgAAA6mAAAF3CculE8AAAC0FBMVEUAAAAgcMIfcMEfcMEfccEfcMEfcMEfb8Efb8AfccAfcMEfbsEfbMEAAP8gcMIggL8fcMEfcMEfcMAfcMEeccMecMEecMIfcMIfcMAeccEndsQfbsEfcMAfb8EfccAeb8EXdLkgccEfcMEfcMEecMEhccEAgP8fb8EAVaoeb8IfcMIeccEaZswebsIfcMEgcMEac78fccIeacMfcMEfcMEecMEcccYrgNUfcMIeb8EfcMIfccEfcMEeb8AfcMIjdMUfcMEfcMEjcsEcccYecMAeccEfb8IfcMEgccIfcMEkbcgfb8IfccEfcMEdbb0fcMIkbcIfcMEba7wfcMIfcMEfcMEfcMEfb8IgccAfccAgcMEfcMEgcMEfb8EfccAfcMEfccAeb8MgcMEgc78ecMIfccEfcMEhb8IgccEfb8IfcMEecMEfcMIdcb8fcMEdbMQfcMEfcMEVar8fb8IfcMEgccIfcMAfcMEgcL8gcMEfcMEfcMEgccIicsMfcMIeb8AecMEfcMIfccEeb8AfcMIgccEgcMAgcL8gcL8fb8EfcMEfb8EfccEfcMEAgL8fcMEgb8Ifb8Igdb8gccIfcMIfcMEfb8Egb8EcccEfb8EeccAfcMEgccEfcMEdb8AgcMEgb8EzZswkbbYecMAgcr8hccIgcMEfccEgcMIgcMEfb8Edbr8fccEdb8Efb8Eccb0fb8AfcMEgccIfcMEfcMAhb8EecMEfcMEfcMAfcMIfcMAgcMEfcMEecMAfcMEid7sfcsAecb8ib8QfcMEfcMEfb8EfcMEeb78ecMEfcMEfcMAhb7wfcMAfcsEfccAfccEfcMEeb8AfcMAccb8fcMEdccAgcMIgcL8bbb8fb8EfcMEfcMEeb8EfccEgcMEecMAeb8EecMAeb8AebsMgccIeccQgcb8gcMEhb8QfcMIfccEecMEebcIfb8EfcMH////Dl82sAAAA7nRSTlMAcM/y7uPRvqeKa0ohAYkI8ciSWyLiVOCbVg06q/ezXguR9vCgRgLnA1zcbwpD6oAUmhH59Y8SBmR3S7zZVaIW+7YdCW1mU8Sq/g51zJ0jwxX6E31itN23YVGy5bG1o8t6TMooO9X8LoiF5tq7NP0ai/QMbN9YOaQgkO3kPyaWZbAyc36/eFlAEJXvjIN7BOlHnhhPpvPXVy2uTYRf1jWZZwUHnzg2ocV50qUsrE7HG47oaMCCPpjhco1JwttdlA9BRB5CdHy9PMnT2BevMWqcKUXQJOw9UDAcY63rh96phpd2bjNxK0i6JxlaqCrOBCMOsQAAAAFiS0dE77iw4qEAAAAHdElNRQfoBhEVHhJsM9kZAAAK9ElEQVR42u3d+Z/XRR3A8UEOS/gK2QqILCIL666AEATLEYfIsV/ABBQhFYSVgCXkKpCIU6UAFREKCwrkMlSSzMoslIRKDeUyCcUOOuyev6EUEWH3+92d2fl85j3T6/Wz85mZz/e5DxXmO6u0x+qpC7tIS6h+g4aNLv7Yxy9p3CSjHFQvlUVfqtzWtFkqy1YAzEvxE5d9siAMgJdn3AJsrgEopBYtr2glH6C+0i3A1gAUVGGbq9pKB3i1U3+t2gFQVkXtOxSLBnhNiUuApRqA4rq2YyfBAHVnlwDbAFBi13XpKhfgpxz669YdgDL7dI+eUgE2KHMHsJcGoNQa9C6TCVD3cQewLwAF95l+MgH2d+ZvwEAAiu76QRIB1r/BFcDBGoCyKxoiEKAe6grgMACKr7ynPIDljvwNygJQfg07iwNYONwNwBEagAGU7SINoC
51A3AkAMPoxjJhANs48fdZDcBAummULICjx7gAeHOaAFt47JYqWx97a2CNq8VfzY1PcT2fcwHwthQXrIiIiIiIiIiIiIiIiIiIiIiIiIgo/G73WK8qqwnuQOr53VHtKx6f+jo62GqYkPpSOZLvtGo/+XqpL2Oipb+CbOpLBaDT7syIAGh7RcIkDcDAmywCoG5uB7ARAEMHWCEDYGsrf3dpAIYOUE8RAdDuioQuAAwfYLkIgHZXJHwegOEDnNpEBECbKxKmaQCGD1BPFwHQ5oqESgDGAHBGiQSAupc5wC8AMAaAVf8w2gvAmcb+7tYAjAJgfxEAZ802BTgHgHEAnDtPAkD9RUN/mS8BMA6Aer4IgDcZAlygARgJwHtEAMwuNAP4ZQDGAnBRiQSAeoSRv+KvADAWgBf+5l5PAEcaAVzs6V2pgR4bHCnAShEA9RITgEt9ASQiIiIiIiIiIiIiIiIiIiIiIiKi/4uWVVi0PKnVrLBZTcU8m6nuHWnbfaZT3T8yoe7sWdPUK5OaeuRXHX3kVpcmtUgK4NesDt6uspmqh/U539WmU61J7MhxTVckFDdM8SsXAEwJ4ANyAPatYeYEvwwCQG8AH5QDsKYrEpYCMEKAD8kBqAfnnXjtwwCMEOAQQQCH5Z14nQZghAAnCwKY/4qERwAYI8AVggDmvSKhbBEAYwS4XhLAfFckbNAAjBBgYYkkgPmuSPg6AGMEeJESBfDmnNN2+gYAYwQ4URbAjTmnfVQDMEaAK2UB1N/MNe23ABglwE3CAG7OMevwdgCMEWB2gDCAt2aqn/XbGoAxAvyOEgZQb6l+1usBGCXAreIAVn86p+toAEYJsFQcwNuLq5v0MR0GwLYDLBqTFMBtNqsZkLGZqlM3u9aaT7WqW7JlXO6vtm1XREREREREREREREREREREREREJKgNOyyak9RqlmyV207rXfVIY3nnLo3blcrr6OzoI4/hRHQqbbQ/gWl1Ijpr+s8XnJ3u8aSn4ki+h2ZNUekC/K7pgEkfzNa0yPRHaw0A5QNcrVIGuNv0Yo0nPphtqPHlHgCUD3DGqrQB1nvSdMRTZ2bbYzpuCQDFA8xOVqkD/J7dFQmtTC9EGKkAKB7gpSp9gMbXS565IuEq04lGAFA8wKe3eQCobrS6IsH0QoTsQgBKB3jdXuUDYAfTId9XFhciDFMAFA5w1jrlBWDJM6b/p1Si1A9M5xkMQOkA6/yqbT/g1aZjFiv1rOlP12wACgf4Q+UL4I9Mx/xYzZ5qOKSvAqBsgFuVN4CZ5wzHLCr7iek0ywAoG+Ada/0BVM+bDvppheGA0WMAKBrg2KbKI8BdpoN+ZnquYIICoGSAT25TPgGqfUlv8AUASgb4YrHyC7Ay4Q0W7gegYIDTHe3K/gOelvAO9yjvABf+3KKdSQHs9pKgdjn7sbKZ/cD7Qw8mu8WFdV0fERERERERERERERERERERERERCelgpUW/SGo1BdMFNanM0a5+afOOrzw3vqTS/eZecri+OsWJ6Nz9ap6bXRXV9cRxX/d7e9nl+gCYUK/0lAFwmfOdNSwGYAAA9auzRQDs1t31xp5XAAwBoP71IQkA1VjX+9oFwDAA6idWSQD4guNdPZcBYCAA9f3bBQA0vnfX6AunAJQMUL9W4h+g+c3j+XsdgOEA1JcJANjP6Y6uVQAMCKAe7x9g02YuN/QgAIMCWPi6d4DqHpcbegqAQQHU+/Z7B7jJ4XY2KgCGBVAvz/gGuG2Ru93cB8DQAOrDvgGqI+42cxSAwQGcW+Ab4DFnezmuABgcQD3ON0Dj3xqSs90ADBDgmctsPQJUL7rayRsADBHgbw55Bvimo41UKFEAp5VadCIpgKP6yW2J9a6usHnHd134lOLmbvbx24TWR0RERERERERERERERERERERERD7bctii0qRWM9tmNYetbjJdMMSuk+ZTvWWzqWnVP+vQEPsap7A+42I4EW11gVUPywPFFieBXZ44blpkfxZ6dw
rrA2DSAN/2C1DtsPaXbQLACACeyvgF2MEa4DAFwAgAVvlad8oAS66xXfglAIwCYKlfgOoyy3UPPATAKAC+4xngAst1X68AGAXA2zwDVPtcf7MegEEB/J1vgJutll04HIBxAMw29QzwqNWyf68AGAdAXeAZoDpu87wTAIwF4Mu+Afa2eNya7QCMBWA/3wAHZc0f11IBMBaAHX0DVDeZP+4gAKMBeJ93gIeNnzajBIDRAPyDd4BdR5s+7XkFwGgA9vAOUE0wfdrdAIwH4B/9Ayw1fNg+JRbglNMWPZYUwEE2qzlt9fssO/e2rbHpVH+y2dSf8z1xnuGS16W8PiIiIiIiIiIiIiIiIiIiIiIiInLaBo9V/fbssX4xtOX8TRWk9/6qdKgWy+1QCyaNk9qD0h6remL2Ih1Dfzl/U/XSe39VGt6u5scsrQXAoqT2AMAE+qscgOptN9+iB2BAXZMRBLB5jU95LgPAuAC2V4IA1nxjea2+QgrAgHpUEkDVvqanPAXAuADObSsK4LoaHnJcATAugBOVKIAlz+R/yGkARgbwXVkAa7ixPLsQgHEBfHitMID5byx/VgEwLoCVShjA/DeWDwFgXABHnxQHMN+N5d3HADAugP2VOID5bixfrgAYF8At8gCqRrkf0Q+AcQGcqQQCXJnzCc1uAGBUALM7JQI8kPPG8h0KgFEBLFcSAea+sfwYAKMCOPWoTICHczyg6p9Zpg/wao8trrLPjtOD7q1qP7zF6b2/HHXNsd75tT44/05Se+A7CURERERERERERERERERERERERG56w2P7q6zmZEFoDa/5Fe9P7/3lrdVHln3AkEmTpPbAiei6Na4WAP2fiD7TR7+e2doQIEfyZXZ1iQoHoHr13NChAIwA4MDDtfrwxAB86MOR9W8AYPgAT9XyOxliABZ8+PXMxxUAgwf4t70qMIDqlbMj3wVg6ADbdcyo4ACeXcmpbQAMHGDF380/dv8A/zH1zMClCoBBA6y/cpsKEaC6+MzANwEYMsBZ/ddb/YtPAMBl74+7vBiAAQN8eqflf3kJANit+3vj5igABgvwlRNKhQtQtXlv3AoABgowO3OTxV+3SwL4z/8N26cAGCTAwpZ3WZ33kASw7VytuwAwQIBTh3XsanngSBJA9S+t/w3A0ADOGtZxvbJOFMAN+j8KgEEBvPXI/FF1OnIpCmDZohEADAXgrH2vbd5wqM5nfkUBVEcWygK4x2N9quzz/BtSl2416slyo8bOzNWe8kce2H3LsSXb3Rw675Pe+6tFe6328HhSe/gvDuj5ccZcNDsAAAAldEVYdGRhdGU6Y3JlYXRlADIwMjQtMDYtMTdUMjE6MzA6MTgrMDA6MDDt4fgHAAAAJXRFWHRkYXRlOm1vZGlmeQAyMDI0LTA2LTE3VDIxOjMwOjE4KzAwOjAwnLxAuwAAAABJRU5ErkJggg==",Ve=""+new URL("meta-5580e9f1.png",import.meta.url).href,ze=""+new URL("microsoft-f5ee5016.png",import.meta.url).href,qe=""+new URL("mistral-18e1be23.png",import.meta.url).href,We=""+new URL("nvidia-86fa75c1.png",import.meta.url).href,Ge=""+new URL("openai-3f8653e4.png",import.meta.url).href,Ke=""+new URL("tii-24de195c.png",import.meta.url).href,Qe=""+new URL("together-a665a35b.png",import.meta.url).href,$e=""+new 
URL("tsinghua-keg-97d4b395.png",import.meta.url).href,Je="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAASYAAABfCAYAAABFnmpnAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAlhSURBVHgB7d3xddtGEgbw7+np/8gVeFLBqYOwg/gqOKaC+CqIU0GcCqxUcOnAvAruUoEnFZxcwUUjEDYMA8QCmN0dgN/vPUggAHFpejGcXSyWQON/T8v/HZa3SOdR3vvEsg5O5dnyZuD5xfH5lywfzu/FL0/L8fx61njj+Lr63js9d87lPYZFfb1DDpVf0wd8rpOvMLNO3p5/Pz4td1jvm8TjPMoygvIU8ch5OXS2nZ6Wn8+/iUoTfK6Tr8/bHtDUSZ3645vz78kDE71IPM4rMKUS+FFswwHNJ9Y71AngRH1HNJmUZVEXY0AbmP6Ej5eJxwl8iPNxKR6xLUc0AUpAFINlUP/BhTrpnTGlZkKeGVNqMPSi2B4BgxPFIrhQJ70DkyQet+Wm3NYyppagqQil33uiMfK0/GtoRxuYPE+2lIov8PNtwjFeWdV/sW2Cpn1PFMU9Bq50e2dMJuXKnOentqCcrWZLXUd8efWOqLYf0TuPc2RMJTOYVAIfin34CURxWKLyurshR8YkCceUzphSjknhdfWytgPY10Sx/AOdOnnT2aHwced0TKqp7MuzrD005VpHEMVh5+mr9kE3MHmddKU7v6d4BibFfnwPoli+a1e6gekP+EjpPyrZlJvaP8eeMqZ7EMVyaFdyZExTt6UIfAnK2fpwgS77cGBwokgE56QlRx/TVMYk8DWVfQn87CljMgxMFI3Yj1p9TJ5KBSbF/jAwUTTPdbLGVTmBv0tZmlegVOxP6jQ1RKU8B6bbzgaFj7vzMpaBCcryCkwfUc7v+Pr9a/uEBH6YMVGqUnXy+cMyR2Bqn3wsMOVo6tlo87HBj16jzBXl/IrxCd4e0AxG81BjkOU/Z5R7hN+/9Tc0712KEn2JD0/Lv7EdrzF8jgmaG3G9PuTEftz2NnrNZGlX5sYCRY6TQS7s21tTziqIjUHy+HfVCExzrmwe4EcRazZPC0oP2D59Wn5AM7+SB7EfNwOFuD35iBz3yd0t3DeHIgb78Jhzcl9SIzDR/lh9VDjKFZhKBIrU5xT4iDRUwGswrCl9QzXtk1ez9KtxTMarg1cW7ltq7OTyDIIfEIeCKBbX4UZ7yZjGRpsL/ESaWSBS9kZkXOtkrsA0Nj4mV59G7vIURFRMPzB5RT2ZuT1XeXsNTKUn9iMq6WU/MHld7ZGR7bkypruZ2+cqObgyBZtyFI3CUa6MaSwgCPLIXZ4iFl7mp2gEfj4OBSaP4JQ7gxnyMmN5ilj2erWRyDzeDG2Ej5yBYsiLxNewhCIWZkwUjWudHApMCh9DgUKQjwxs83qzovXpCPxE6z+jbfobfDyfa0OByWu8jgxsyznK+C5x2xLRmjtelcCwI5083MOH2o+cTbkIE8YJfEQaXCnwu7nV6yosXbcDnFsnOZtyOQNFankeb1a0jMLzyyqZLZEHr6lpzPN9oDkzJknc5qXfTPSK4JGyCgtKR/jxvBmYro+dY7/At04+n2+3Yzsc9G8Tyd2063e2C3zU6By2ibeGZgv0fg9PIEpzGtiWo06OBiavjKkfKHIHplwZk6K8HP/hQ06gWgQ+fYV2IpdokgvyU1wITAof/UAhyOtu4vFSe+2HOYF9TDX9BJ/+QsF+/h9P7crNyAGK9XIFitTyBD4U+/QbiGL5VCfHApNHBJaJx976zZ8tN+VyU+xjvmnaD2vCndoHY4HJ62pNjkBxSbfDXeBjj82dn0EUy9+7D3JmTKYbKEoEphcjZa+xt0GI9tVQDyCKwz4otbshZx+T6U5CVmLSe+ms73Fw5VrWhn8NojgsKL3pb8ydMUlnvUTG
5N10VOyHZUpHEMVgMca+/PTN0M7bkT9S+OgGB0F+MrK+1B4yphOaT6UTiGI4ofmSTB07oGRgYsZU3gMG2u9EFVmW9Dsm6mTuply3X6nkVTmBj0izCixxRDNlyzuUyViJpti9dZN18lJg8ghO7VUyQRlteRzD9KUjmu+WfwWiGI5Py3uMzON0c+EPPQJTmzEJymjLY2D6mr0ndnPwPYhiEDTBSfo7LgUmj/E7awLEksDYlifwscfBlRacSjSriVJYXXzX33gpMHlM97EmUCwZfS69ctdS7I+AY5kolgN6My1cCkyK9dr71wTzLc3Y2jLX8upnW8KuWjyguayaY+T59yCa5wGf66TC3xczLdxeOFDhw66ULQkUel4E83wDn6acoh7LaLpXBAXNQDSvKUzvz0uOoEf79EPvscC3Th7QxInROb9bXtnCt1gemJa8BivP4/aXSP1LiuYqxq/wcwDRcoqmTp7g59CulMiYLCgtCRQWGKyfae5VpD33L719Wn6ED8+vgKL5TvCpY7W/F9AG8B7gw85168YoFpiWBIulfTxL+7T6Ig6uVCxr3g7hsIG67IbqB2yfZ3fApwRmqinnNWGcYD7FsuAo2PftKF5NTA4ZIA9WHxU+pF25wXSha1kUnHsSPGJdxsTANE1AFIu0K1OBSbHe0mbc0vJdv0OdiMqbCkwe/SwHzKe937nLu/QaiKiwEk25JRnTR8fyl2LGRFRJiabcEnr+XWv0NQceElVUImNa4nFkvUb5RFTYVGCqlTnoyHqN8omosKgZk3bWawx03PrMlUSbFrWP6bHya2BTjqiiqcBkFOVpZ52d30TX4dO5nhKYanc+M3vJg+8rRTMrMCnK0t7jGtnLNWRMDEwUVkpgKt0RrL3HHC5AdGUiNuX688soylIQUVVbaMqNbStZPhEVFDEwPSZuy6X2jIBEV48Z09c4VICosoh9TDqwrWQWw45vospSA1PJk3WoLEU5CiKq6jbxOAsWHtPVptDEbbkwY6IS7EtHBX7eYkd1NzUwWb+LoIzand8fQJTfq/Pi5R12FJhSmnKmVB+PztyeA2cWIKosNTApytCZ20uVT0QFRQtMY5lZqRRVQUTVpQamCIHh0j4vHFxJFEC0jOlx4T4vCiKqbksZ0x+oWz4RFbKVzm/DjInoSqQGJqPI7zFw+URUyJzAVDtjKVE+B1cSBTAnMJXo46mdMV3T4EpmhxSNtiuRMiZduX+taztRGZgorEh9TDqxP/eJxHmYiIKI1sc0VX7O18DBlURBzAlMuTMKTTgmZ2BSEFEIkTKmlI7nnMGRfS5EQWypj8nkbG6xj4koiNSJ4lon5JswThOOseDxHfJYmzEpyvjTqayp+xIV6ynW8Xod7XOtpdgOz/duisLHp8TjLzZXn0YvVQJMAAAAAElFTkSuQmCC",Ye=""+new URL("yandex-38e09d70.png",import.meta.url).href,Xe=""+new URL("01-694cb9b7.png",import.meta.url).href,Bt=[Pe,Be,De,He,Ue,Oe,Fe,_e,le,Ve,ze,qe,We,Ge,Ke,Qe,$e,Je,Ye,Xe];function ue(){const[s,t]=i.useState(void 0);return i.useEffect(()=>{const n=new AbortController;async function r(){const a=await D(n.signal);t(a)}return r(),()=>n.abort()},[]),s?e.jsxs(e.Fragment,{children:[e.jsx(Pt,{}),e.jsxs("div",{className:"mx-auto text-lg px-16",children:[e.jsxs("div",{className:"container mb-12 mx-auto text-lg px-16",children:[e.jsxs("div",{className:"flex flex-col 
sm:flex-row justify-center mt-10 mb-10 flex gap-2 sm:gap-8 md:gap-32",children:[" ",e.jsx("h1",{className:"text-4xl mx-4 mt-40",children:e.jsx("strong",{children:"Our Partners"})})]}),e.jsx("ol",{className:"my-8 flex flex-col gap-32",children:e.jsx("li",{children:e.jsx("div",{className:"flex flex-wrap justify-center max-w-[1100px] mx-auto w-auto",children:Bt.map((n,r)=>e.jsx("div",{className:"w-24 h-24 flex items-center m-6",children:e.jsx("img",{src:n,alt:"Logo",className:"mx-auto block",sizes:"100vw"})},r))})})})]}),e.jsx("div",{className:"container mx-auto",children:e.jsxs("div",{className:"grid grid-cols-1 sm:grid-cols-2 gap-8",children:[e.jsx(Ce,{models:s.models}),e.jsx(ke,{runGroups:s.run_groups})]})})]})]}):null}function Dt(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"Massive Multitask Language Understanding (MMLU) on HELM"}),e.jsxs("div",{className:"flex flex-col lg:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsxs("p",{children:[e.jsx("strong",{children:"Massive Multitask Language Understanding (MMLU)"})," ",e.jsx("a",{href:"https://arxiv.org/pdf/2009.03300.pdf",className:"link",children:"(Hendrycks et al, 2020)"})," ","is a multiple-choice question answering test that covers 57 tasks including elementary mathematics, US history, computer science, law, and more. We publish evaluation results from evaluating various models on MMLU using HELM. 
Our evaluation results include the following:"]}),e.jsxs("ul",{className:"my-2 list-disc list-inside",children:[e.jsx("li",{children:"Simple, standardized prompts"}),e.jsx("li",{children:"Accuracy breakdown for each of the 57 subjects"}),e.jsx("li",{children:"Full transparency of all raw prompts and predictions"})]}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://crfm.stanford.edu/2024/05/01/helm-mmlu.html",children:"Blog Post"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsx("div",{className:"flex-1",children:e.jsx(L,{})})]})]})}const Ht=""+new URL("air-overview-d2e6c49f.png",import.meta.url).href;function Ut(){const s={fontVariant:"small-caps",fontWeight:"bold"},t=e.jsx("span",{style:s,children:"AIR-Bench 2024"});return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:t}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsx("img",{src:Ht,alt:"AIR 2024 Categories",className:"mx-auto my-4 block w-3/4",sizes:"100vw"}),e.jsxs("p",{children:["We introduce ",t,", the first AI safety benchmark aligned with emerging government regulations and company policies, following the regulation-based safety categories grounded in our AI Risks study, AIR 2024. AIR 2024 decomposes 8 government regulations and 16 company policies into a four-tiered safety taxonomy with 314 granular risk categories in the lowest tier. ",t," contains 5,694 diverse prompts spanning these categories, with manual curation and human auditing to ensure quality. We evaluate leading language models on ",t,", uncovering insights into their alignment with specified safety concerns. 
By bridging the gap between public benchmarks and practical AI risks, ",t," ","provides a foundation for assessing model safety across jurisdictions, fostering the development of safer and more responsible AI systems."]}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://arxiv.org/abs/2407.17436",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(L,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(b,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const Ot=""+new URL("scb10x-204bd786.png",import.meta.url).href,Ft=""+new URL("scbx-71e53e72.jpg",import.meta.url).href;function _t(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"ThaiExam"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsxs("div",{className:"text-center",children:[e.jsx("a",{href:"https://scbx.com/",children:e.jsx("img",{src:Ft,alt:"Logo",className:"inline h-32 mx-4 my-4"})}),e.jsx("a",{href:"https://scb10x.com/",children:e.jsx("img",{src:Ot,alt:"Logo",className:"inline h-32 mx-4 my-4"})})]}),e.jsxs("p",{children:["In collaboration with"," ",e.jsx("a",{href:"https://www.scbx.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"SCBX"})," ","and"," ",e.jsx("a",{href:"https://www.scb10x.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"SCB 10X"}),", we introduce the ThaiExam HELM leaderboard. 
ThaiExam is a Thai language benchmark based on examinations for high school students and investment professionals in Thailand. The ThaiExam leaderboard is the first public leaderboard for large language models on Thai language scenarios, and features evaluations of leading language models. Like all other HELM leaderboards, the ThaiExam leaderboard provides full prompt-level transparency, and the results can be fully reproduced using the HELM framework. We hope that this leaderboard will encourage further work in multilingual language model evaluation."]}),e.jsx("div",{className:"flex flex-row justify-center my-4",children:e.jsx(b,{to:"leaderboard",className:"px-10 btn rounded-md mx-4",children:"Full Leaderboard"})})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(L,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(b,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const Vt=""+new URL("wellsfargo-a86a6c4a.png",import.meta.url).href;function zt(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"HELM Finance"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsx("div",{children:e.jsx("a",{href:"https://wellsfargo.com/",children:e.jsx("img",{src:Vt,alt:"Logo",className:"mx-auto block my-4 w-48"})})}),e.jsxs("p",{children:["In collaboration with"," ",e.jsx("a",{href:"https://www.wellsfargo.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"Wells Fargo"}),", we introduce the ",e.jsx("span",{className:"font-bold",children:"HELM Finance"})," ","leaderboard for ecologically-valid evaluations of leading language models in the financial domain. 
The leaderboard evaluates the ability of language models to perform tasks from financial professions on publicly financial documents across a range of scenarios."]}),e.jsx("div",{className:"flex flex-row justify-center my-4",children:e.jsx(b,{to:"leaderboard",className:"px-10 btn rounded-md mx-4",children:"Full Leaderboard"})})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(L,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(b,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const qt=""+new URL("heim-logo-3e5e3aa4.png",import.meta.url).href;function Wt({metricFieldMap:s,metricGroups:t}){const n=new Set,r=[];return t.forEach(a=>{const l=[];a.metrics.forEach(c=>{const o=s[c.name];o&&(l.push(o),n.add(o.name))}),l.length>0&&r.push([a,l])}),e.jsxs("section",{children:[e.jsxs("h3",{className:"text-3xl",children:[n.size," metrics"]}),e.jsx("ul",{children:r.map(([a,l])=>e.jsxs("li",{className:"my-3",children:[e.jsx("h4",{children:a.display_name}),e.jsx("ul",{className:"list-disc list-inside",children:l.map(c=>e.jsx("li",{className:"ml-4",children:c.display_name},c.name))})]},a.name))})]})}function Gt(){const[s,t]=i.useState(void 0);i.useEffect(()=>{const r=new AbortController;async function a(){const l=await D(r.signal);t(l)}return a(),()=>r.abort()},[]);const n=s?s.metrics.reduce((r,a)=>(r[a.name]=a,r),{}):void 0;return e.jsxs("div",{className:"container mx-auto px-16 text-base",children:[e.jsx("div",{className:"container max-w-screen-lg mx-auto",children:e.jsx("img",{className:"mx-auto w-96",src:qt,alt:"HEIM Logo"})}),e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"Holistic Evaluation of Text-To-Image Models"}),e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center gap-2 sm:gap-8 md:gap-32 my-8",children:[e.jsx("a",{className:"px-10 btn 
rounded-md",href:"https://arxiv.org/abs/2311.04287",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"})]}),e.jsxs("p",{className:"my-2",children:["Significant effort has recently been made in developing text-to-image generation models, which take textual prompts as input and generate images. As these models are widely used in real-world applications, there is an urgent need to comprehensively understand their capabilities and risks. However, existing evaluations primarily focus on image-text alignment and image quality. To address this limitation, we introduce a new benchmark,"," ",e.jsx("strong",{children:"Holistic Evaluation of Text-To-Image Models (HEIM)"}),"."]}),e.jsx("p",{className:"my-2",children:"We identify 12 different aspects that are important in real-world model deployment, including:"}),e.jsxs("ul",{className:"my-2 list-disc list-inside unreset",children:[e.jsx("li",{children:"image-text alignment"}),e.jsx("li",{children:"image quality"}),e.jsx("li",{children:"aesthetics"}),e.jsx("li",{children:"originality"}),e.jsx("li",{children:"reasoning"}),e.jsx("li",{children:"knowledge"}),e.jsx("li",{children:"bias"}),e.jsx("li",{children:"toxicity"}),e.jsx("li",{children:"fairness"}),e.jsx("li",{children:"robustness"}),e.jsx("li",{children:"multilinguality"}),e.jsx("li",{children:"efficiency"})]}),e.jsx("p",{className:"my-2",children:"By curating scenarios encompassing these aspects, we evaluate state-of-the-art text-to-image models using this benchmark. Unlike previous evaluations that focused on alignment and quality, HEIM significantly improves coverage by evaluating all models across all aspects. 
Our results reveal that no single model excels in all aspects, with different models demonstrating strengths in different aspects."}),e.jsx("p",{className:"my-2",children:"For full transparency, this website contains all the prompts, generated images and the results for the automated and human evaluation metrics."}),e.jsx("p",{className:"my-2",children:"Inspired by HELM, we decompose the model evaluation into four key components: aspect, scenario, adaptation, and metric:"}),e.jsx("div",{className:"container max-w-screen-lg mx-auto my-8",children:e.jsx("img",{src:"https://crfm.stanford.edu/heim/latest/images/heim-main.png",alt:"HEIM scenarios, prompts, images and metrics"})}),s&&n?e.jsxs("div",{className:"grid grid-cols-1 md:grid-cols-3 gap-8",children:[e.jsx(Ce,{models:s.models}),e.jsx(ke,{runGroups:s.run_groups}),e.jsx(Wt,{metricFieldMap:n,metricGroups:s.metric_groups})]}):null]})}const Kt=""+new URL("vhelm-framework-a1ca3f3f.png",import.meta.url).href,Qt=""+new URL("vhelm-model-8afb7616.png",import.meta.url).href,$t=""+new URL("vhelm-aspects-1437d673.png",import.meta.url).href;function Jt(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"Holistic Evaluation of Vision-Language Models"}),e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center gap-2 sm:gap-8 md:gap-32 my-8",children:[e.jsx("a",{className:"px-10 btn rounded-md",href:"https://arxiv.org/abs/2410.07112",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"#/leaderboard",children:"Leaderboard"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"})]}),e.jsxs("p",{className:"my-4",children:["Current benchmarks for assessing vision-language models (VLMs) often focus on their perception or problem-solving capabilities and neglect other critical aspects such as fairness, multilinguality, or toxicity. 
Furthermore, they differ in their evaluation procedures and the scope of the evaluation, making it difficult to compare models. To address these issues, we extend the HELM framework to VLMs to present the Holistic Evaluation of Vision Language Models (VHELM). To address these issues, we introduce VHELM, built on HELM for language models. VHELM aggregates various datasets to cover one or more of the 9 aspects:"," ",e.jsx("b",{children:"visual perception"}),", ",e.jsx("b",{children:"bias"}),", ",e.jsx("b",{children:"fairness"}),", ",e.jsx("b",{children:"knowledge"}),", ",e.jsx("b",{children:"multilinguality"}),", ",e.jsx("b",{children:"reasoning"}),", ",e.jsx("b",{children:"robustness"}),","," ",e.jsx("b",{children:"safety"}),", and ",e.jsx("b",{children:"toxicity"}),". In doing so, we produce a comprehensive, multi-dimensional view of the capabilities of the VLMs across these important factors. In addition, we standardize the standard inference parameters, methods of prompting, and evaluation metrics to enable fair comparisons across models. Our framework is designed to be lightweight and automatic so that evaluation runs are cheap and fast. For transparency, we release the raw model generations and complete results on this website."]}),e.jsx("p",{className:"my-4 font-bold",children:"VHELM is intended to be a living benchmark. 
We hope to continue adding new datasets, models and metrics over time, so please stay tuned!"}),e.jsxs("div",{className:"my-16 flex flex-col lg:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsx("img",{src:Qt,alt:"A vision-lanuage model (VLM) takes in an image and a text prompt and generates text.",className:""}),e.jsx("img",{src:Kt,alt:"An example of an evaluation for an Aspect (Knowledge) - a Scenario (MMMU) undergoes Adaptation (multimodal multiple choice) for a Model (GPT-4 Omni), then Metrics (Exact match) are computed",className:""})]}),e.jsxs("div",{className:"flex-1",children:[e.jsx(L,{}),e.jsx(b,{to:"leaderboard",className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})]})]}),e.jsx("div",{className:"container max-w-screen-lg mx-auto my-8",children:e.jsx("img",{src:$t,alt:"An example of each aspect in VHELM: Visual Perception, Bias, Fairness, Knowledge, Multilinguality, Reasoning, Robustness, Toxicity Mitigation and Safety. 
",className:""})})]})}const Yt=""+new URL("accenture-6f97eeda.png",import.meta.url).href,Xt=""+new URL("cresta-9e22b983.png",import.meta.url).href;function Zt(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"HELM Call Center"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsxs("div",{className:"text-center",children:[e.jsx("a",{href:"https://www.accenture.com/",children:e.jsx("img",{src:Yt,alt:"Logo",className:"inline h-12 mx-4 my-4"})}),e.jsx("a",{href:"https://www.cresta.com/",children:e.jsx("img",{src:Xt,alt:"Logo",className:"inline h-8 mx-4 my-4"})})]}),e.jsxs("p",{children:["In collaboration with"," ",e.jsx("a",{href:"https://www.accenture.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"Accenture"})," ","and"," ",e.jsx("a",{href:"https://www.cresta.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"Cresta"}),", we introduce the HELM"," ",e.jsx("span",{className:"font-bold",children:"Call Center"})," leaderboard. 
HELM Call Center is a leaderboard consisting of evaluations of leading language models on scenarios with realistic tasks from the call center context."]}),e.jsx("div",{className:"flex flex-row justify-center my-4",children:e.jsx(b,{to:"leaderboard",className:"px-10 btn rounded-md mx-4",children:"Full Leaderboard"})})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(L,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(b,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const en=""+new URL("cuhk-8c5631e9.png",import.meta.url).href;function sn(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"Chinese Language Models EVAluation Platform (CLEVA)"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsx("div",{className:"text-center",children:e.jsx("a",{href:"https://www.cuhk.edu.hk/",children:e.jsx("img",{src:en,alt:"Logo",className:"inline h-12 mx-4 my-4"})})}),e.jsxs("p",{children:["In collaboration with the"," ",e.jsx("a",{href:"https://lwwangcse.github.io/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"LaVi Lab"})," ","team from"," ",e.jsx("a",{href:"https://www.cuhk.edu.hk/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"The Chinese University of Hong Kong (CUHK)"}),", we introduce the"," ",e.jsx("strong",{className:"font-bold",children:"Chinese Language Models EVAluation Platform (CLEVA)"})," ","leaderboard on HELM. 
CLEVA is a comprehensive Chinese-language benchmark for holistic evaluation of Chinese-language LLMs, and employs a standardized workflow to assess LLMs' performance across various dimensions."]}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://arxiv.org/abs/2308.04813",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(L,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(b,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}function tn(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"HELM Tables"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsx("div",{className:"text-center",children:e.jsx("a",{href:"https://www.ibm.com/",children:e.jsx("img",{src:le,alt:"Logo",className:"inline h-12 mx-4 my-4"})})}),e.jsxs("p",{children:["In collaboration with"," ",e.jsx("a",{href:"https://research.ibm.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"IBM Research"}),", we introduce the"," ",e.jsx("strong",{className:"font-bold",children:"HELM Tables"})," leaderboard on HELM. 
",e.jsx("strong",{className:"font-bold",children:"HELM Tables"})," is a holistic evaluation of leading language models that tests their capability to understand, process and analyze structured tabular input data."]}),e.jsx("div",{className:"flex flex-row justify-center my-4",children:e.jsx(b,{to:"leaderboard",className:"px-10 btn rounded-md mx-4",children:"Full Leaderboard"})})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(L,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(b,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const nn=({id:s,title:t,text:n})=>((t==="Classic"||t==="Lite"||t==="Instruct")&&(t="HELM "+t),e.jsx("div",{className:"max-w-sm rounded overflow-hidden bg-gray-100 hover:scale-105 transition-transform duration-300",children:e.jsx("a",{href:te(void 0,s),children:e.jsxs("div",{className:"px-6 py-4",children:[e.jsxs("div",{className:"font-bold text-xl mb-2",children:[e.jsx("div",{className:"py-3",children:e.jsx("svg",{fill:"#000000",width:"20px",height:"20px",viewBox:"0 0 24 24",xmlns:"http://www.w3.org/2000/svg",children:e.jsx("path",{d:"M22,7H16.333V4a1,1,0,0,0-1-1H8.667a1,1,0,0,0-1,1v7H2a1,1,0,0,0-1,1v8a1,1,0,0,0,1,1H22a1,1,0,0,0,1-1V8A1,1,0,0,0,22,7ZM7.667,19H3V13H7.667Zm6.666,0H9.667V5h4.666ZM21,19H16.333V9H21Z"})})}),t+" →"]}),e.jsx("p",{className:"text-gray-700 text-base",children:n})]})})}));function rn(){const[s,t]=i.useState();return i.useEffect(()=>{fetch("https://raw.githubusercontent.com/stanford-crfm/helm/main/helm-frontend/project_metadata.json").then(n=>n.json()).then(n=>{t(n)}).catch(n=>{console.error("Error fetching JSON:",n)})},[]),e.jsx("div",{className:"p-10 mb-20",children:e.jsx("div",{className:"grid grid-cols-2 lg:grid-cols-3 gap-4",children:s&&s.map((n,r)=>n.id==="home"?null:e.jsx(nn,{id:n.id,title:n.title,text:n.description},r))})})}function 
an(){return e.jsxs("div",{className:"flex flex-col md:flex-row px-6 py-32",children:[e.jsxs("div",{className:"flex-1 p-4 flex flex-col justify-center",children:[e.jsx("div",{className:"flex justify-start",children:e.jsxs("div",{children:[e.jsx("h1",{className:"text-4xl mb-4 mx-4 mt-2",children:e.jsx("strong",{children:"A reproducible and transparent framework for evaluating foundation models."})}),e.jsx("h3",{className:`text-xl
- mb-4 mx-4 mt-2`,children:"Find leaderboards with many scenarios, metrics, and models with support for multimodality and model-graded evaluation."})]})}),e.jsxs("div",{className:"flex flex-col md:flex-row justify-start mt-6 ml-4",children:[e.jsx("button",{className:"px-6 btn btn-grey rounded-md mb-4 md:mb-0",onClick:()=>window.scrollTo({top:760,behavior:"smooth"}),children:e.jsx("div",{children:"Leaderboards ↓"})}),e.jsx("button",{className:"px-6 btn btn-grey rounded-md md:ml-4",children:e.jsx("a",{href:"https://github.com/stanford-crfm/helm",children:"Github"})})]})]}),e.jsx("div",{className:"mx-4 mt-6 md:mt-0 md:w-1/3",children:e.jsx("img",{src:Te,alt:"HELM Hero",className:"object-cover w-full h-full"})})]})}const ln=[Pe,Be,De,He,Ue,Oe,Fe,_e,le,Ve,ze,qe,We,Ge,Ke,Qe,$e,Je,Ye,Xe];function cn(){const[s,t]=i.useState(void 0);return i.useEffect(()=>{const n=new AbortController;async function r(){const a=await D(n.signal);t(a)}return r(),()=>n.abort()},[]),s?e.jsxs(e.Fragment,{children:[e.jsx(an,{}),e.jsx("div",{className:"container py-5 mx-auto text-lg",children:e.jsx("div",{className:"flex flex-col sm:flex-row justify-center mb-10 flex sm:gap-8 md:gap-32",children:e.jsx("h1",{className:"text-4xl mx-4 ",children:e.jsx("strong",{children:"HELM Leaderboards"})})})}),e.jsx(rn,{}),e.jsx("div",{className:"mx-auto text-lg px-16",children:e.jsxs("div",{className:"container mb-12 mx-auto text-lg px-16",children:[e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center mt-10 mb-10 flex gap-2 sm:gap-8 md:gap-32",children:[" ",e.jsx("h1",{className:"text-4xl mx-4 mt-40",children:e.jsx("strong",{children:"Our Partners"})})]}),e.jsx("ol",{className:"my-8 flex flex-col gap-32",children:e.jsx("li",{children:e.jsx("div",{className:"flex flex-wrap justify-center max-w-[1100px] mx-auto w-auto",children:ln.map((n,r)=>e.jsx("div",{className:"w-24 h-24 flex items-center m-6",children:e.jsx("img",{src:n,alt:"Logo",className:"mx-auto 
block",sizes:"100vw"})},r))})})})]})})]}):null}const on=""+new URL("overview-74aea3d8.png",import.meta.url).href,dn=""+new URL("process-flow-bd2eba96.png",import.meta.url).href;function mn(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"Image2Struct: A Benchmark for Evaluating Vision-Language Models in Extracting Structured Information from Images"}),e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center gap-2 sm:gap-2 md:gap-8 my-8",children:[e.jsx("a",{className:"px-10 btn rounded-md",href:"https://arxiv.org/abs/2410.22456",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://huggingface.co/datasets/stanford-crfm/i2s-latex",children:"Latex dataset"}),e.jsx("a",{className:"px-5 btn rounded-md",href:"https://huggingface.co/datasets/stanford-crfm/i2s-webpage",children:"Webpage dataset"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://huggingface.co/datasets/stanford-crfm/i2s-musicsheet",children:"Music sheet dataset"})]}),e.jsxs("div",{className:"flex flex-col lg:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsxs("p",{children:[e.jsx("strong",{children:"Image2Struct"})," is a benchmark for evaluating vision-language models in practical tasks of extracting structured information from images."]}),e.jsx("br",{}),e.jsx("p",{children:"In our tasks, VLMs are prompted to generate the underlying structured information (i.e., code) from an input image. The code can be compiled, and the output image is evaluated against the input image to produce a score. This round-trip evaluation allows us to quantitatively evaluate VLMs on complex tasks with multiple correct answers. 
We create a pipeline that downloads fresh, user-submitted data from active online communities upon execution, evaluates the VLMs shortly, and produces a leaderboard."}),e.jsx("br",{}),e.jsx("img",{src:on,alt:"Evaluation flowchart",className:"mx-auto block w-full",sizes:"100vw"}),e.jsx("br",{}),e.jsx("p",{children:"We introduce 3 tasks:"}),e.jsxs("ul",{className:"my-2 list-disc list-inside",children:[e.jsx("li",{children:"LaTex: equations, tables, plots and algorithms from ArXiV papers."}),e.jsx("li",{children:"Webpages: pages from GitHub written in HTML, CSS and Javascript. ..."}),e.jsx("li",{children:"Music sheets: crops of measures from IMSLP music sheets."})]}),e.jsx("div",{className:"flex flex-row justify-center mt-8",children:e.jsx("a",{className:"px-10 btn rounded-md",href:"#/leaderboard",children:"Full Leaderboard"})})]}),e.jsx("div",{className:"flex-1",children:e.jsx(L,{numRowsToDisplay:12})})]}),e.jsx("br",{}),e.jsxs("div",{className:"flex flex-col lg:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsx("p",{children:"We provide an automated process for collecting new fresh data from online communities, evaluating the models and producing a leaderboard. The pipeline is designed to be executed on a regular basis to keep the leaderboard up-to-date."}),e.jsx("br",{}),e.jsxs("p",{children:["In addition to the automated data collection, we also provide a"," ",e.jsx("i",{children:"wild"})," subset for the LaTeX and webpage tasks that are collected from Wikipedia and various popular websites. 
These instances do not have a corresponding code, and the evaluation is done by our proposed metric: block EMD (Earth Mover Distance)."]})]}),e.jsx("div",{className:"flex-1",children:e.jsx("img",{src:dn,alt:"7 scenarios, 4 models, 4 evaluators and 5 criteria",className:"mx-auto block w-full",sizes:"200vw"})})]})]})}function hn(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"Elements of World Knowledge (EWoK)"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsxs("p",{children:["We present the"," ",e.jsx("a",{className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",href:"https://arxiv.org/abs/2405.09605",children:"Elements of World Knowledge (EWoK)"})," ","leaderboard in collaboration with the EWoK team. EWoK is a benchmark for evaluating world modeling in language models by testing their ability to use knowledge of a concept to match a target text with a plausible/implausible context. 
EWoK targets specific concepts from multiple knowledge domains known to be vital for world modeling in humans, including social interactions and spatial relations."]}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://arxiv.org/abs/2405.09605",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(L,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(b,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}function xn(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"HELM Medical"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsx("p",{className:"my-2",children:"With the increasing scale and impact of language models, there has also been interest interest in using language models in the medical domain. However, the capabilities and risks of these models are not well-understood, and there is significant potential for harm in the medical setting."}),e.jsxs("p",{className:"my-2",children:["To address this, we present the"," ",e.jsx("a",{className:"font-bold",href:"https://arxiv.org/abs/2405.09605",children:"HELM Medical"})," ","leaderboard for evaluation of language models in the medical domain. The HELM Medical leaderboard presents evaluations of leading general-purpose language models as well as language models fine-tuned on the medical domain. 
These models are evaluated on a range of medical tasks based on the benchmarks used in"," ",e.jsx("a",{className:"underline text-blue-600 hover:text-blue-800 visited:text-purple-600",href:"https://arxiv.org/abs/2212.13138",children:"Singhal et al. 2022"}),". We hope that this leaderboard encourages further work in evaluating language models on tasks from the medical domain."]}),e.jsx("div",{className:"flex flex-row justify-center my-4",children:e.jsx(b,{to:"leaderboard",className:"px-10 btn rounded-md mx-4",children:"Full Leaderboard"})})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(L,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(b,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const un=""+new URL("helm-safety-2907a7b6.png",import.meta.url).href;function fn(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"HELM Safety"}),e.jsxs("div",{className:"flex flex-col lg:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsx("img",{src:un,alt:"Logo",className:"mx-auto p-0 block",style:{width:"300px"}}),e.jsx("p",{children:"Language models demonstrate powerful capabilities and pose significant risks. Given their widespread deployment, standardized public benchmarking of such models is vital. While language models are routinely evaluated on standard capability benchmarks, comparable standardization for benchmarking safety risks lags behind. To address this gap, we introduce HELM-Safety as a collection of 5 safety benchmarks that span 6 risk categories (e.g. violence, fraud, discrimination, sexual, harassment, deception). 
We present evaluation results for recent leading open weights and closed models."}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://crfm.stanford.edu/2024/11/08/helm-safety.html",children:"Blog Post"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsx("div",{className:"flex-1",children:e.jsx(L,{})})]})]})}function pn(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"HELM Capabilities"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsx("p",{children:"HELM Capabilities is a new leaderboard for benchmarking the capabilities of foundation models, featuring 6 challenging scenarios."}),e.jsxs("div",{className:"flex flex-row justify-center my-4",children:[e.jsx(b,{to:"#",className:"px-10 btn rounded-md mx-4",children:"Blog Post"}),e.jsx(b,{to:"leaderboard",className:"px-10 btn rounded-md mx-4",children:"Leaderboard"})]})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(L,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(b,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}function gn(){return 
window.PROJECT_ID==="lite"?e.jsx(ue,{}):window.PROJECT_ID==="instruct"?e.jsx(Tt,{}):window.PROJECT_ID==="image2struct"?e.jsx(mn,{}):window.PROJECT_ID==="heim"?e.jsx(Gt,{}):window.PROJECT_ID==="mmlu"?e.jsx(Dt,{}):window.PROJECT_ID==="vhelm"?e.jsx(Jt,{}):window.PROJECT_ID==="air-bench"?e.jsx(Ut,{}):window.PROJECT_ID==="thaiexam"?e.jsx(_t,{}):window.PROJECT_ID==="finance"?e.jsx(zt,{}):window.PROJECT_ID==="call-center"?e.jsx(Zt,{}):window.PROJECT_ID==="cleva"?e.jsx(sn,{}):window.PROJECT_ID==="tables"?e.jsx(tn,{}):window.PROJECT_ID==="ewok"?e.jsx(hn,{}):window.PROJECT_ID==="medical"?e.jsx(xn,{}):window.PROJECT_ID==="safety"?e.jsx(fn,{}):window.PROJECT_ID==="capabilities"?e.jsx(pn,{}):window.PROJECT_ID==="home"?e.jsx(cn,{}):e.jsx(ue,{})}function jn(){return e.jsx(ts,{children:e.jsx(ns,{children:e.jsxs(U,{path:"/",element:e.jsx(Vs,{}),children:[e.jsx(U,{index:!0,element:e.jsx(gn,{})}),e.jsx(U,{path:"leaderboard",element:e.jsx(Lt,{})}),e.jsx(U,{path:"models",element:e.jsx(Ks,{})}),e.jsx(U,{path:"scenarios",element:e.jsx(Qs,{})}),e.jsx(U,{path:"groups",element:e.jsx(Xs,{})}),e.jsx(U,{path:"groups/:groupName",element:e.jsx(Zs,{})}),e.jsx(U,{path:"runs",element:e.jsx(st,{})}),e.jsx(U,{path:"runs/:runName",element:e.jsx(St,{})})]})})})}X.createRoot(document.getElementById("root")).render(e.jsx(rs.StrictMode,{children:e.jsx(jn,{})}));
diff --git a/src/helm/benchmark/static_build/assets/index-66622a2b.js b/src/helm/benchmark/static_build/assets/index-66622a2b.js
new file mode 100644
index 00000000000..0826108043a
--- /dev/null
+++ b/src/helm/benchmark/static_build/assets/index-66622a2b.js
@@ -0,0 +1,10 @@
+import{r as i,a as es,L as j,O as ss,d as ts,u as pe,f as ee,H as ns,h as rs,i as H,R as as}from"./react-d4a0b69b.js";import{g as U,b as z,m as X,s as ge,a as ls,d as ce,y as is,c as oe,e as se,l as te}from"./tremor-54a99cc4.js";import"./recharts-6d337683.js";(function(){const t=document.createElement("link").relList;if(t&&t.supports&&t.supports("modulepreload"))return;for(const a of document.querySelectorAll('link[rel="modulepreload"]'))r(a);new MutationObserver(a=>{for(const l of a)if(l.type==="childList")for(const c of l.addedNodes)c.tagName==="LINK"&&c.rel==="modulepreload"&&r(c)}).observe(document,{childList:!0,subtree:!0});function n(a){const l={};return a.integrity&&(l.integrity=a.integrity),a.referrerPolicy&&(l.referrerPolicy=a.referrerPolicy),a.crossOrigin==="use-credentials"?l.credentials="include":a.crossOrigin==="anonymous"?l.credentials="omit":l.credentials="same-origin",l}function r(a){if(a.ep)return;a.ep=!0;const l=n(a);fetch(a.href,l)}})();var je={exports:{}},$={};/**
+ * @license React
+ * react-jsx-runtime.production.min.js
+ *
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */var cs=i,os=Symbol.for("react.element"),ds=Symbol.for("react.fragment"),ms=Object.prototype.hasOwnProperty,hs=cs.__SECRET_INTERNALS_DO_NOT_USE_OR_YOU_WILL_BE_FIRED.ReactCurrentOwner,xs={key:!0,ref:!0,__self:!0,__source:!0};function be(s,t,n){var r,a={},l=null,c=null;n!==void 0&&(l=""+n),t.key!==void 0&&(l=""+t.key),t.ref!==void 0&&(c=t.ref);for(r in t)ms.call(t,r)&&!xs.hasOwnProperty(r)&&(a[r]=t[r]);if(s&&s.defaultProps)for(r in t=s.defaultProps,t)a[r]===void 0&&(a[r]=t[r]);return{$$typeof:os,type:s,key:l,ref:c,props:a,_owner:hs.current}}$.Fragment=ds;$.jsx=be;$.jsxs=be;je.exports=$;var e=je.exports,Z={},de=es;Z.createRoot=de.createRoot,Z.hydrateRoot=de.hydrateRoot;function us({title:s,titleId:t,...n},r){return i.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?i.createElement("title",{id:t},s):null,i.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M3.75 6.75h16.5M3.75 12h16.5m-16.5 5.25h16.5"}))}const fs=i.forwardRef(us),we=fs;function ps({title:s,titleId:t,...n},r){return i.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?i.createElement("title",{id:t},s):null,i.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M9 12.75L11.25 15 15 9.75M21 12a9 9 0 11-18 0 9 9 0 0118 0z"}))}const gs=i.forwardRef(ps),js=gs;function bs({title:s,titleId:t,...n},r){return i.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?i.createElement("title",{id:t},s):null,i.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M9.75 9.75l4.5 4.5m0-4.5l-4.5 4.5M21 12a9 9 0 11-18 0 9 9 0 0118 0z"}))}const 
ws=i.forwardRef(bs),vs=ws,ve=""+new URL("crfm-logo-74391ab8.png",import.meta.url).href,ye=""+new URL("helm-logo-simple-2ed5400b.png",import.meta.url).href;function ys({title:s,titleId:t,...n},r){return i.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?i.createElement("title",{id:t},s):null,i.createElement("path",{fillRule:"evenodd",d:"M12 2.25a.75.75 0 01.75.75v11.69l3.22-3.22a.75.75 0 111.06 1.06l-4.5 4.5a.75.75 0 01-1.06 0l-4.5-4.5a.75.75 0 111.06-1.06l3.22 3.22V3a.75.75 0 01.75-.75zm-9 13.5a.75.75 0 01.75.75v2.25a1.5 1.5 0 001.5 1.5h13.5a1.5 1.5 0 001.5-1.5V16.5a.75.75 0 011.5 0v2.25a3 3 0 01-3 3H5.25a3 3 0 01-3-3V16.5a.75.75 0 01.75-.75z",clipRule:"evenodd"}))}const Ns=i.forwardRef(ys),As=Ns;function Es({title:s,titleId:t,...n},r){return i.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?i.createElement("title",{id:t},s):null,i.createElement("path",{fillRule:"evenodd",d:"M15.75 2.25H21a.75.75 0 01.75.75v5.25a.75.75 0 01-1.5 0V4.81L8.03 17.03a.75.75 0 01-1.06-1.06L19.19 3.75h-3.44a.75.75 0 010-1.5zm-10.5 4.5a1.5 1.5 0 00-1.5 1.5v10.5a1.5 1.5 0 001.5 1.5h10.5a1.5 1.5 0 001.5-1.5V10.5a.75.75 0 011.5 0v8.25a3 3 0 01-3 3H5.25a3 3 0 01-3-3V8.25a3 3 0 013-3h8.25a.75.75 0 010 1.5H5.25z",clipRule:"evenodd"}))}const Ms=i.forwardRef(Es),Rs=Ms;function Is({title:s,titleId:t,...n},r){return i.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?i.createElement("title",{id:t},s):null,i.createElement("path",{fillRule:"evenodd",d:"M15 3.75a.75.75 0 01.75-.75h4.5a.75.75 0 01.75.75v4.5a.75.75 0 01-1.5 0V5.56l-3.97 3.97a.75.75 0 11-1.06-1.06l3.97-3.97h-2.69a.75.75 0 01-.75-.75zm-12 0A.75.75 0 013.75 3h4.5a.75.75 0 010 1.5H5.56l3.97 3.97a.75.75 0 
01-1.06 1.06L4.5 5.56v2.69a.75.75 0 01-1.5 0v-4.5zm11.47 11.78a.75.75 0 111.06-1.06l3.97 3.97v-2.69a.75.75 0 011.5 0v4.5a.75.75 0 01-.75.75h-4.5a.75.75 0 010-1.5h2.69l-3.97-3.97zm-4.94-1.06a.75.75 0 010 1.06L5.56 19.5h2.69a.75.75 0 010 1.5h-4.5a.75.75 0 01-.75-.75v-4.5a.75.75 0 011.5 0v2.69l3.97-3.97a.75.75 0 011.06 0z",clipRule:"evenodd"}))}const Ss=i.forwardRef(Is),Ls=Ss;function ks({title:s,titleId:t,...n},r){return i.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?i.createElement("title",{id:t},s):null,i.createElement("path",{fillRule:"evenodd",d:"M12.53 16.28a.75.75 0 01-1.06 0l-7.5-7.5a.75.75 0 011.06-1.06L12 14.69l6.97-6.97a.75.75 0 111.06 1.06l-7.5 7.5z",clipRule:"evenodd"}))}const Cs=i.forwardRef(ks),Ne=Cs;function Ts({title:s,titleId:t,...n},r){return i.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?i.createElement("title",{id:t},s):null,i.createElement("path",{fillRule:"evenodd",d:"M11.47 4.72a.75.75 0 011.06 0l3.75 3.75a.75.75 0 01-1.06 1.06L12 6.31 8.78 9.53a.75.75 0 01-1.06-1.06l3.75-3.75zm-3.75 9.75a.75.75 0 011.06 0L12 17.69l3.22-3.22a.75.75 0 111.06 1.06l-3.75 3.75a.75.75 0 01-1.06 0l-3.75-3.75a.75.75 0 010-1.06z",clipRule:"evenodd"}))}const Ps=i.forwardRef(Ts),Bs=Ps;function Ds({title:s,titleId:t,...n},r){return i.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 24 24",fill:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?i.createElement("title",{id:t},s):null,i.createElement("path",{fillRule:"evenodd",d:"M10.5 3.75a6.75 6.75 0 100 13.5 6.75 6.75 0 000-13.5zM2.25 10.5a8.25 8.25 0 1114.59 5.28l4.69 4.69a.75.75 0 11-1.06 1.06l-4.69-4.69A8.25 8.25 0 012.25 10.5z",clipRule:"evenodd"}))}const Hs=i.forwardRef(Ds),Us=Hs;function ne(s,t){return 
t?t==="home"?"https://crfm.stanford.edu/helm/":s?`https://crfm.stanford.edu/helm/${t}/${s}/`:`https://crfm.stanford.edu/helm/${t}/latest/`:"#"}function Ae(){const[s,t]=i.useState([]),[n,r]=i.useState();return i.useEffect(()=>{if(n&&n.title&&n.title!=="All Leaderboards"){const a=n.title==="Lite"||n.title==="Classic"?"HELM "+n.title:n.title;document.title=a+" - Holistic Evaluation of Language Models (HELM)"}},[n]),i.useEffect(()=>{fetch("https://raw.githubusercontent.com/stanford-crfm/helm/main/helm-frontend/project_metadata.json").then(a=>a.json()).then(a=>{if(t(a),window.PROJECT_ID){const l=a.find(c=>c.id===window.PROJECT_ID);r(l)}else{const l=a.find(c=>c.id==="lite");r(l)}}).catch(a=>{console.error("Error fetching JSON:",a)})},[]),n===void 0||n.title===void 0?null:e.jsxs("div",{className:"dropdown z-50",children:[e.jsxs("div",{tabIndex:0,role:"button",className:"btn normal-case bg-white font-bold p-2 border-0 text-lg block whitespace-nowrap z-40","aria-haspopup":"true","aria-controls":"menu",children:[n.title," ",e.jsx(Ne,{fill:"black",color:"black",className:"text w-4 h-4 inline"})]}),e.jsx("ul",{tabIndex:0,className:"-translate-x-36 dropdown-content z-[1] menu p-1 shadow-lg bg-base-100 rounded-box w-max text-base",role:"menu",children:s.map((a,l)=>e.jsx("li",{className:"z-40",children:e.jsxs("a",{href:ne(void 0,a.id),className:"block",role:"menuitem",children:[e.jsx("strong",{className:n.title===a.title?"underline":"",children:a.title}),": ",a.description]})},l))})]})}function M(s){return s.startsWith("http://")||s.startsWith("https://")?s:`${window.BENCHMARK_OUTPUT_BASE_URL.replace(/\/$/,"")}/${s.replace(/^\//,"")}`}function W(){return window.RELEASE?`/releases/${window.RELEASE}`:`/runs/${window.SUITE}`}async function Os(s){try{return await(await fetch(M(`${W()}/summary.json`),{signal:s})).json()}catch(t){return console.log(t),{release:void 0,suites:void 0,suite:void 0,date:""}}}function Fs(){const[s,t]=i.useState({release:void 0,suites:void 0,suite:void 
0,date:""}),[n,r]=i.useState();i.useEffect(()=>{fetch("https://raw.githubusercontent.com/stanford-crfm/helm/main/helm-frontend/project_metadata.json").then(h=>h.json()).then(h=>{if(window.PROJECT_ID){const f=h.find(g=>g.id===window.PROJECT_ID);r(f)}else{const f=h.find(g=>g.id==="lite");r(f)}}).catch(h=>{console.error("Error fetching JSON:",h)})},[]),i.useEffect(()=>{const h=new AbortController;async function f(){const g=await Os(h.signal);t(g)}return f(),()=>h.abort()},[]);const a=n!==void 0&&n.releases!==void 0?n.releases:["v1.0.0"],l=s.release||s.suite||null;if(!l)return null;const c=`Release ${l} (${s.date})`;if(a.length<=1)return e.jsx("div",{children:c});const o=a.indexOf(l),d=o<0?e.jsx(U,{color:"blue",children:"preview"}):o===0?e.jsx(U,{color:"blue",children:"latest"}):e.jsx(U,{color:"yellow",children:"stale"});return e.jsxs("div",{className:"dropdown",children:[e.jsxs("div",{tabIndex:0,role:"button",className:"normal-case bg-white border-0 block whitespace-nowrap","aria-haspopup":"true","aria-controls":"menu",children:[c," ",d," ",e.jsx(Ne,{fill:"black",color:"black",className:"inline text w-4 h-4"})]}),e.jsx("ul",{tabIndex:0,className:"dropdown-content z-[50] menu p-1 shadow-lg bg-base-100 rounded-box w-max text-base",role:"menu",children:a.map(h=>e.jsx("li",{children:e.jsx("a",{href:ne(h,n?n.id:"lite"),className:"block",role:"menuitem",children:h})},h))})]})}function _s(){return e.jsxs("nav",{className:"navbar h-24 px-8 md:px-12 bg-base-100 max-w[1500]px",children:[e.jsx("div",{children:e.jsxs("div",{className:"dropdown md:hidden mr-4",children:[e.jsx("label",{tabIndex:0,className:"btn btn-ghost hover:bg-transparent btn-lg px-0",children:e.jsx(we,{className:"w-16 h-16"})}),e.jsxs("ul",{tabIndex:0,className:"menu menu-lg dropdown-content mt-3 z-[1] p-2 bg-base-100 
shadow",children:[e.jsx("li",{children:e.jsx(j,{to:"leaderboard",children:"Leaderboard"})}),e.jsx("li",{children:e.jsx(j,{to:"models",children:"Models"})}),e.jsx("li",{children:e.jsx(j,{to:"scenarios",children:"Scenarios"})}),e.jsx("li",{children:e.jsx(j,{to:"runs",className:"whitespace-nowrap",children:"Predictions"})}),e.jsx("li",{children:e.jsx(j,{to:"https://github.com/stanford-crfm/helm",children:"GitHub"})})]})]})}),e.jsxs("div",{className:"flex-1 items-center",children:[e.jsx("a",{href:"https://crfm.stanford.edu/",className:"w-24",children:e.jsx("img",{src:ve,className:"object-contain"})}),e.jsx(j,{to:"/",className:"mx-2 w-32",children:e.jsx("img",{src:ye,className:"object-contain"})}),e.jsx(Ae,{})]}),e.jsx("div",{className:"flex-none hidden md:block",children:e.jsxs("ul",{className:"flex flex-row gap-6 px-1",children:[e.jsx("li",{children:e.jsx(j,{to:"leaderboard",children:"Leaderboard"})}),e.jsx("li",{children:e.jsx(j,{to:"models",children:"Models"})}),e.jsx("li",{children:e.jsx(j,{to:"scenarios",children:"Scenarios"})}),e.jsx("li",{children:e.jsx(j,{to:"runs",children:"Predictions"})}),e.jsx("li",{children:e.jsx(j,{to:"https://github.com/stanford-crfm/helm",children:"GitHub"})}),e.jsx("li",{className:"hidden lg:flex",children:e.jsx(Fs,{})})]})})]})}function Vs(){return e.jsxs("nav",{className:"navbar h-24 px-8 md:px-12 bg-base-100 max-w[1500]px",children:[e.jsx("div",{children:e.jsx("div",{className:"dropdown md:hidden mr-4",children:e.jsx("label",{tabIndex:0,className:"btn btn-ghost hover:bg-transparent btn-lg px-0",children:e.jsx(we,{className:"w-16 h-16"})})})}),e.jsxs("div",{className:"flex-1 items-center",children:[e.jsx(j,{to:"https://crfm.stanford.edu/",className:"w-24",children:e.jsx("img",{src:ve,className:"object-contain"})}),e.jsx(j,{to:"/",className:"mx-2 w-32",children:e.jsx("img",{src:ye,className:"object-contain"})}),e.jsx(Ae,{})]})]})}function zs(){return 
e.jsxs(e.Fragment,{children:[window.PROJECT_ID==="home"?e.jsx(Vs,{}):e.jsx(_s,{}),e.jsx("main",{className:"p-8 pt-0",children:e.jsx("div",{className:"mx-auto max-w-[1500]px",children:e.jsx(ss,{})})})]})}async function D(s){try{return await(await fetch(M(`${W()}/schema.json`),{signal:s})).json()}catch(t){return t instanceof Error&&t.name!=="AbortError"&&console.log(t),{adapter:[],metric_groups:[],metrics:[],models:[],perturbations:[],run_groups:[]}}}function Ws({href:s,children:t}){return e.jsx("a",{href:s,className:"link link-primary link-hover",target:"_blank",rel:"noreferrer",children:t})}function _({value:s}){return e.jsx("span",{children:e.jsx(ts,{components:{a:Ws},children:s})})}function P({title:s,subtitle:t,markdown:n=!1}){return e.jsxs("header",{className:"m-4 ml-0",children:[e.jsx("h1",{className:"text-4xl",children:s}),n&&t!==void 0?e.jsx("h2",{className:"mt-2 text-neutral",children:e.jsx(_,{value:t})}):t!==void 0&&e.jsx("h2",{className:"mt-2 text-neutral",children:t})]})}const qs={open:"green",limited:"yellow",closed:"red"},Gs={open:"Open",limited:"Limited",closed:"Closed"};function Ks({level:s}){return e.jsx(U,{color:qs[s],children:Gs[s]})}function k(){return e.jsx("div",{className:"w-full",children:e.jsx("div",{className:"block mx-auto my-24 loading loading-spinner loading-lg"})})}function Qs(){const[s,t]=i.useState([]);i.useEffect(()=>{const c=new AbortController;async function o(){const d=await D(c.signal);t(d.models)}return o(),()=>c.abort()},[]);const[n,r,a]=s.reduce((c,o)=>{switch(o.access){case"open":c[0]+=1;break;case"limited":c[1]+=1;break;case"closed":c[2]+=1;break}return c},[0,0,0]),l=Object.values(s.reduce((c,o)=>{const d=o.creator_organization;return c[d]===void 0?(c[d]={name:d,models:1},c):(c[d].models+=1,c)},{}));return s.length===0?e.jsx(k,{}):e.jsxs(e.Fragment,{children:[e.jsx(P,{title:"Models"}),e.jsxs("div",{className:"overflow-x-auto 
mt-12",children:[e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Creator"}),e.jsx("th",{children:"Model"}),e.jsx("th",{children:"Description"}),e.jsx("th",{children:"Access"})]})}),e.jsx("tbody",{children:s.map(c=>e.jsxs("tr",{children:[e.jsx("td",{className:"text-lg",children:c.creator_organization}),e.jsxs("td",{children:[e.jsx("span",{className:"text-xl",children:c.display_name}),e.jsx("br",{}),e.jsx("span",{children:c.name})]}),e.jsx("td",{children:e.jsx(_,{value:c.description})}),e.jsx("td",{children:e.jsx(Ks,{level:c.access})})]}))})]}),e.jsx(P,{title:"Analysis"}),e.jsxs("div",{className:"grid md:grid-cols-3 grid-cols-1 gap-8",children:[e.jsxs(z,{className:"flex flex-col justify-between",children:[e.jsx(X,{children:"Models"}),e.jsx(ge,{className:"text-6xl md:!text-[96px]",children:s.length}),e.jsx(ls,{values:[n,r,a],colors:["green","yellow","red"]}),e.jsx(ce,{categories:["Open","Limited","Closed"],colors:["green","yellow","red"]})]}),e.jsxs(z,{className:"md:col-span-2",children:[e.jsx(X,{children:"Creator Organizations"}),e.jsxs("div",{className:"flex justify-between mt-4",children:[e.jsx(is,{data:l,category:"models",index:"name",variant:"pie",className:"basis-5/12"}),e.jsx(ce,{categories:l.map(c=>c.name),className:"basis-7/12"})]})]})]})]})]})}function K({to:s,children:t,inTable:n=!1,title:r=""}){return n?e.jsx(j,{className:"link link-hover",to:s,title:r,children:t}):e.jsx(j,{className:"link link-primary link-hover",to:s,children:t})}function $s(){const[s,t]=i.useState([]);i.useEffect(()=>{const r=new AbortController;async function a(){const l=await D(r.signal);t(l.run_groups.filter(c=>!c.todo&&c.taxonomy&&!c.display_name.includes("CLEVA")))}return a(),()=>r.abort()},[]);const n=Object.values(s.reduce((r,a)=>{var c;const l=((c=a.taxonomy)==null?void 0:c.task)||"Unknown";return r[l]===void 0?(r[l]={name:l,value:1},r):(r[l].value+=1,r)},{}));return 
s.length===0?e.jsx(k,{}):(console.log(s),e.jsxs(e.Fragment,{children:[e.jsx(P,{title:"Scenarios",subtitle:"A scenario represents a use case and consists of a dataset of instances."}),e.jsxs("div",{className:"overflow-x-auto mt-12",children:[e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Scenario"}),e.jsx("th",{children:"Task"}),e.jsx("th",{children:"What"}),e.jsx("th",{children:"Who"}),e.jsx("th",{children:"When"}),e.jsx("th",{children:"Language"}),e.jsx("th",{children:"Description"})]})}),e.jsx("tbody",{children:s.map(r=>{var a,l,c,o,d;return e.jsxs("tr",{children:[e.jsxs("td",{children:[e.jsx(K,{to:`/groups/${r.name}`,children:e.jsx("span",{className:"text-lg",children:r.display_name})}),e.jsx("span",{className:"block",children:r.name})]}),e.jsx("td",{children:((a=r.taxonomy)==null?void 0:a.task)||""}),e.jsx("td",{children:((l=r.taxonomy)==null?void 0:l.what)||""}),e.jsx("td",{children:((c=r.taxonomy)==null?void 0:c.who)||""}),e.jsx("td",{children:((o=r.taxonomy)==null?void 0:o.when)||""}),e.jsx("td",{children:((d=r.taxonomy)==null?void 0:d.language)||""}),e.jsx("td",{children:e.jsx(_,{value:r.description})})]})})})]}),e.jsx(P,{title:"Analysis"}),e.jsxs("div",{className:"grid md:grid-cols-4 gap-8",children:[e.jsxs(z,{className:"flex flex-col",children:[e.jsx(X,{children:"Total scenarios"}),e.jsx(ge,{className:"mx-auto my-6 md:mt-16 !text-[72px] md:!text-[96px]",children:s.length})]}),e.jsx(z,{className:"col-span-3",children:e.jsxs("div",{className:"grid md:grid-cols-2 gap-x-12",children:[e.jsx(oe,{data:n.slice(0,Math.floor(n.length/2))}),e.jsx(oe,{data:n.slice(Math.ceil(n.length/2))})]})})]})]})]}))}function Js(){return M(`${W()}/groups.json`)}async function Ee(s){try{return await(await fetch(Js(),{signal:s})).json()}catch(t){return t instanceof Error&&t.name!=="AbortError"&&console.log(t),[]}}function re({children:s}){return e.jsx("div",{role:"navigation",className:"tabs flex-nowrap border-b-2 
border-gray-2 overflow-x-auto overflow-y-hidden",children:s})}function Q({active:s=!1,onClick:t=()=>{},size:n="md",children:r}){return e.jsx("div",{onClick:t,className:`whitespace-nowrap text-${n} mb-[-2px] text-md tab tab-bordered${s?" border-2 border-grey-500 rounded":" border-none"}`,children:r})}function Ys({title:s,titleId:t,...n},r){return i.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",viewBox:"0 0 20 20",fill:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),s?i.createElement("title",{id:t},s):null,i.createElement("path",{fillRule:"evenodd",d:"M4.25 5.5a.75.75 0 00-.75.75v8.5c0 .414.336.75.75.75h8.5a.75.75 0 00.75-.75v-4a.75.75 0 011.5 0v4A2.25 2.25 0 0112.75 17h-8.5A2.25 2.25 0 012 14.75v-8.5A2.25 2.25 0 014.25 4h5a.75.75 0 010 1.5h-5z",clipRule:"evenodd"}),i.createElement("path",{fillRule:"evenodd",d:"M6.194 12.753a.75.75 0 001.06.053L16.5 4.44v2.81a.75.75 0 001.5 0v-4.5a.75.75 0 00-.75-.75h-4.5a.75.75 0 000 1.5h2.553l-9.056 8.194a.75.75 0 00-.053 1.06z",clipRule:"evenodd"}))}const Xs=i.forwardRef(Ys),me=Xs;function F(s){return Number.isNaN(Number(s))?String(s):String(Math.round(Number(s)*1e3)/1e3)}function he({value:s,title:t,hideIcon:n}){if(typeof s.value=="string"&&s.value.includes("⚠")&&(s.value=s.value.replace("⚠","")),s.value===void 0)return"-";if(s.run_spec_names){const r=(()=>{if(s.run_spec_names.length==1)return"/runs/"+s.run_spec_names[0];if(s.run_spec_names.length>1){const a="/runs/?q="+s.run_spec_names.map(c=>`^${c}$`).join("|");return encodeURI(a)}})();return r?e.jsx(K,{to:r,inTable:!0,title:t,children:e.jsxs("div",{className:"flex items-center ",children:[F(s.value),!n&&e.jsx(me,{className:"w-3 h-3 ml-1",stroke:"#cbcbcb",fill:"#cbcbcb"})]})}):t?e.jsx("a",{title:t,children:F(s.value)}):e.jsx(e.Fragment,{children:F(s.value)})}return s.href?e.jsx(K,{to:s.href,inTable:!0,title:t,children:e.jsxs("div",{className:"flex items-center",children:[F(s.value),!n&&e.jsx(me,{className:"w-3 h-3 
ml-1",stroke:"#cbcbcb",fill:"#cbcbcb"})]})}):s.markdown?e.jsx(_,{value:String(s.value)}):t?e.jsx("a",{title:t,children:F(s.value)}):e.jsx(e.Fragment,{children:F(s.value)})}function ae({schema:s,groupTable:t,numRowsToDisplay:n,sortColumnIndex:r=1,sortable:a=!0,displayColumnIndexes:l=void 0,miniStyle:c=!1}){const[o,d]=i.useState(1),[h,f]=i.useState(r);function g(m){return m.length>30?m.substring(0,27)+"...":m}const y=m=>{const w=["AIRBench 2024 -","-book"];if(m.value==="Model/adapter")return"Model";if(w.some(x=>m.value.includes(x))){let x=m.value;return w.forEach(E=>{x=x.replace(E,"")}),g(x)}else return g(m.value)},b=m=>{if(s){const w=s.models.find(x=>x.display_name===m);if(w){let x=w.description;return x.includes("/")&&(x=x.replace("/","_")),x}}return""},A=m=>{d(m===h?o*-1:m===0?-1:1),f(m)},p=m=>{if(s){const w=s.models.find(x=>x.display_name===m);if(w){let x=w.name;return x.includes("/")&&(x=x.replace("/","_")),x}}return""},L=()=>{const m=t.header[h].lower_is_better,w=o*(m?1:-1),x=t.rows.slice();return x.sort((E,R)=>{var I,T;const v=(I=E[h])==null?void 0:I.value,N=(T=R[h])==null?void 0:T.value;return v!==void 0&&N===void 0?-1:N!==void 0&&v===void 0?1:typeof v=="number"&&typeof N=="number"?(v-N)*w:typeof v=="string"&&typeof N=="string"?w===1?v.localeCompare(N):N.localeCompare(v):0}),n>0?x.slice(0,n):x};function u(m){const w=m.lastIndexOf(" - ");return w===-1?m:m.substring(0,w)+"*"+m.substring(w+1)}const C=m=>{const x=u(m).split("*")[0].trim();if(s){const E=s.run_groups.find(R=>R.display_name===x||R.short_display_name===x);if(E)return E.name}return""};return e.jsxs("table",{className:c?"table w-full":"rounded-lg shadow-md table",children:[e.jsx("thead",{children:e.jsx("tr",{children:t.header.filter((m,w)=>l===void 0||l.includes(w)).map((m,w)=>e.jsx("th",{className:`${w===h?"bg-gray-100":"bg-white"} ${w===0?"left-0 z-40":""} ${m.description?"underline decoration-dashed decoration-gray-300 ":""} whitespace-nowrap px-4 sticky 
top-0`,title:m.description?m.description:"",children:e.jsxs("div",{className:c?"flex gap-2 items-center":"z-20 flex justify-between items-center min-w-48 w-48 max-w-48 text-wrap",children:[e.jsx("span",{className:"inline-block w-full break-words",children:y(m)}),a?e.jsx("button",{className:"link",onClick:()=>A(w),children:e.jsx(Bs,{className:"w-6 h-6"})}):null]})},`$${w}`))})}),e.jsx("tbody",{children:L().map((m,w)=>e.jsx("tr",{children:m.filter((x,E)=>l===void 0||l.includes(E)).map((x,E)=>e.jsx("td",{className:`${E===0?"z-20 text-lg sticky left-0":"z-0"} ${w%2===0?"bg-gray-50":"bg-white"}`,children:E==1?e.jsx("div",{className:`${x&&x.style&&x.style["font-weight"]&&x.style["font-weight"]==="bold"?"font-bold":""}`,children:e.jsx(he,{value:{...x,href:"/runs/?q="+p(String(m[0].value))},title:`Click value to see all predictions for: ${p(String(m[0].value))}`})}):e.jsx("div",{className:`${x&&x.style&&x.style["font-weight"]&&x.style["font-weight"]==="bold"?"font-bold":""} ${E===0?"underline decoration-dashed decoration-gray-300 z-10":"z-0"}`,children:e.jsx(he,{value:{...x},title:String(m[0].value)===x.value?b(String(m[0].value)):`Click value to see predictions for ${String(m[0].value)} for ${C(y(t.header[E]))}: ${p(String(m[0].value))}`})})},`${E}`))},`$${m[0].value}`))})]})}function Zs(){const[s,t]=i.useState(0),[n,r]=i.useState(),[a,l]=i.useState();return i.useEffect(()=>{const c=new AbortController;async function o(){const d=D(c.signal),h=Ee(c.signal),f=await d;l(f);const g=await h;r(g)}return o(),()=>c.abort()},[]),n===void 0||a===void 0?e.jsx(k,{}):n.length===0?e.jsxs("div",{children:[e.jsx(P,{title:"Results",subtitle:"Groupings of the processes, methods, and metrics involved in evaluating models, particularly in the context of natural language understanding and question answering.",className:"mb-16"}),e.jsx("div",{children:"No groups found."})]}):e.jsxs("div",{children:[e.jsx(P,{title:"Results",subtitle:"Groupings of the processes, methods, and metrics involved in 
evaluating models, particularly in the context of natural language understanding and question answering.",className:"mb-16"}),e.jsxs("div",{children:[n.length>1?e.jsx(re,{children:n.map((c,o)=>e.jsx(Q,{active:o===s,onClick:()=>t(o),children:c.title},o))}):null,e.jsx(ae,{schema:a,groupTable:n[s],numRowsToDisplay:-1,sortColumnIndex:1,sortable:!0},`${s}`)]})]})}async function Me(s,t){try{return await(await fetch(M(`${W()}/groups/${s}.json`),{signal:t})).json()}catch(n){return n instanceof Error&&n.name!=="AbortError"&&console.log(n),[]}}function Re({schema:s,runGroupName:t,numRowsToDisplay:n=-1}){const[r,a]=i.useState(),[l,c]=i.useState(0);return i.useEffect(()=>{const o=new AbortController;async function d(){const h=await Me(t,o.signal);a(h)}return d(),()=>o.abort()},[s,t]),r===void 0||r.length===0?e.jsx(k,{}):r.length===0?e.jsx("div",{children:"Group currently has no tables."}):e.jsxs("div",{children:[r.length>1?e.jsx(re,{children:r.map((o,d)=>e.jsx(Q,{active:d===l,onClick:()=>c(d),children:o.title},d))}):null,e.jsx(ae,{schema:s,groupTable:r[l],numRowsToDisplay:n,sortColumnIndex:1},`${t}-${l}`)]})}function et(){const{groupName:s}=pe(),[t,n]=i.useState(void 0);i.useEffect(()=>{const l=new AbortController;async function c(){const d=await D(l.signal);n(d)}return c(),()=>l.abort()},[]);const a=(()=>{if(t!==void 0){for(const l of t.run_groups)if(l.name===s)return l}})();return t===void 0?e.jsx(k,{}):a===void 0?e.jsxs("div",{children:['Group "',s,'" not found.']}):e.jsxs(e.Fragment,{children:[e.jsx(P,{title:a.display_name,subtitle:a.description,markdown:!0,className:"mr-8"}),e.jsx(Re,{schema:t,runGroupName:a.name},a.name)]})}async function st(s){try{return await(await fetch(M(`${W()}/run_specs.json`),{signal:s})).json()}catch(t){return t instanceof Error&&t.name!=="AbortError"&&console.log(t),[]}}function le({currentPage:s,totalPages:t,onNextPage:n,onPrevPage:r,className:a}){let l="join";return a!==void 0&&(l=`join 
${a}`),e.jsxs("div",{className:l,children:[e.jsx("button",{onClick:r,className:"join-item btn",children:"«"}),e.jsxs("button",{className:"join-item btn",children:["Page ",s," of ",t]}),e.jsx("button",{onClick:n,className:"join-item btn",children:"»"})]})}const Y=100;function tt(){const[s,t]=ee(),[n,r]=i.useState(),[a,l]=i.useState(Number(s.get("page")||1)),[c,o]=i.useState(!0),[d,h]=i.useState(s.get("q")||"");i.useEffect(()=>{const p=new AbortController;async function L(){const u=await st(p.signal);r(u)}return L(),()=>p.abort()},[]);const f=p=>{p.preventDefault();const u=p.target.q.value;h(u),t({q:u,page:"1"})};if(n===void 0)return e.jsx(k,{});const g=c?new RegExp(d):null,y=n.filter(p=>g?g.test(p.name):p.name.includes(d)),b=y.slice((a-1)*Y,a*Y),A=Math.ceil(y.length/Y);return e.jsxs(e.Fragment,{children:[e.jsx(P,{title:"Predictions",subtitle:"All benchmark predictions"}),e.jsxs("form",{className:"flex mb-8",onSubmit:f,children:[e.jsxs("div",{className:"form-control",children:[e.jsx("input",{type:"text",name:"q",placeholder:"Search",className:"input input-bordered",value:d,onChange:p=>h(p.target.value)}),e.jsxs("label",{className:"label",children:[e.jsxs("span",{className:"label-text-alt flex item-center",children:[e.jsx("input",{type:"checkbox",className:"toggle toggle-xs",checked:c,onChange:()=>o(!c)}),e.jsx("span",{className:"ml-2",children:"Regex"})]}),e.jsx("span",{className:"label-text-alt",children:`${y.length} results`})]})]}),e.jsx("div",{className:"form-control ml-4",children:e.jsx("button",{className:"btn",children:e.jsx(Us,{className:"w-6 h-6"})})})]}),e.jsx("div",{className:"overflow-x-auto",children:e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Run"}),e.jsx("th",{children:"Model"}),e.jsx("th",{children:"Groups"}),e.jsx("th",{children:"Adapter method"}),e.jsx("th",{children:"Subject / 
Task"})]})}),e.jsx("tbody",{children:b.map((p,L)=>e.jsxs("tr",{children:[e.jsx("td",{children:e.jsx(K,{to:`/runs/${p.name}`,children:p.name})}),e.jsx("td",{children:p.adapter_spec.model}),e.jsx("td",{children:p.groups.join(", ")}),e.jsx("td",{children:p.adapter_spec.method}),e.jsx("td",{children:p.scenario_spec.args.subject||p.scenario_spec.args.task||"-"})]},`${p.name}-${L}`))})]})}),A>0?e.jsx(le,{className:"flex justify-center my-8",onNextPage:()=>{const p=Math.min(a+1,A);l(p),s.set("page",String(p)),t(s)},onPrevPage:()=>{const p=Math.max(a-1,1);l(p),s.set("page",String(p)),t(s)},currentPage:a,totalPages:A}):e.jsx("div",{className:"my-8 text-center",children:"No results"})]})}function B(){return window.SUITE!==void 0?window.SUITE:void 0}async function nt(s,t,n){try{return await(await fetch(M(`/runs/${n||B()}/${s}/scenario.json`),{signal:t})).json()}catch(r){r instanceof Error&&r.name!=="AbortError"&&console.log(r);return}}function Ie(s,t){return M(`/runs/${t||B()}/${s}/run_spec.json`)}async function rt(s,t,n){try{return await(await fetch(Ie(s,n),{signal:t})).json()}catch(r){r instanceof Error&&r.name!=="AbortError"&&console.log(r);return}}function at(s,t){return M(`/runs/${t||B()}/${s}/scenario_state.json`)}function Se(){return window.RELEASE!==void 0?window.RELEASE:void 0}async function lt(s){try{return await(await fetch(M(`/releases/${Se()}/runs_to_run_suites.json`),{signal:s})).json()}catch(t){return t instanceof Error&&t.name!=="AbortError"&&console.log(t),{}}}function it(s,t){return Se()?s[t]:window.SUITE}function ct(s){const n={quasi_exact_match:!1,toxic_frac:!0,safety_score:!1,exact_match:!1},r=Object.keys(s);for(const a of r)if(s[a]!==void 0&&n[a]!==void 0)return n[a]?s[a]<.5?[a,!0]:[a,!1]:s[a]>=.5?[a,!0]:[a,!1];return["",!1]}function ot(s){const[t,n]=ct(s.stats);return t===""?null:n?e.jsx(dt,{value:`${t.replace(/_/g," ")}: ${s.stats[t]}`}):e.jsx(mt,{value:`${t.replace(/_/g," ")}: ${s.stats[t]}`})}function dt({value:s}){return 
e.jsx(U,{icon:js,color:"green",children:s})}function mt({value:s}){return e.jsx(U,{icon:vs,color:"red",children:s})}function O({value:s}){const[t,n]=i.useState(!1),[r,a]=i.useState(!1);return e.jsxs(e.Fragment,{children:[e.jsxs("div",{onMouseOver:()=>n(!0),onMouseOut:()=>n(!1),className:"relative",children:[e.jsx("div",{className:"bg-base-200 p-2 block overflow-auto w-full max-h-[36rem] mb-2 whitespace-pre-wrap",children:s}),t?e.jsx("button",{className:"bg-white absolute p-2 leading-none height-fit min-h-none right-1 bottom-1 shadow",onClick:()=>a(!0),children:e.jsx(Ls,{fill:"black",color:"black",className:"text w-4 h-4"})}):null]}),e.jsx("dialog",{open:r,className:"modal p-16 bg-opacity-80 bg-white",onClick:()=>a(!1),children:e.jsx("div",{className:"modal-box max-w-none p-4 whitespace-pre-wrap bg-base-200",children:s})})]})}function Le({mediaObject:s}){if(s.content_type.includes("image")){if(s.location===void 0)return null;const t=M(s.location.replace("benchmark_output/","").replace("prod_env/","../"));return e.jsxs("div",{children:[e.jsx("img",{src:t}),e.jsx("br",{})]})}else if(s.content_type.includes("audio")){if(s.location===void 0)return null;const t=M(s.location.replace("benchmark_output/","").replace("prod_env/","../"));return e.jsx("div",{children:e.jsx("audio",{controls:!0,src:t})})}else return s.text&&s.content_type&&s.content_type==="text/plain"&&s.text.length>1?e.jsxs("div",{children:[s.text,e.jsx("br",{}),e.jsx("br",{})]}):e.jsx("div",{})}function ke({multimediaObject:s}){return e.jsx("div",{children:s.media_objects.map(t=>e.jsx(Le,{mediaObject:t}))})}function ht(s){return Array.isArray(s)?s.length==0?"[]":`[${s.map(t=>String(t).replace(/\n/,"\\n")).join(", ")}]`:String(s)}function xt({request:s}){return e.jsxs("div",{children:[s.request.prompt.length>0?e.jsxs("div",{children:[e.jsxs("h3",{className:"block text text-gray-400",children:["Prompt (",s.request.prompt.length," 
Chars)"]}),e.jsx(O,{value:s.request.prompt})]}):s.request.multimodal_prompt?e.jsxs("div",{children:[e.jsx("h3",{className:"block text text-gray-400",children:"Prompt"}),e.jsx(ke,{multimediaObject:s.request.multimodal_prompt})]}):e.jsx("h3",{className:"block text text-gray-400",children:"Empty Prompt"}),e.jsx(se,{children:Object.keys(s.request).filter(t=>t!=="prompt").map((t,n)=>e.jsxs(te,{children:[e.jsxs("span",{children:[t,":"]}),s.request&&s.request[t]?e.jsx("span",{children:ht(s.request[t])}):"null"]},n+1))})]})}function ut(s){return e.jsx("div",{children:s.map((t,n)=>e.jsxs("div",{children:[t.error&&e.jsxs("div",{children:[e.jsx("h3",{className:"ml-1",children:"Error"}),e.jsx(O,{value:t.error})," "]}),t.text&&e.jsxs("div",{children:[e.jsx("h3",{className:"ml-1",children:"Text"}),e.jsx(O,{value:t.text})," "]}),t.media_object&&e.jsx(Le,{mediaObject:t.media_object})]},n))})}function ft(s){return e.jsx("div",{children:Object.entries(s).map(([t,n])=>e.jsxs("div",{children:[e.jsx("h3",{className:"ml-1",children:t}),e.jsx(O,{value:n===null?"null":n.toString()})]}))})}function pt({predictionAnnotations:s}){return e.jsx("div",{children:s&&s!==void 0?Object.entries(s).map(([t,n])=>e.jsxs("details",{className:"collapse collapse-arrow border rounded-md bg-white my-2",children:[e.jsx("summary",{className:"collapse-title",children:e.jsx(e.Fragment,{children:"View "+t+" annotations"})}),e.jsx("div",{className:"collapse-content",children:Array.isArray(n)?ut(n):ft(n)})]},t)):null})}function gt({predictions:s,requests:t,metricFieldMap:n}){return s.length<1?null:e.jsx("div",{children:e.jsx("div",{className:"flex flex-wrap justify-start items-start",children:s.map((r,a)=>e.jsxs("div",{className:"w-full",children:[s.length>1?e.jsxs("h2",{children:["Trial ",r.train_trial_index]}):null,e.jsx("div",{className:"mt-2 w-full",children:r.base64_images&&r.base64_images.length>0?e.jsxs(e.Fragment,{children:[e.jsx("h3",{className:"mr-4",children:"Prediction 
image"}),r.base64_images.map(l=>e.jsx("img",{src:"data:image;base64,"+l,alt:"Base64 Image"}))]}):e.jsxs(e.Fragment,{children:[e.jsxs("h3",{children:[e.jsx("span",{className:"mr-4",children:"Prediction raw text"}),e.jsx(ot,{stats:r.stats})]}),e.jsx(O,{value:r.predicted_text}),r.mapped_output?e.jsxs(e.Fragment,{children:[e.jsx("h3",{children:"Prediction mapped output"}),e.jsx(O,{value:String(r.mapped_output)})]}):null]})}),e.jsx(pt,{predictionAnnotations:r.annotations}),e.jsxs("div",{className:"mx-1",children:[e.jsx("h3",{children:"Metrics"}),e.jsx(se,{children:Object.keys(r.stats).map((l,c)=>e.jsxs(te,{children:[n[l]?e.jsx("span",{title:n[l].description,children:n[l].display_name}):e.jsx("span",{children:l}),e.jsx("span",{children:String(r.stats[l])})]},c))})]}),e.jsxs("details",{className:"collapse collapse-arrow border rounded-md bg-white",children:[e.jsx("summary",{className:"collapse-title",children:"Request details"}),e.jsx("div",{className:"collapse-content",children:e.jsx(xt,{request:t[a]})})]})]},a))})})}const jt="correct";function bt({references:s}){return e.jsxs("span",{children:[e.jsx("h3",{children:"References"}),e.jsx("ul",{children:s.map((t,n)=>e.jsxs("li",{className:"bg-base-200 p-2 block overflow-auto w-full max-h-72 mb-2 whitespace-pre-wrap",children:[t.output.text,t.tags.map(r=>e.jsx(U,{className:"mx-2",color:r===jt?"green":void 0,children:r}))]},n))})]})}function wt({instance:s,requests:t,predictions:n,metricFieldMap:r}){return e.jsxs("div",{children:[e.jsx("h3",{children:"Input"}),s.input.multimedia_content!==void 0?e.jsx(ke,{multimediaObject:s.input.multimedia_content}):s.input.text.includes('
0?e.jsx(bt,{references:s.references}):null}),e.jsx("div",{children:n&&t?e.jsx(gt,{predictions:n,requests:t,metricFieldMap:r}):null})]})}function J(s){return s.includes("gpqa")||s.includes("ewok")}async function xe(s,t,n,r){const a=h=>Uint8Array.from(atob(h),f=>f.charCodeAt(0)),l=await window.crypto.subtle.importKey("raw",a(t),"AES-GCM",!0,["decrypt"]),c=new Uint8Array([...a(s),...a(r)]),o=a(n),d=await window.crypto.subtle.decrypt({name:"AES-GCM",iv:o},l,c);return new TextDecoder().decode(d)}async function vt(s,t,n,r){try{const l=await(await fetch(M(`/runs/${n||B()}/${s}/instances.json`),{signal:t})).json();if(J(s)&&r){const o=await(await fetch(M(`/runs/${n||B()}/${s}/encryption_data.json`),{signal:t})).json();for(const d of l){const h=o[d.input.text];h&&(d.input.text="encrypted",d.input.text=await xe(h.ciphertext,h.key,h.iv,h.tag));for(const f of d.references){const g=o[f.output.text];g&&(f.output.text=await xe(g.ciphertext,g.key,g.iv,g.tag))}}}return l}catch(a){return a instanceof Error&&a.name!=="AbortError"&&console.log(a),[]}}async function yt(s,t,n,r){const a=h=>Uint8Array.from(atob(h),f=>f.charCodeAt(0)),l=await window.crypto.subtle.importKey("raw",a(t),"AES-GCM",!0,["decrypt"]),c=new Uint8Array([...a(s),...a(r)]),o=a(n),d=await window.crypto.subtle.decrypt({name:"AES-GCM",iv:o},l,c);return new TextDecoder().decode(d)}async function Nt(s,t,n,r){try{const l=await(await fetch(M(`/runs/${n||B()}/${s}/display_predictions.json`),{signal:t})).json();if(J(s)&&r){const o=await(await fetch(M(`/runs/${n||B()}/${s}/encryption_data.json`),{signal:t})).json();for(const d of l){const h=d.predicted_text,f=o[h];if(f)try{d.predicted_text=await yt(f.ciphertext,f.key,f.iv,f.tag)}catch(g){console.error(`Failed to decrypt predicted_text for instance_id: ${d.instance_id}`,g)}}}return l}catch(a){return a instanceof Error&&a.name==="AbortError"&&console.log(a),[]}}async function At(s,t,n,r){const a=h=>Uint8Array.from(atob(h),f=>f.charCodeAt(0)),l=await 
window.crypto.subtle.importKey("raw",a(t),"AES-GCM",!0,["decrypt"]),c=new Uint8Array([...a(s),...a(r)]),o=a(n),d=await window.crypto.subtle.decrypt({name:"AES-GCM",iv:o},l,c);return new TextDecoder().decode(d)}async function Et(s,t,n,r){try{const l=await(await fetch(M(`/runs/${n||B()}/${s}/display_requests.json`),{signal:t})).json();if(J(s)&&r){const o=await(await fetch(M(`/runs/${n||B()}/${s}/encryption_data.json`),{signal:t})).json();for(const d of l){const h=d.request.prompt,f=o[h];if(f)try{d.request.prompt=await At(f.ciphertext,f.key,f.iv,f.tag)}catch(g){console.error(`Failed to decrypt prompt for instance_id: ${d.instance_id}`,g)}}}return l}catch(a){return a instanceof Error&&a.name!=="AbortError"&&console.log(a),[]}}const q=10;function Mt({runName:s,suite:t,metricFieldMap:n,userAgreed:r}){const[a,l]=ee(),[c,o]=i.useState([]),[d,h]=i.useState(),[f,g]=i.useState(),[y,b]=i.useState(1);i.useEffect(()=>{const u=new AbortController;async function C(){const m=u.signal,[w,x,E]=await Promise.all([vt(s,m,t,r),Nt(s,m,t,r),Et(s,m,t,r)]);o(w);const R={};E.forEach(N=>{var V;const I=N.instance_id,T=((V=N.perturbation)==null?void 0:V.name)||"";R[I]===void 0&&(R[I]={}),R[I][T]===void 0&&(R[I][T]=[]),R[I][T].push(N)}),g(R);const v={};x.forEach(N=>{var V;const I=N.instance_id,T=((V=N.perturbation)==null?void 0:V.name)||"";v[I]===void 0&&(v[I]={}),v[I][T]===void 0&&(v[I][T]=[]),v[I][T].push(N)}),h(v)}return C(),()=>u.abort()},[s,t,r]);const A=c.slice((y-1)*q,(y-1)*q+q),p=Math.ceil(c.length/q);i.useEffect(()=>{const u=a.get("instance");if(u&&!window.helmHasScrolledToInstance&&A.length>0){if(A.findIndex(m=>m.id===u)===-1)return;requestAnimationFrame(()=>{const m=document.getElementById(`instance-${u}`);m&&m.scrollIntoView({behavior:"smooth"})}),window.helmHasScrolledToInstance=!0}},[a,y,l,A]);const L=u=>u.perturbation===void 0?`Instance id: ${u.id} [split: ${u.split}]`:`Instance id: ${u.id} [split: ${u.split}][perturbation: ${u.perturbation.name}]`;return d===void 0||f===void 
0?e.jsx(k,{}):e.jsxs(e.Fragment,{children:[e.jsx("div",{className:"grid gap-8",children:A.map((u,C)=>{var m,w;return e.jsxs("div",{id:"instance-"+u.id,className:"border p-4",children:[e.jsxs("div",{className:"flex items-center justify-between",children:[e.jsx("h3",{className:"text-xl mb-4",children:L(u)}),e.jsx("button",{className:"btn btn-sm normal-case px-2 py-1",onClick:()=>{const x=window.location.href+(window.location.href.includes("?")?"&instance=":"?instance=")+u.id;navigator.clipboard.writeText(x)},children:"Copy Link"})]}),e.jsx(wt,{instance:u,requests:f[u.id][((m=u.perturbation)==null?void 0:m.name)||""],predictions:d[u.id][((w=u.perturbation)==null?void 0:w.name)||""],metricFieldMap:n},`${u.id}-${C}`)]})})}),e.jsx(le,{className:"flex justify-center my-8",onNextPage:()=>{const u=Math.min(y+1,p);b(u),a.set("instancesPage",String(u)),l(a)},onPrevPage:()=>{const u=Math.max(y-1,1);b(u),a.set("instancesPage",String(u)),l(a)},currentPage:y,totalPages:p})]})}async function Rt(s,t,n){try{return await(await fetch(M(`/runs/${n||B()}/${s}/stats.json`),{signal:t})).json()}catch(r){return r instanceof Error&&r.name!=="AbortError"&&console.log(r),[]}}function It({stat:s,metricFieldMap:t}){const n=`${s.name.split!==void 0?` on ${s.name.split}`:""}${s.name.sub_split!==void 0?`/${s.name.sub_split}`:""}${s.name.perturbation!==void 0?` with ${s.name.perturbation.name}`:" original"}`;return t[s.name.name]?e.jsxs("span",{title:t[s.name.name].description,children:[e.jsx("strong",{children:t[s.name.name].display_name||s.name.name}),n]}):e.jsxs("span",{children:[e.jsx("strong",{children:s.name.name}),n]})}const G=50,ue=["name","mean","min","max","sum","sum_squared","variance","stddev"];function St({runName:s,suite:t,metricFieldMap:n}){const[r,a]=ee(),[l,c]=i.useState(),[o,d]=i.useState(1),[h,f]=i.useState("");if(i.useEffect(()=>{const b=new AbortController;async function A(){const p=b.signal,L=await Rt(s,p,t);c(L)}return A(),()=>b.abort()},[s,t]),l===void 0||l.length===0)return 
e.jsx(k,{});const g=Math.floor(l.length/G),y=l.slice((o-1)*G,(o-1)*G+G);return e.jsxs("div",{children:[e.jsx("div",{className:"flex justify-start my-4",children:e.jsx("input",{type:"text",className:"input input-bordered w-full max-w-xs",placeholder:"Search for a metric",onChange:b=>f(b.target.value)})}),e.jsx("div",{className:"overflow-x-auto",children:e.jsxs("table",{className:"table",children:[e.jsx("thead",{children:e.jsx("tr",{children:ue.map(b=>e.jsx("th",{children:b},b))})}),e.jsx("tbody",{children:y.filter(b=>!h||b.name.name.toLowerCase().includes(h.toLowerCase())).map(b=>e.jsx("tr",{children:ue.map(A=>{const p=b[A];return typeof p=="number"?e.jsx("td",{children:p}):e.jsx("td",{children:e.jsx(It,{stat:b,metricFieldMap:n})},A)})}))})]})}),e.jsx(le,{className:"flex justify-center my-8",onNextPage:()=>{const b=Math.min(o+1,g);d(b),r.set("metricsPage",String(b)),a(r)},onPrevPage:()=>{const b=Math.max(o-1,1);d(b),r.set("metricsPage",String(b)),a(r)},currentPage:o,totalPages:g})]})}function Lt({runName:s,onAgree:t}){const n=i.useRef(null),r=()=>{n.current!==null&&n.current.value.trim()==="Yes, I agree"?t():alert("Please type 'Yes, I agree' exactly.")},a=s.includes("gpqa")?e.jsx(kt,{}):s.includes("ewok")?e.jsx(Ct,{}):null;return e.jsxs("div",{className:"mb-8",children:[a,e.jsxs("p",{className:"mb-4",children:["If you agree to this condition, please type"," ",e.jsx("strong",{children:'"Yes, I agree"'})," in the box below and then click"," ",e.jsx("strong",{children:"Decrypt"}),"."]}),e.jsxs("div",{className:"flex gap-2 mt-2",children:[e.jsx("input",{type:"text",ref:n,className:"input input-bordered",placeholder:'Type "Yes, I agree"'}),e.jsx("button",{onClick:r,className:"btn btn-primary",children:"Decrypt"})]}),e.jsx("hr",{className:"my-4"})]})}function kt(){return e.jsxs("div",{children:[e.jsx("p",{className:"mb-4",children:"The GPQA dataset instances are encrypted by default to comply with the following request:"}),e.jsx("blockquote",{className:"italic border-l-4 
border-gray-300 pl-4 text-gray-700 mb-4",children:"“We ask that you do not reveal examples from this dataset in plain text or images online, to minimize the risk of these instances being included in foundation model training corpora.”"})]})}function Ct(){return e.jsxs("div",{children:[e.jsx("p",{className:"mb-4",children:"The EWoK dataset instances are encrypted by default to comply with the following request:"}),e.jsx("blockquote",{className:"italic border-l-4 border-gray-300 pl-4 text-gray-700 mb-4",children:"“PLEASE DO NOT distribute any of the EWoK materials or derivatives publicly in plain-text! Any materials should appear in password-protected ZIP files or behind gated authentication mechanisms such as Huggingface datasets.”"})]})}function Tt(){const{runName:s}=pe(),[t,n]=i.useState(0),[r,a]=i.useState(),[l,c]=i.useState(),[o,d]=i.useState(),[h,f]=i.useState(),[g,y]=i.useState({}),[b,A]=i.useState({}),[p,L]=i.useState(!1);return i.useEffect(()=>{const u=new AbortController;async function C(){const m=u.signal;if(s===void 0)return()=>u.abort();const w=window.SUITE?window.SUITE:it(await lt(m),s);c(w);const[x,E,R]=await Promise.all([rt(s,m,w),nt(s,m,w),D(m)]);a(x),f(E),A(R.metrics.reduce((v,N)=>(v[N.name]=N,v),{})),y(R.adapter.reduce((v,N)=>(v[N.name]=N,v),{})),d(R.models.find(v=>v.name===(x==null?void 0:x.adapter_spec.model)))}return C(),()=>u.abort()},[s]),r===void 0||h===void 0||s===void 0||l===void 0||b===void 0?e.jsx(k,{}):e.jsxs(e.Fragment,{children:[e.jsx("div",{className:"flex justify-between gap-8 mb-12",children:e.jsxs("div",{children:[e.jsxs("h1",{className:"text-3xl flex items-center",children:[h.name,e.jsx("a",{href:"/#/groups/"+h.name,children:e.jsx(Rs,{className:"w-6 h-6 ml-2"})})]}),e.jsx("h3",{className:"text-xl",children:e.jsx(_,{value:h.description})}),e.jsx("h1",{className:"text-3xl mt-2",children:r.adapter_spec.model}),e.jsx("h3",{className:"text-xl",children:e.jsx(_,{value:(o==null?void 0:o.description)||""})}),e.jsx("div",{className:"mt-2 
flex gap-2",children:h.tags.map(u=>e.jsx(U,{size:"xs",color:"gray",children:e.jsx("span",{className:"text text-md",children:u})}))})]})}),e.jsxs(z,{children:[e.jsxs("div",{className:"flex justify-between",children:[e.jsx("h3",{className:"text-lg mb-1",children:"Adapter Specification"}),e.jsxs("div",{className:"flex gap-2",children:[e.jsx(As,{className:"w-6 h-6 mr-1 text text-primary"}),e.jsx("a",{className:"link link-primary link-hover",href:Ie(r.name,l),download:"true",target:"_blank",children:"Spec JSON"}),e.jsx("a",{className:"link link-primary link-hover",href:at(r.name,l),download:"true",target:"_blank",children:"Full JSON"})]})]}),e.jsx("div",{children:e.jsx(se,{className:"grid md:grid-cols-2 lg:grid-cols-3 gap-x-8",children:Object.entries(r.adapter_spec).map(([u,C],m)=>e.jsxs(te,{className:m<3?"!border-0":"",children:[e.jsx("strong",{className:"mr-1",title:g[u]?g[u].description:void 0,children:`${u}: `}),e.jsx("span",{className:"overflow-x-auto",children:C})]}))})})]}),e.jsx("div",{className:"mt-16 mb-8",children:e.jsxs(re,{children:[e.jsx(Q,{size:"lg",active:t===0,onClick:()=>n(0),children:"Instances + Predictions"}),e.jsx(Q,{size:"lg",active:t===1,onClick:()=>n(1),children:"All metrics"})]})}),t===0&&J(s)&&!p&&e.jsx(Lt,{runName:s,onAgree:()=>L(!0)}),t===0?e.jsx(Mt,{runName:s,suite:l,metricFieldMap:b,userAgreed:p},p?"instances-agreed":"instances-not-agreed"):e.jsx(St,{runName:s,suite:l,metricFieldMap:b})]})}function Pt(){const[s,t]=i.useState(void 0),[n,r]=i.useState(void 0),[a,l]=i.useState(void 0);if(i.useEffect(()=>{const o=new AbortController;async function d(){const h=D(o.signal),f=Ee(o.signal),g=await h;t(g);const y=await f,b=[];y.forEach(A=>{A.rows.forEach(p=>{b.push({title:String(p[0].value),name:p[0].href.replace("?group=","")})})}),r(b)}return d(),()=>o.abort()},[]),s===void 0||n===void 0)return e.jsx(k,{});if(n.length===0)return e.jsxs(e.Fragment,{children:[e.jsx(P,{title:"HELM Leaderboard",subtitle:"The HELM leaderboard shows how the various 
models perform across different scenarios and metrics.",markdown:!0}),e.jsx("div",{className:"divider"}),e.jsx("p",{className:"text-center mt-8",children:"Group currently has no results."})]});const c=a!==void 0?a:n[0].name;return e.jsxs(e.Fragment,{children:[e.jsxs("div",{className:"flex flex-row justify-between",children:[e.jsx(P,{title:"HELM Leaderboard",subtitle:"The HELM leaderboard shows how the various models perform across different scenarios and metrics.",markdown:!0}),e.jsxs("div",{className:"w-64 pt-8",children:[e.jsx("label",{htmlFor:"group",className:"block text-sm font-medium text-gray-700",children:"Select a group:"}),e.jsx("select",{id:"group",name:"group",onChange:o=>l(o.target.value),className:"mt-1 block w-full pl-3 pr-10 py-2 text-base border-gray-300 focus:outline-none focus:ring focus:border-blue-300 rounded-md",children:n.map((o,d)=>e.jsx("option",{value:o.name,children:o.title},d))})]})]}),e.jsx(Re,{schema:s,runGroupName:c},c)]})}const Bt=""+new URL("instruct-flowchart-48854f7c.svg",import.meta.url).href,Dt=""+new URL("instruct-graph-0a57d7d2.svg",import.meta.url).href;function Ht(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 font-bold text-center",children:"HELM Instruct: A Multidimensional Instruction Following Evaluation Framework with Absolute Ratings"}),e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center gap-2 sm:gap-8 md:gap-32 my-8",children:[e.jsx("a",{className:"px-10 btn rounded-md",href:"https://crfm.stanford.edu/2024/02/18/helm-instruct.html",children:"Blog Post"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"})]}),e.jsxs("p",{children:["We introduce ",e.jsx("em",{children:"HELM Instruct"}),", a multidimensional evaluation framework for instruction-following LLMs with absolute ratings. The framework takes an instruction, a model, an evaluator, and a criterion to generate a score. 
In our study, we use HELM Instruct to compare 4 instruction-following models on 7 scenarios based on 4 Human/LM evaluators and 5 criteria. Check out the blog post for more details."]}),e.jsxs("div",{className:"grid my-16 grid-cols-1 md:mx-32 md:grid-cols-2 md:gap-2",children:[e.jsx("img",{src:Bt,alt:"Evaluation flowchart",className:"mx-auto block",sizes:"100vw"}),e.jsx("img",{src:Dt,alt:"7 scenarios, 4 models, 4 evaluators and 5 criteria",className:"mx-auto block",sizes:"100vw"})]}),e.jsxs("table",{className:"rounded-lg shadow-md table",children:[e.jsx("thead",{children:e.jsxs("tr",{children:[e.jsx("th",{children:"Model"}),e.jsx("th",{children:"Average"}),e.jsx("th",{children:"Helpfulness"}),e.jsx("th",{children:"Understandability"}),e.jsx("th",{children:"Completeness"}),e.jsx("th",{children:"Conciseness"}),e.jsx("th",{children:"Harmlessness"})]})}),e.jsxs("tbody",{children:[e.jsxs("tr",{children:[e.jsx("td",{children:"openai_gpt-4-0314"}),e.jsx("td",{children:"4.63"}),e.jsx("td",{children:"4.42"}),e.jsx("td",{children:"4.85"}),e.jsx("td",{children:"4.50"}),e.jsx("td",{children:"4.42"}),e.jsx("td",{children:"4.95"})]}),e.jsxs("tr",{children:[e.jsx("td",{children:"openai_gpt-3.5-turbo-0613"}),e.jsx("td",{children:"4.60"}),e.jsx("td",{children:"4.34"}),e.jsx("td",{children:"4.86"}),e.jsx("td",{children:"4.42"}),e.jsx("td",{children:"4.41"}),e.jsx("td",{children:"4.97"})]}),e.jsxs("tr",{children:[e.jsx("td",{children:"anthropic_claude-v1.3"}),e.jsx("td",{children:"4.56"}),e.jsx("td",{children:"4.25"}),e.jsx("td",{children:"4.87"}),e.jsx("td",{children:"4.32"}),e.jsx("td",{children:"4.40"}),e.jsx("td",{children:"4.97"})]}),e.jsxs("tr",{children:[e.jsx("td",{children:"cohere_command-xlarge-beta"}),e.jsx("td",{children:"4.31"}),e.jsx("td",{children:"3.90"}),e.jsx("td",{children:"4.73"}),e.jsx("td",{children:"3.88"}),e.jsx("td",{children:"4.31"}),e.jsx("td",{children:"4.72"})]})]})]})]})}function Ce({models:s}){return 
e.jsxs("section",{children:[e.jsxs("h3",{className:"text-3xl",children:[s.length," models"]}),e.jsx("ul",{children:s.map((t,n)=>t.todo?e.jsxs("li",{className:"text-slate-300 mt-1",children:[t.creator_organization," / ",t.display_name]},n):e.jsx(j,{to:"models",children:e.jsxs("li",{className:"text-black mt-1",children:[t.creator_organization," / ",t.display_name]},n)}))})]})}function Te({runGroups:s}){const t=new Map(s.filter(a=>a.metric_groups!==void 0&&(a.subgroups===void 0||a.subgroups.length===0)).map(a=>[a.name,a])),n=new Set,r=[];return s.forEach(a=>{const l=a.subgroups?a.subgroups:[],c=[];l.forEach(o=>{const d=t.get(o);d&&(c.push(d),n.add(d.name))}),c.length>0&&r.push([a,c])}),e.jsxs("section",{children:[e.jsxs("h3",{className:"text-3xl",children:[n.size," scenarios"]}),e.jsx("ul",{children:r.map(([a,l])=>e.jsxs("li",{className:"my-3",children:[e.jsx(j,{className:"text-black",to:"groups/"+a.name,children:e.jsx("h2",{children:a.display_name})}),e.jsx("ul",{className:"list-disc list-inside",children:l.map(c=>c.todo?e.jsx("li",{className:`${c.todo?"ml-4 text-slate-300":"ml-4"}`,children:c.display_name},c.name):e.jsx(j,{className:"text-black",to:"groups/"+c.name,children:e.jsx("li",{className:`${c.todo?"ml-4 text-slate-300":"ml-4"}`,children:c.display_name},c.name)}))})]},a.name))})]})}const Pe=""+new URL("helmhero-28e90f4d.png",import.meta.url).href;function S({runGroupName:s=void 0,tableIndexToDisplay:t=0,numRowsToDisplay:n=10,sortColumnIndex:r=1}){const[a,l]=i.useState(void 0),[c,o]=i.useState(void 0);return i.useEffect(()=>{const d=new AbortController;async function h(){const f=await D(d.signal);l(f);const g=f.run_groups;if(g.length===0)return;const y=s||g[0].name,b=await Me(y,d.signal);o(b[t])}return h(),()=>d.abort()},[s,t]),a===void 0||c===void 0?e.jsx(k,{}):e.jsx("div",{className:"rounded-2xl overflow-hidden border-2 bg-white p-1 mx-2 my-0 
overflow-x-auto",style:{overflow:"auto",justifyContent:"space-between"},children:e.jsx(ae,{schema:a,groupTable:c,numRowsToDisplay:n,sortColumnIndex:r,displayColumnIndexes:[0,1],sortable:!1,miniStyle:!0})})}function Ut(){return e.jsxs("div",{className:"flex flex-col px-4 sm:px-6 py-100 sm:py-10 sm:mb-96 md:mb-96 lg:mb-0 xl:mb-0 2xl:mb-0",children:[e.jsx("div",{className:"flex flex-col text-center mb-10 justify-start",children:e.jsx("h1",{className:"text-3xl sm:text-4xl mb-3 sm:mb-4 mx-2 mt-2",children:e.jsx("strong",{children:"A holistic framework for evaluating foundation models."})})}),e.jsxs("div",{className:"flex flex-col md:flex-col lg:flex-row lg:justify-center",children:[e.jsx("div",{className:"w-full lg:w-1/2 flex justify-center mb-4 lg:mb-0 h-full py-10",children:e.jsx("img",{src:Pe,alt:"HELM Hero",className:"object-contain w-96"})}),e.jsxs("div",{className:"py-2 rounded-xl bg-gray-100 h-full",children:[e.jsx(S,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(j,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-2 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const Be=""+new URL("ai21-0eb91ec3.png",import.meta.url).href,De=""+new URL("aisingapore-6dfc9acf.png",import.meta.url).href,He=""+new URL("aleph-alpha-7ce10034.png",import.meta.url).href,Ue=""+new URL("anthropic-70d8bc39.png",import.meta.url).href,Oe=""+new URL("bigscience-7f0400c0.png",import.meta.url).href,Fe=""+new URL("cohere-3550c6cb.png",import.meta.url).href,_e=""+new URL("eleutherai-b9451114.png",import.meta.url).href,Ve=""+new 
URL("google-06d997ad.png",import.meta.url).href,ie="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAoAAAAEBCAMAAADfF+TxAAAABGdBTUEAALGPC/xhBQAAACBjSFJNAAB6JgAAgIQAAPoAAACA6AAAdTAAAOpgAAA6mAAAF3CculE8AAAC0FBMVEUAAAAgcMIfcMEfcMEfccEfcMEfcMEfb8Efb8AfccAfcMEfbsEfbMEAAP8gcMIggL8fcMEfcMEfcMAfcMEeccMecMEecMIfcMIfcMAeccEndsQfbsEfcMAfb8EfccAeb8EXdLkgccEfcMEfcMEecMEhccEAgP8fb8EAVaoeb8IfcMIeccEaZswebsIfcMEgcMEac78fccIeacMfcMEfcMEecMEcccYrgNUfcMIeb8EfcMIfccEfcMEeb8AfcMIjdMUfcMEfcMEjcsEcccYecMAeccEfb8IfcMEgccIfcMEkbcgfb8IfccEfcMEdbb0fcMIkbcIfcMEba7wfcMIfcMEfcMEfcMEfb8IgccAfccAgcMEfcMEgcMEfb8EfccAfcMEfccAeb8MgcMEgc78ecMIfccEfcMEhb8IgccEfb8IfcMEecMEfcMIdcb8fcMEdbMQfcMEfcMEVar8fb8IfcMEgccIfcMAfcMEgcL8gcMEfcMEfcMEgccIicsMfcMIeb8AecMEfcMIfccEeb8AfcMIgccEgcMAgcL8gcL8fb8EfcMEfb8EfccEfcMEAgL8fcMEgb8Ifb8Igdb8gccIfcMIfcMEfb8Egb8EcccEfb8EeccAfcMEgccEfcMEdb8AgcMEgb8EzZswkbbYecMAgcr8hccIgcMEfccEgcMIgcMEfb8Edbr8fccEdb8Efb8Eccb0fb8AfcMEgccIfcMEfcMAhb8EecMEfcMEfcMAfcMIfcMAgcMEfcMEecMAfcMEid7sfcsAecb8ib8QfcMEfcMEfb8EfcMEeb78ecMEfcMEfcMAhb7wfcMAfcsEfccAfccEfcMEeb8AfcMAccb8fcMEdccAgcMIgcL8bbb8fb8EfcMEfcMEeb8EfccEgcMEecMAeb8EecMAeb8AebsMgccIeccQgcb8gcMEhb8QfcMIfccEecMEebcIfb8EfcMH////Dl82sAAAA7nRSTlMAcM/y7uPRvqeKa0ohAYkI8ciSWyLiVOCbVg06q/ezXguR9vCgRgLnA1zcbwpD6oAUmhH59Y8SBmR3S7zZVaIW+7YdCW1mU8Sq/g51zJ0jwxX6E31itN23YVGy5bG1o8t6TMooO9X8LoiF5tq7NP0ai/QMbN9YOaQgkO3kPyaWZbAyc36/eFlAEJXvjIN7BOlHnhhPpvPXVy2uTYRf1jWZZwUHnzg2ocV50qUsrE7HG47oaMCCPpjhco1JwttdlA9BRB5CdHy9PMnT2BevMWqcKUXQJOw9UDAcY63rh96phpd2bjNxK0i6JxlaqCrOBCMOsQAAAAFiS0dE77iw4qEAAAAHdElNRQfoBhEVHhJsM9kZAAAK9ElEQVR42u3d+Z/XRR3A8UEOS/gK2QqILCIL666AEATLEYfIsV/ABBQhFYSVgCXkKpCIU6UAFREKCwrkMlSSzMoslIRKDeUyCcUOOuyev6EUEWH3+92d2fl85j3T6/Wz85mZz/e5DxXmO6u0x+qpC7tIS6h+g4aNLv7Yxy9p3CSjHFQvlUVfqtzWtFkqy1YAzEvxE5d9siAMgJdn3AJsrgEopBYtr2glH6C+0i3A1gAUVGGbq9pKB3i1U3+t2gFQVkXtOxSLBnhNiUuApRqA4rq2YyfBAHVnlwDbAFBi13XpKhfgpxz669YdgDL7dI+eUgE2KHMHsJcGoNQa9C6TCVD3cQewLwAF95l+MgH2d+ZvwEAAiu76QRIB1r/BFcDBGoCyKxoiEKAe6grgMACKr7ynPIDljvwNygJQfg07iwNYONwNwBEagAGU7SINoC
51A3AkAMPoxjJhANs48fdZDcBAummULICjx7gAeHOaAFt47JYqWx97a2CNq8VfzY1PcT2fcwHwthQXrIiIiIiIiIiIiIiIiIiIiIiIiIgo/G73WK8qqwnuQOr53VHtKx6f+jo62GqYkPpSOZLvtGo/+XqpL2Oipb+CbOpLBaDT7syIAGh7RcIkDcDAmywCoG5uB7ARAEMHWCEDYGsrf3dpAIYOUE8RAdDuioQuAAwfYLkIgHZXJHwegOEDnNpEBECbKxKmaQCGD1BPFwHQ5oqESgDGAHBGiQSAupc5wC8AMAaAVf8w2gvAmcb+7tYAjAJgfxEAZ802BTgHgHEAnDtPAkD9RUN/mS8BMA6Aer4IgDcZAlygARgJwHtEAMwuNAP4ZQDGAnBRiQSAeoSRv+KvADAWgBf+5l5PAEcaAVzs6V2pgR4bHCnAShEA9RITgEt9ASQiIiIiIiIiIiIiIiIiIiIiIiKi/4uWVVi0PKnVrLBZTcU8m6nuHWnbfaZT3T8yoe7sWdPUK5OaeuRXHX3kVpcmtUgK4NesDt6uspmqh/U539WmU61J7MhxTVckFDdM8SsXAEwJ4ANyAPatYeYEvwwCQG8AH5QDsKYrEpYCMEKAD8kBqAfnnXjtwwCMEOAQQQCH5Z14nQZghAAnCwKY/4qERwAYI8AVggDmvSKhbBEAYwS4XhLAfFckbNAAjBBgYYkkgPmuSPg6AGMEeJESBfDmnNN2+gYAYwQ4URbAjTmnfVQDMEaAK2UB1N/MNe23ABglwE3CAG7OMevwdgCMEWB2gDCAt2aqn/XbGoAxAvyOEgZQb6l+1usBGCXAreIAVn86p+toAEYJsFQcwNuLq5v0MR0GwLYDLBqTFMBtNqsZkLGZqlM3u9aaT7WqW7JlXO6vtm1XREREREREREREREREREREREREJKgNOyyak9RqlmyV207rXfVIY3nnLo3blcrr6OzoI4/hRHQqbbQ/gWl1Ijpr+s8XnJ3u8aSn4ki+h2ZNUekC/K7pgEkfzNa0yPRHaw0A5QNcrVIGuNv0Yo0nPphtqPHlHgCUD3DGqrQB1nvSdMRTZ2bbYzpuCQDFA8xOVqkD/J7dFQmtTC9EGKkAKB7gpSp9gMbXS565IuEq04lGAFA8wKe3eQCobrS6IsH0QoTsQgBKB3jdXuUDYAfTId9XFhciDFMAFA5w1jrlBWDJM6b/p1Si1A9M5xkMQOkA6/yqbT/g1aZjFiv1rOlP12wACgf4Q+UL4I9Mx/xYzZ5qOKSvAqBsgFuVN4CZ5wzHLCr7iek0ywAoG+Ada/0BVM+bDvppheGA0WMAKBrg2KbKI8BdpoN+ZnquYIICoGSAT25TPgGqfUlv8AUASgb4YrHyC7Ay4Q0W7gegYIDTHe3K/gOelvAO9yjvABf+3KKdSQHs9pKgdjn7sbKZ/cD7Qw8mu8WFdV0fERERERERERERERERERERERERCelgpUW/SGo1BdMFNanM0a5+afOOrzw3vqTS/eZecri+OsWJ6Nz9ap6bXRXV9cRxX/d7e9nl+gCYUK/0lAFwmfOdNSwGYAAA9auzRQDs1t31xp5XAAwBoP71IQkA1VjX+9oFwDAA6idWSQD4guNdPZcBYCAA9f3bBQA0vnfX6AunAJQMUL9W4h+g+c3j+XsdgOEA1JcJANjP6Y6uVQAMCKAe7x9g02YuN/QgAIMCWPi6d4DqHpcbegqAQQHU+/Z7B7jJ4XY2KgCGBVAvz/gGuG2Ru93cB8DQAOrDvgGqI+42cxSAwQGcW+Ab4DFnezmuABgcQD3ON0Dj3xqSs90ADBDgmctsPQJUL7rayRsADBHgbw55Bvimo41UKFEAp5VadCIpgKP6yW2J9a6usHnHd134lOLmbvbx24TWR0RERERERERERERERERERERERD7bctii0qRWM9tmNYetbjJdMMSuk+ZTvWWzqWnVP+vQEPsap7A+42I4EW11gVUPywPFFieBXZ44blpkfxZ6dw
rrA2DSAN/2C1DtsPaXbQLACACeyvgF2MEa4DAFwAgAVvlad8oAS66xXfglAIwCYKlfgOoyy3UPPATAKAC+4xngAst1X68AGAXA2zwDVPtcf7MegEEB/J1vgJutll04HIBxAMw29QzwqNWyf68AGAdAXeAZoDpu87wTAIwF4Mu+Afa2eNya7QCMBWA/3wAHZc0f11IBMBaAHX0DVDeZP+4gAKMBeJ93gIeNnzajBIDRAPyDd4BdR5s+7XkFwGgA9vAOUE0wfdrdAIwH4B/9Ayw1fNg+JRbglNMWPZYUwEE2qzlt9fssO/e2rbHpVH+y2dSf8z1xnuGS16W8PiIiIiIiIiIiIiIiIiIiIiIiInLaBo9V/fbssX4xtOX8TRWk9/6qdKgWy+1QCyaNk9qD0h6remL2Ih1Dfzl/U/XSe39VGt6u5scsrQXAoqT2AMAE+qscgOptN9+iB2BAXZMRBLB5jU95LgPAuAC2V4IA1nxjea2+QgrAgHpUEkDVvqanPAXAuADObSsK4LoaHnJcATAugBOVKIAlz+R/yGkARgbwXVkAa7ixPLsQgHEBfHitMID5byx/VgEwLoCVShjA/DeWDwFgXABHnxQHMN+N5d3HADAugP2VOID5bixfrgAYF8At8gCqRrkf0Q+AcQGcqQQCXJnzCc1uAGBUALM7JQI8kPPG8h0KgFEBLFcSAea+sfwYAKMCOPWoTICHczyg6p9Zpg/wao8trrLPjtOD7q1qP7zF6b2/HHXNsd75tT44/05Se+A7CURERERERERERERERERERERERG56w2P7q6zmZEFoDa/5Fe9P7/3lrdVHln3AkEmTpPbAiei6Na4WAP2fiD7TR7+e2doQIEfyZXZ1iQoHoHr13NChAIwA4MDDtfrwxAB86MOR9W8AYPgAT9XyOxliABZ8+PXMxxUAgwf4t70qMIDqlbMj3wVg6ADbdcyo4ACeXcmpbQAMHGDF380/dv8A/zH1zMClCoBBA6y/cpsKEaC6+MzANwEYMsBZ/ddb/YtPAMBl74+7vBiAAQN8eqflf3kJANit+3vj5igABgvwlRNKhQtQtXlv3AoABgowO3OTxV+3SwL4z/8N26cAGCTAwpZ3WZ33kASw7VytuwAwQIBTh3XsanngSBJA9S+t/w3A0ADOGtZxvbJOFMAN+j8KgEEBvPXI/FF1OnIpCmDZohEADAXgrH2vbd5wqM5nfkUBVEcWygK4x2N9quzz/BtSl2416slyo8bOzNWe8kce2H3LsSXb3Rw675Pe+6tFe6328HhSe/gvDuj5ccZcNDsAAAAldEVYdGRhdGU6Y3JlYXRlADIwMjQtMDYtMTdUMjE6MzA6MTgrMDA6MDDt4fgHAAAAJXRFWHRkYXRlOm1vZGlmeQAyMDI0LTA2LTE3VDIxOjMwOjE4KzAwOjAwnLxAuwAAAABJRU5ErkJggg==",ze=""+new URL("meta-5580e9f1.png",import.meta.url).href,We=""+new URL("microsoft-f5ee5016.png",import.meta.url).href,qe=""+new URL("mistral-18e1be23.png",import.meta.url).href,Ge=""+new URL("nvidia-86fa75c1.png",import.meta.url).href,Ke=""+new URL("openai-3f8653e4.png",import.meta.url).href,Qe=""+new URL("tii-24de195c.png",import.meta.url).href,$e=""+new URL("together-a665a35b.png",import.meta.url).href,Je=""+new 
URL("tsinghua-keg-97d4b395.png",import.meta.url).href,Ye="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAASYAAABfCAYAAABFnmpnAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAlhSURBVHgB7d3xddtGEgbw7+np/8gVeFLBqYOwg/gqOKaC+CqIU0GcCqxUcOnAvAruUoEnFZxcwUUjEDYMA8QCmN0dgN/vPUggAHFpejGcXSyWQON/T8v/HZa3SOdR3vvEsg5O5dnyZuD5xfH5lywfzu/FL0/L8fx61njj+Lr63js9d87lPYZFfb1DDpVf0wd8rpOvMLNO3p5/Pz4td1jvm8TjPMoygvIU8ch5OXS2nZ6Wn8+/iUoTfK6Tr8/bHtDUSZ3645vz78kDE71IPM4rMKUS+FFswwHNJ9Y71AngRH1HNJmUZVEXY0AbmP6Ej5eJxwl8iPNxKR6xLUc0AUpAFINlUP/BhTrpnTGlZkKeGVNqMPSi2B4BgxPFIrhQJ70DkyQet+Wm3NYyppagqQil33uiMfK0/GtoRxuYPE+2lIov8PNtwjFeWdV/sW2Cpn1PFMU9Bq50e2dMJuXKnOentqCcrWZLXUd8efWOqLYf0TuPc2RMJTOYVAIfin34CURxWKLyurshR8YkCceUzphSjknhdfWytgPY10Sx/AOdOnnT2aHwced0TKqp7MuzrD005VpHEMVh5+mr9kE3MHmddKU7v6d4BibFfnwPoli+a1e6gekP+EjpPyrZlJvaP8eeMqZ7EMVyaFdyZExTt6UIfAnK2fpwgS77cGBwokgE56QlRx/TVMYk8DWVfQn87CljMgxMFI3Yj1p9TJ5KBSbF/jAwUTTPdbLGVTmBv0tZmlegVOxP6jQ1RKU8B6bbzgaFj7vzMpaBCcryCkwfUc7v+Pr9a/uEBH6YMVGqUnXy+cMyR2Bqn3wsMOVo6tlo87HBj16jzBXl/IrxCd4e0AxG81BjkOU/Z5R7hN+/9Tc0712KEn2JD0/Lv7EdrzF8jgmaG3G9PuTEftz2NnrNZGlX5sYCRY6TQS7s21tTziqIjUHy+HfVCExzrmwe4EcRazZPC0oP2D59Wn5AM7+SB7EfNwOFuD35iBz3yd0t3DeHIgb78Jhzcl9SIzDR/lh9VDjKFZhKBIrU5xT4iDRUwGswrCl9QzXtk1ez9KtxTMarg1cW7ltq7OTyDIIfEIeCKBbX4UZ7yZjGRpsL/ESaWSBS9kZkXOtkrsA0Nj4mV59G7vIURFRMPzB5RT2ZuT1XeXsNTKUn9iMq6WU/MHld7ZGR7bkypruZ2+cqObgyBZtyFI3CUa6MaSwgCPLIXZ4iFl7mp2gEfj4OBSaP4JQ7gxnyMmN5ilj2erWRyDzeDG2Ej5yBYsiLxNewhCIWZkwUjWudHApMCh9DgUKQjwxs83qzovXpCPxE6z+jbfobfDyfa0OByWu8jgxsyznK+C5x2xLRmjtelcCwI5083MOH2o+cTbkIE8YJfEQaXCnwu7nV6yosXbcDnFsnOZtyOQNFankeb1a0jMLzyyqZLZEHr6lpzPN9oDkzJknc5qXfTPSK4JGyCgtKR/jxvBmYro+dY7/At04+n2+3Yzsc9G8Tyd2063e2C3zU6By2ibeGZgv0fg9PIEpzGtiWo06OBiavjKkfKHIHplwZk6K8HP/hQ06gWgQ+fYV2IpdokgvyU1wITAof/UAhyOtu4vFSe+2HOYF9TDX9BJ/+QsF+/h9P7crNyAGK9XIFitTyBD4U+/QbiGL5VCfHApNHBJaJx976zZ8tN+VyU+xjvmnaD2vCndoHY4HJ62pNjkBxSbfDXeBjj82dn0EUy9+7D3JmTKYbKEoEphcjZa+xt0GI9tVQDyCKwz4otbshZx+T6U5CVmLSe+ms73Fw5VrWhn8NojgsKL3pb8ydMUlnvUTG
5N10VOyHZUpHEMVgMca+/PTN0M7bkT9S+OgGB0F+MrK+1B4yphOaT6UTiGI4ofmSTB07oGRgYsZU3gMG2u9EFVmW9Dsm6mTuply3X6nkVTmBj0izCixxRDNlyzuUyViJpti9dZN18lJg8ghO7VUyQRlteRzD9KUjmu+WfwWiGI5Py3uMzON0c+EPPQJTmzEJymjLY2D6mr0ndnPwPYhiEDTBSfo7LgUmj/E7awLEksDYlifwscfBlRacSjSriVJYXXzX33gpMHlM97EmUCwZfS69ctdS7I+AY5kolgN6My1cCkyK9dr71wTzLc3Y2jLX8upnW8KuWjyguayaY+T59yCa5wGf66TC3xczLdxeOFDhw66ULQkUel4E83wDn6acoh7LaLpXBAXNQDSvKUzvz0uOoEf79EPvscC3Th7QxInROb9bXtnCt1gemJa8BivP4/aXSP1LiuYqxq/wcwDRcoqmTp7g59CulMiYLCgtCRQWGKyfae5VpD33L719Wn6ED8+vgKL5TvCpY7W/F9AG8B7gw85168YoFpiWBIulfTxL+7T6Ig6uVCxr3g7hsIG67IbqB2yfZ3fApwRmqinnNWGcYD7FsuAo2PftKF5NTA4ZIA9WHxU+pF25wXSha1kUnHsSPGJdxsTANE1AFIu0K1OBSbHe0mbc0vJdv0OdiMqbCkwe/SwHzKe937nLu/QaiKiwEk25JRnTR8fyl2LGRFRJiabcEnr+XWv0NQceElVUImNa4nFkvUb5RFTYVGCqlTnoyHqN8omosKgZk3bWawx03PrMlUSbFrWP6bHya2BTjqiiqcBkFOVpZ52d30TX4dO5nhKYanc+M3vJg+8rRTMrMCnK0t7jGtnLNWRMDEwUVkpgKt0RrL3HHC5AdGUiNuX688soylIQUVVbaMqNbStZPhEVFDEwPSZuy6X2jIBEV48Z09c4VICosoh9TDqwrWQWw45vospSA1PJk3WoLEU5CiKq6jbxOAsWHtPVptDEbbkwY6IS7EtHBX7eYkd1NzUwWb+LoIzand8fQJTfq/Pi5R12FJhSmnKmVB+PztyeA2cWIKosNTApytCZ20uVT0QFRQtMY5lZqRRVQUTVpQamCIHh0j4vHFxJFEC0jOlx4T4vCiKqbksZ0x+oWz4RFbKVzm/DjInoSqQGJqPI7zFw+URUyJzAVDtjKVE+B1cSBTAnMJXo46mdMV3T4EpmhxSNtiuRMiZduX+taztRGZgorEh9TDqxP/eJxHmYiIKI1sc0VX7O18DBlURBzAlMuTMKTTgmZ2BSEFEIkTKmlI7nnMGRfS5EQWypj8nkbG6xj4koiNSJ4lon5JswThOOseDxHfJYmzEpyvjTqayp+xIV6ynW8Xod7XOtpdgOz/duisLHp8TjLzZXn0YvVQJMAAAAAElFTkSuQmCC",Xe=""+new URL("yandex-38e09d70.png",import.meta.url).href,Ze=""+new URL("01-694cb9b7.png",import.meta.url).href,Ot=[Be,De,He,Ue,Oe,Fe,_e,Ve,ie,ze,We,qe,Ge,Ke,Qe,$e,Je,Ye,Xe,Ze];function fe(){const[s,t]=i.useState(void 0);return i.useEffect(()=>{const n=new AbortController;async function r(){const a=await D(n.signal);t(a)}return r(),()=>n.abort()},[]),s?e.jsxs(e.Fragment,{children:[e.jsx(Ut,{}),e.jsxs("div",{className:"mx-auto text-lg px-16",children:[e.jsxs("div",{className:"container mb-12 mx-auto text-lg px-16",children:[e.jsxs("div",{className:"flex flex-col 
sm:flex-row justify-center mt-10 mb-10 flex gap-2 sm:gap-8 md:gap-32",children:[" ",e.jsx("h1",{className:"text-4xl mx-4 mt-40",children:e.jsx("strong",{children:"Our Partners"})})]}),e.jsx("ol",{className:"my-8 flex flex-col gap-32",children:e.jsx("li",{children:e.jsx("div",{className:"flex flex-wrap justify-center max-w-[1100px] mx-auto w-auto",children:Ot.map((n,r)=>e.jsx("div",{className:"w-24 h-24 flex items-center m-6",children:e.jsx("img",{src:n,alt:"Logo",className:"mx-auto block",sizes:"100vw"})},r))})})})]}),e.jsx("div",{className:"container mx-auto",children:e.jsxs("div",{className:"grid grid-cols-1 sm:grid-cols-2 gap-8",children:[e.jsx(Ce,{models:s.models}),e.jsx(Te,{runGroups:s.run_groups})]})})]})]}):null}function Ft(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"Massive Multitask Language Understanding (MMLU) on HELM"}),e.jsxs("div",{className:"flex flex-col lg:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsxs("p",{children:[e.jsx("strong",{children:"Massive Multitask Language Understanding (MMLU)"})," ",e.jsx("a",{href:"https://arxiv.org/pdf/2009.03300.pdf",className:"link",children:"(Hendrycks et al, 2020)"})," ","is a multiple-choice question answering test that covers 57 tasks including elementary mathematics, US history, computer science, law, and more. We publish evaluation results from evaluating various models on MMLU using HELM. 
Our evaluation results include the following:"]}),e.jsxs("ul",{className:"my-2 list-disc list-inside",children:[e.jsx("li",{children:"Simple, standardized prompts"}),e.jsx("li",{children:"Accuracy breakdown for each of the 57 subjects"}),e.jsx("li",{children:"Full transparency of all raw prompts and predictions"})]}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://crfm.stanford.edu/2024/05/01/helm-mmlu.html",children:"Blog Post"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsx("div",{className:"flex-1",children:e.jsx(S,{})})]})]})}const _t=""+new URL("air-overview-d2e6c49f.png",import.meta.url).href;function Vt(){const s={fontVariant:"small-caps",fontWeight:"bold"},t=e.jsx("span",{style:s,children:"AIR-Bench 2024"});return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:t}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsx("img",{src:_t,alt:"AIR 2024 Categories",className:"mx-auto my-4 block w-3/4",sizes:"100vw"}),e.jsxs("p",{children:["We introduce ",t,", the first AI safety benchmark aligned with emerging government regulations and company policies, following the regulation-based safety categories grounded in our AI Risks study, AIR 2024. AIR 2024 decomposes 8 government regulations and 16 company policies into a four-tiered safety taxonomy with 314 granular risk categories in the lowest tier. ",t," contains 5,694 diverse prompts spanning these categories, with manual curation and human auditing to ensure quality. We evaluate leading language models on ",t,", uncovering insights into their alignment with specified safety concerns. 
By bridging the gap between public benchmarks and practical AI risks, ",t," ","provides a foundation for assessing model safety across jurisdictions, fostering the development of safer and more responsible AI systems."]}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://arxiv.org/abs/2407.17436",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(S,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(j,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const zt=""+new URL("scb10x-204bd786.png",import.meta.url).href,Wt=""+new URL("scbx-71e53e72.jpg",import.meta.url).href;function qt(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"ThaiExam"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsxs("div",{className:"text-center",children:[e.jsx("a",{href:"https://scbx.com/",children:e.jsx("img",{src:Wt,alt:"Logo",className:"inline h-32 mx-4 my-4"})}),e.jsx("a",{href:"https://scb10x.com/",children:e.jsx("img",{src:zt,alt:"Logo",className:"inline h-32 mx-4 my-4"})})]}),e.jsxs("p",{children:["In collaboration with"," ",e.jsx("a",{href:"https://www.scbx.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"SCBX"})," ","and"," ",e.jsx("a",{href:"https://www.scb10x.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"SCB 10X"}),", we introduce the ThaiExam HELM leaderboard. 
ThaiExam is a Thai language benchmark based on examinations for high school students and investment professionals in Thailand. The ThaiExam leaderboard is the first public leaderboard for large language models on Thai language scenarios, and features evaluations of leading language models. Like all other HELM leaderboards, the ThaiExam leaderboard provides full prompt-level transparency, and the results can be fully reproduced using the HELM framework. We hope that this leaderboard will encourage further work in multilingual language model evaluation."]}),e.jsx("div",{className:"flex flex-row justify-center my-4",children:e.jsx(j,{to:"leaderboard",className:"px-10 btn rounded-md mx-4",children:"Full Leaderboard"})})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(S,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(j,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const Gt=""+new URL("wellsfargo-a86a6c4a.png",import.meta.url).href;function Kt(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"HELM Finance"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsx("div",{children:e.jsx("a",{href:"https://wellsfargo.com/",children:e.jsx("img",{src:Gt,alt:"Logo",className:"mx-auto block my-4 w-48"})})}),e.jsxs("p",{children:["In collaboration with"," ",e.jsx("a",{href:"https://www.wellsfargo.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"Wells Fargo"}),", we introduce the ",e.jsx("span",{className:"font-bold",children:"HELM Finance"})," ","leaderboard for ecologically-valid evaluations of leading language models in the financial domain. 
The leaderboard evaluates the ability of language models to perform tasks from financial professions on publicly financial documents across a range of scenarios."]}),e.jsx("div",{className:"flex flex-row justify-center my-4",children:e.jsx(j,{to:"leaderboard",className:"px-10 btn rounded-md mx-4",children:"Full Leaderboard"})})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(S,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(j,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const Qt=""+new URL("heim-logo-3e5e3aa4.png",import.meta.url).href;function $t({metricFieldMap:s,metricGroups:t}){const n=new Set,r=[];return t.forEach(a=>{const l=[];a.metrics.forEach(c=>{const o=s[c.name];o&&(l.push(o),n.add(o.name))}),l.length>0&&r.push([a,l])}),e.jsxs("section",{children:[e.jsxs("h3",{className:"text-3xl",children:[n.size," metrics"]}),e.jsx("ul",{children:r.map(([a,l])=>e.jsxs("li",{className:"my-3",children:[e.jsx("h4",{children:a.display_name}),e.jsx("ul",{className:"list-disc list-inside",children:l.map(c=>e.jsx("li",{className:"ml-4",children:c.display_name},c.name))})]},a.name))})]})}function Jt(){const[s,t]=i.useState(void 0);i.useEffect(()=>{const r=new AbortController;async function a(){const l=await D(r.signal);t(l)}return a(),()=>r.abort()},[]);const n=s?s.metrics.reduce((r,a)=>(r[a.name]=a,r),{}):void 0;return e.jsxs("div",{className:"container mx-auto px-16 text-base",children:[e.jsx("div",{className:"container max-w-screen-lg mx-auto",children:e.jsx("img",{className:"mx-auto w-96",src:Qt,alt:"HEIM Logo"})}),e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"Holistic Evaluation of Text-To-Image Models"}),e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center gap-2 sm:gap-8 md:gap-32 my-8",children:[e.jsx("a",{className:"px-10 btn 
rounded-md",href:"https://arxiv.org/abs/2311.04287",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"})]}),e.jsxs("p",{className:"my-2",children:["Significant effort has recently been made in developing text-to-image generation models, which take textual prompts as input and generate images. As these models are widely used in real-world applications, there is an urgent need to comprehensively understand their capabilities and risks. However, existing evaluations primarily focus on image-text alignment and image quality. To address this limitation, we introduce a new benchmark,"," ",e.jsx("strong",{children:"Holistic Evaluation of Text-To-Image Models (HEIM)"}),"."]}),e.jsx("p",{className:"my-2",children:"We identify 12 different aspects that are important in real-world model deployment, including:"}),e.jsxs("ul",{className:"my-2 list-disc list-inside unreset",children:[e.jsx("li",{children:"image-text alignment"}),e.jsx("li",{children:"image quality"}),e.jsx("li",{children:"aesthetics"}),e.jsx("li",{children:"originality"}),e.jsx("li",{children:"reasoning"}),e.jsx("li",{children:"knowledge"}),e.jsx("li",{children:"bias"}),e.jsx("li",{children:"toxicity"}),e.jsx("li",{children:"fairness"}),e.jsx("li",{children:"robustness"}),e.jsx("li",{children:"multilinguality"}),e.jsx("li",{children:"efficiency"})]}),e.jsx("p",{className:"my-2",children:"By curating scenarios encompassing these aspects, we evaluate state-of-the-art text-to-image models using this benchmark. Unlike previous evaluations that focused on alignment and quality, HEIM significantly improves coverage by evaluating all models across all aspects. 
Our results reveal that no single model excels in all aspects, with different models demonstrating strengths in different aspects."}),e.jsx("p",{className:"my-2",children:"For full transparency, this website contains all the prompts, generated images and the results for the automated and human evaluation metrics."}),e.jsx("p",{className:"my-2",children:"Inspired by HELM, we decompose the model evaluation into four key components: aspect, scenario, adaptation, and metric:"}),e.jsx("div",{className:"container max-w-screen-lg mx-auto my-8",children:e.jsx("img",{src:"https://crfm.stanford.edu/heim/latest/images/heim-main.png",alt:"HEIM scenarios, prompts, images and metrics"})}),s&&n?e.jsxs("div",{className:"grid grid-cols-1 md:grid-cols-3 gap-8",children:[e.jsx(Ce,{models:s.models}),e.jsx(Te,{runGroups:s.run_groups}),e.jsx($t,{metricFieldMap:n,metricGroups:s.metric_groups})]}):null]})}const Yt=""+new URL("vhelm-framework-a1ca3f3f.png",import.meta.url).href,Xt=""+new URL("vhelm-model-8afb7616.png",import.meta.url).href,Zt=""+new URL("vhelm-aspects-1437d673.png",import.meta.url).href;function en(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"Holistic Evaluation of Vision-Language Models"}),e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center gap-2 sm:gap-8 md:gap-32 my-8",children:[e.jsx("a",{className:"px-10 btn rounded-md",href:"https://arxiv.org/abs/2410.07112",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"#/leaderboard",children:"Leaderboard"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"})]}),e.jsxs("p",{className:"my-4",children:["Current benchmarks for assessing vision-language models (VLMs) often focus on their perception or problem-solving capabilities and neglect other critical aspects such as fairness, multilinguality, or toxicity. 
Furthermore, they differ in their evaluation procedures and the scope of the evaluation, making it difficult to compare models. To address these issues, we extend the HELM framework to VLMs to present the Holistic Evaluation of Vision Language Models (VHELM). To address these issues, we introduce VHELM, built on HELM for language models. VHELM aggregates various datasets to cover one or more of the 9 aspects:"," ",e.jsx("b",{children:"visual perception"}),", ",e.jsx("b",{children:"bias"}),", ",e.jsx("b",{children:"fairness"}),", ",e.jsx("b",{children:"knowledge"}),", ",e.jsx("b",{children:"multilinguality"}),", ",e.jsx("b",{children:"reasoning"}),", ",e.jsx("b",{children:"robustness"}),","," ",e.jsx("b",{children:"safety"}),", and ",e.jsx("b",{children:"toxicity"}),". In doing so, we produce a comprehensive, multi-dimensional view of the capabilities of the VLMs across these important factors. In addition, we standardize the standard inference parameters, methods of prompting, and evaluation metrics to enable fair comparisons across models. Our framework is designed to be lightweight and automatic so that evaluation runs are cheap and fast. For transparency, we release the raw model generations and complete results on this website."]}),e.jsx("p",{className:"my-4 font-bold",children:"VHELM is intended to be a living benchmark. 
We hope to continue adding new datasets, models and metrics over time, so please stay tuned!"}),e.jsxs("div",{className:"my-16 flex flex-col lg:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsx("img",{src:Xt,alt:"A vision-lanuage model (VLM) takes in an image and a text prompt and generates text.",className:""}),e.jsx("img",{src:Yt,alt:"An example of an evaluation for an Aspect (Knowledge) - a Scenario (MMMU) undergoes Adaptation (multimodal multiple choice) for a Model (GPT-4 Omni), then Metrics (Exact match) are computed",className:""})]}),e.jsxs("div",{className:"flex-1",children:[e.jsx(S,{}),e.jsx(j,{to:"leaderboard",className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})]})]}),e.jsx("div",{className:"container max-w-screen-lg mx-auto my-8",children:e.jsx("img",{src:Zt,alt:"An example of each aspect in VHELM: Visual Perception, Bias, Fairness, Knowledge, Multilinguality, Reasoning, Robustness, Toxicity Mitigation and Safety. 
",className:""})})]})}const sn=""+new URL("accenture-6f97eeda.png",import.meta.url).href,tn=""+new URL("cresta-9e22b983.png",import.meta.url).href;function nn(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"HELM Call Center"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsxs("div",{className:"text-center",children:[e.jsx("a",{href:"https://www.accenture.com/",children:e.jsx("img",{src:sn,alt:"Logo",className:"inline h-12 mx-4 my-4"})}),e.jsx("a",{href:"https://www.cresta.com/",children:e.jsx("img",{src:tn,alt:"Logo",className:"inline h-8 mx-4 my-4"})})]}),e.jsxs("p",{children:["In collaboration with"," ",e.jsx("a",{href:"https://www.accenture.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"Accenture"})," ","and"," ",e.jsx("a",{href:"https://www.cresta.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"Cresta"}),", we introduce the HELM"," ",e.jsx("span",{className:"font-bold",children:"Call Center"})," leaderboard. 
HELM Call Center is a leaderboard consisting of evaluations of leading language models on scenarios with realistic tasks from the call center context."]}),e.jsx("div",{className:"flex flex-row justify-center my-4",children:e.jsx(j,{to:"leaderboard",className:"px-10 btn rounded-md mx-4",children:"Full Leaderboard"})})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(S,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(j,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const rn=""+new URL("cuhk-8c5631e9.png",import.meta.url).href;function an(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"Chinese Language Models EVAluation Platform (CLEVA)"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsx("div",{className:"text-center",children:e.jsx("a",{href:"https://www.cuhk.edu.hk/",children:e.jsx("img",{src:rn,alt:"Logo",className:"inline h-12 mx-4 my-4"})})}),e.jsxs("p",{children:["In collaboration with the"," ",e.jsx("a",{href:"https://lwwangcse.github.io/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"LaVi Lab"})," ","team from"," ",e.jsx("a",{href:"https://www.cuhk.edu.hk/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"The Chinese University of Hong Kong (CUHK)"}),", we introduce the"," ",e.jsx("strong",{className:"font-bold",children:"Chinese Language Models EVAluation Platform (CLEVA)"})," ","leaderboard on HELM. 
CLEVA is a comprehensive Chinese-language benchmark for holistic evaluation of Chinese-language LLMs, and employs a standardized workflow to assess LLMs' performance across various dimensions."]}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://arxiv.org/abs/2308.04813",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(S,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(j,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}function ln(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"HELM Tables"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsx("div",{className:"text-center",children:e.jsx("a",{href:"https://www.ibm.com/",children:e.jsx("img",{src:ie,alt:"Logo",className:"inline h-12 mx-4 my-4"})})}),e.jsxs("p",{children:["In collaboration with"," ",e.jsx("a",{href:"https://research.ibm.com/",className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",children:"IBM Research"}),", we introduce the"," ",e.jsx("strong",{className:"font-bold",children:"HELM Tables"})," leaderboard on HELM. 
",e.jsx("strong",{className:"font-bold",children:"HELM Tables"})," is a holistic evaluation of leading language models that tests their capability to understand, process and analyze structured tabular input data."]}),e.jsx("div",{className:"flex flex-row justify-center my-4",children:e.jsx(j,{to:"leaderboard",className:"px-10 btn rounded-md mx-4",children:"Full Leaderboard"})})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(S,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(j,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const cn=({id:s,title:t,text:n})=>((t==="Classic"||t==="Lite"||t==="Instruct")&&(t="HELM "+t),e.jsx("div",{className:"max-w-sm rounded overflow-hidden bg-gray-100 hover:scale-105 transition-transform duration-300",children:e.jsx("a",{href:ne(void 0,s),children:e.jsxs("div",{className:"px-6 py-4",children:[e.jsxs("div",{className:"font-bold text-xl mb-2",children:[e.jsx("div",{className:"py-3",children:e.jsx("svg",{fill:"#000000",width:"20px",height:"20px",viewBox:"0 0 24 24",xmlns:"http://www.w3.org/2000/svg",children:e.jsx("path",{d:"M22,7H16.333V4a1,1,0,0,0-1-1H8.667a1,1,0,0,0-1,1v7H2a1,1,0,0,0-1,1v8a1,1,0,0,0,1,1H22a1,1,0,0,0,1-1V8A1,1,0,0,0,22,7ZM7.667,19H3V13H7.667Zm6.666,0H9.667V5h4.666ZM21,19H16.333V9H21Z"})})}),t+" →"]}),e.jsx("p",{className:"text-gray-700 text-base",children:n})]})})}));function on(){const[s,t]=i.useState();return i.useEffect(()=>{fetch("https://raw.githubusercontent.com/stanford-crfm/helm/main/helm-frontend/project_metadata.json").then(n=>n.json()).then(n=>{t(n)}).catch(n=>{console.error("Error fetching JSON:",n)})},[]),e.jsx("div",{className:"p-10 mb-20",children:e.jsx("div",{className:"grid grid-cols-2 lg:grid-cols-3 gap-4",children:s&&s.map((n,r)=>n.id==="home"?null:e.jsx(cn,{id:n.id,title:n.title,text:n.description},r))})})}function 
dn(){return e.jsxs("div",{className:"flex flex-col md:flex-row px-6 py-32",children:[e.jsxs("div",{className:"flex-1 p-4 flex flex-col justify-center",children:[e.jsx("div",{className:"flex justify-start",children:e.jsxs("div",{children:[e.jsx("h1",{className:"text-4xl mb-4 mx-4 mt-2",children:e.jsx("strong",{children:"A reproducible and transparent framework for evaluating foundation models."})}),e.jsx("h3",{className:`text-xl
+ mb-4 mx-4 mt-2`,children:"Find leaderboards with many scenarios, metrics, and models with support for multimodality and model-graded evaluation."})]})}),e.jsxs("div",{className:"flex flex-col md:flex-row justify-start mt-6 ml-4",children:[e.jsx("button",{className:"px-6 btn btn-grey rounded-md mb-4 md:mb-0",onClick:()=>window.scrollTo({top:760,behavior:"smooth"}),children:e.jsx("div",{children:"Leaderboards ↓"})}),e.jsx("button",{className:"px-6 btn btn-grey rounded-md md:ml-4",children:e.jsx("a",{href:"https://github.com/stanford-crfm/helm",children:"Github"})})]})]}),e.jsx("div",{className:"mx-4 mt-6 md:mt-0 md:w-1/3",children:e.jsx("img",{src:Pe,alt:"HELM Hero",className:"object-cover w-full h-full"})})]})}const mn=[Be,De,He,Ue,Oe,Fe,_e,Ve,ie,ze,We,qe,Ge,Ke,Qe,$e,Je,Ye,Xe,Ze];function hn(){const[s,t]=i.useState(void 0);return i.useEffect(()=>{const n=new AbortController;async function r(){const a=await D(n.signal);t(a)}return r(),()=>n.abort()},[]),s?e.jsxs(e.Fragment,{children:[e.jsx(dn,{}),e.jsx("div",{className:"container py-5 mx-auto text-lg",children:e.jsx("div",{className:"flex flex-col sm:flex-row justify-center mb-10 flex sm:gap-8 md:gap-32",children:e.jsx("h1",{className:"text-4xl mx-4 ",children:e.jsx("strong",{children:"HELM Leaderboards"})})})}),e.jsx(on,{}),e.jsx("div",{className:"mx-auto text-lg px-16",children:e.jsxs("div",{className:"container mb-12 mx-auto text-lg px-16",children:[e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center mt-10 mb-10 flex gap-2 sm:gap-8 md:gap-32",children:[" ",e.jsx("h1",{className:"text-4xl mx-4 mt-40",children:e.jsx("strong",{children:"Our Partners"})})]}),e.jsx("ol",{className:"my-8 flex flex-col gap-32",children:e.jsx("li",{children:e.jsx("div",{className:"flex flex-wrap justify-center max-w-[1100px] mx-auto w-auto",children:mn.map((n,r)=>e.jsx("div",{className:"w-24 h-24 flex items-center m-6",children:e.jsx("img",{src:n,alt:"Logo",className:"mx-auto 
block",sizes:"100vw"})},r))})})})]})})]}):null}const xn=""+new URL("overview-74aea3d8.png",import.meta.url).href,un=""+new URL("process-flow-bd2eba96.png",import.meta.url).href;function fn(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"Image2Struct: A Benchmark for Evaluating Vision-Language Models in Extracting Structured Information from Images"}),e.jsxs("div",{className:"flex flex-col sm:flex-row justify-center gap-2 sm:gap-2 md:gap-8 my-8",children:[e.jsx("a",{className:"px-10 btn rounded-md",href:"https://arxiv.org/abs/2410.22456",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://github.com/stanford-crfm/helm",children:"Github"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://huggingface.co/datasets/stanford-crfm/i2s-latex",children:"Latex dataset"}),e.jsx("a",{className:"px-5 btn rounded-md",href:"https://huggingface.co/datasets/stanford-crfm/i2s-webpage",children:"Webpage dataset"}),e.jsx("a",{className:"px-10 btn rounded-md",href:"https://huggingface.co/datasets/stanford-crfm/i2s-musicsheet",children:"Music sheet dataset"})]}),e.jsxs("div",{className:"flex flex-col lg:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsxs("p",{children:[e.jsx("strong",{children:"Image2Struct"})," is a benchmark for evaluating vision-language models in practical tasks of extracting structured information from images."]}),e.jsx("br",{}),e.jsx("p",{children:"In our tasks, VLMs are prompted to generate the underlying structured information (i.e., code) from an input image. The code can be compiled, and the output image is evaluated against the input image to produce a score. This round-trip evaluation allows us to quantitatively evaluate VLMs on complex tasks with multiple correct answers. 
We create a pipeline that downloads fresh, user-submitted data from active online communities upon execution, evaluates the VLMs shortly, and produces a leaderboard."}),e.jsx("br",{}),e.jsx("img",{src:xn,alt:"Evaluation flowchart",className:"mx-auto block w-full",sizes:"100vw"}),e.jsx("br",{}),e.jsx("p",{children:"We introduce 3 tasks:"}),e.jsxs("ul",{className:"my-2 list-disc list-inside",children:[e.jsx("li",{children:"LaTex: equations, tables, plots and algorithms from ArXiV papers."}),e.jsx("li",{children:"Webpages: pages from GitHub written in HTML, CSS and Javascript. ..."}),e.jsx("li",{children:"Music sheets: crops of measures from IMSLP music sheets."})]}),e.jsx("div",{className:"flex flex-row justify-center mt-8",children:e.jsx("a",{className:"px-10 btn rounded-md",href:"#/leaderboard",children:"Full Leaderboard"})})]}),e.jsx("div",{className:"flex-1",children:e.jsx(S,{numRowsToDisplay:12})})]}),e.jsx("br",{}),e.jsxs("div",{className:"flex flex-col lg:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsx("p",{children:"We provide an automated process for collecting new fresh data from online communities, evaluating the models and producing a leaderboard. The pipeline is designed to be executed on a regular basis to keep the leaderboard up-to-date."}),e.jsx("br",{}),e.jsxs("p",{children:["In addition to the automated data collection, we also provide a"," ",e.jsx("i",{children:"wild"})," subset for the LaTeX and webpage tasks that are collected from Wikipedia and various popular websites. 
These instances do not have a corresponding code, and the evaluation is done by our proposed metric: block EMD (Earth Mover Distance)."]})]}),e.jsx("div",{className:"flex-1",children:e.jsx("img",{src:un,alt:"7 scenarios, 4 models, 4 evaluators and 5 criteria",className:"mx-auto block w-full",sizes:"200vw"})})]})]})}function pn(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"Elements of World Knowledge (EWoK)"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsxs("p",{children:["We present the"," ",e.jsx("a",{className:"font-bold underline text-blue-600 hover:text-blue-800 visited:text-purple-600",href:"https://arxiv.org/abs/2405.09605",children:"Elements of World Knowledge (EWoK)"})," ","leaderboard in collaboration with the EWoK team. EWoK is a benchmark for evaluating world modeling in language models by testing their ability to use knowledge of a concept to match a target text with a plausible/implausible context. 
EWoK targets specific concepts from multiple knowledge domains known to be vital for world modeling in humans, including social interactions and spatial relations."]}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://arxiv.org/abs/2405.09605",children:"Paper"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(S,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(j,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}function gn(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"HELM Medical"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsx("p",{className:"my-2",children:"With the increasing scale and impact of language models, there has also been interest interest in using language models in the medical domain. However, the capabilities and risks of these models are not well-understood, and there is significant potential for harm in the medical setting."}),e.jsxs("p",{className:"my-2",children:["To address this, we present the"," ",e.jsx("a",{className:"font-bold",href:"https://arxiv.org/abs/2405.09605",children:"HELM Medical"})," ","leaderboard for evaluation of language models in the medical domain. The HELM Medical leaderboard presents evaluations of leading general-purpose language models as well as language models fine-tuned on the medical domain. 
These models are evaluated on a range of medical tasks based on the benchmarks used in"," ",e.jsx("a",{className:"underline text-blue-600 hover:text-blue-800 visited:text-purple-600",href:"https://arxiv.org/abs/2212.13138",children:"Singhal et al. 2022"}),". We hope that this leaderboard encourages further work in evaluating language models on tasks from the medical domain."]}),e.jsx("div",{className:"flex flex-row justify-center my-4",children:e.jsx(j,{to:"leaderboard",className:"px-10 btn rounded-md mx-4",children:"Full Leaderboard"})})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(S,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(j,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}const jn=""+new URL("helm-safety-2907a7b6.png",import.meta.url).href;function bn(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl mt-16 my-8 font-bold text-center",children:"HELM Safety"}),e.jsxs("div",{className:"flex flex-col lg:flex-row items-center gap-8",children:[e.jsxs("div",{className:"flex-1 text-xl",children:[e.jsx("img",{src:jn,alt:"Logo",className:"mx-auto p-0 block",style:{width:"300px"}}),e.jsx("p",{children:"Language models demonstrate powerful capabilities and pose significant risks. Given their widespread deployment, standardized public benchmarking of such models is vital. While language models are routinely evaluated on standard capability benchmarks, comparable standardization for benchmarking safety risks lags behind. To address this gap, we introduce HELM-Safety as a collection of 5 safety benchmarks that span 6 risk categories (e.g. violence, fraud, discrimination, sexual, harassment, deception). 
We present evaluation results for recent leading open weights and closed models."}),e.jsxs("div",{className:"flex flex-row justify-center mt-4",children:[e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"https://crfm.stanford.edu/2024/11/08/helm-safety.html",children:"Blog Post"}),e.jsx("a",{className:"px-10 btn rounded-md mx-4",href:"#/leaderboard",children:"Full Leaderboard"})]})]}),e.jsx("div",{className:"flex-1",children:e.jsx(S,{})})]})]})}function wn(){return e.jsxs("div",{className:"container mx-auto px-16",children:[e.jsx("h1",{className:"text-3xl my-8 font-bold text-center",children:"HELM Capabilities"}),e.jsxs("div",{className:"flex flex-col lg:flex-row gap-8",children:[e.jsxs("div",{className:"flex-1 text-l",children:[e.jsx("p",{children:"HELM Capabilities is a new leaderboard for benchmarking the capabilities of foundation models, featuring 6 challenging scenarios."}),e.jsxs("div",{className:"flex flex-row justify-center my-4",children:[e.jsx(j,{to:"#",className:"px-10 btn rounded-md mx-4",children:"Blog Post"}),e.jsx(j,{to:"leaderboard",className:"px-10 btn rounded-md mx-4",children:"Leaderboard"})]})]}),e.jsxs("div",{className:"py-2 pb-6 rounded-3xl bg-gray-100 h-full",style:{maxWidth:"100%"},children:[e.jsx(S,{}),e.jsx("div",{className:"flex justify-end",children:e.jsx(j,{to:"leaderboard",children:e.jsx("button",{className:"px-4 mx-3 mt-1 btn bg-white rounded-md",children:e.jsx("span",{children:"See More"})})})})]})]})]})}function vn(){return 
window.PROJECT_ID==="lite"?e.jsx(fe,{}):window.PROJECT_ID==="instruct"?e.jsx(Ht,{}):window.PROJECT_ID==="image2struct"?e.jsx(fn,{}):window.PROJECT_ID==="heim"?e.jsx(Jt,{}):window.PROJECT_ID==="mmlu"?e.jsx(Ft,{}):window.PROJECT_ID==="vhelm"?e.jsx(en,{}):window.PROJECT_ID==="air-bench"?e.jsx(Vt,{}):window.PROJECT_ID==="thaiexam"?e.jsx(qt,{}):window.PROJECT_ID==="finance"?e.jsx(Kt,{}):window.PROJECT_ID==="call-center"?e.jsx(nn,{}):window.PROJECT_ID==="cleva"?e.jsx(an,{}):window.PROJECT_ID==="tables"?e.jsx(ln,{}):window.PROJECT_ID==="ewok"?e.jsx(pn,{}):window.PROJECT_ID==="medical"?e.jsx(gn,{}):window.PROJECT_ID==="safety"?e.jsx(bn,{}):window.PROJECT_ID==="capabilities"?e.jsx(wn,{}):window.PROJECT_ID==="home"?e.jsx(hn,{}):e.jsx(fe,{})}function yn(){return e.jsx(ns,{children:e.jsx(rs,{children:e.jsxs(H,{path:"/",element:e.jsx(zs,{}),children:[e.jsx(H,{index:!0,element:e.jsx(vn,{})}),e.jsx(H,{path:"leaderboard",element:e.jsx(Pt,{})}),e.jsx(H,{path:"models",element:e.jsx(Qs,{})}),e.jsx(H,{path:"scenarios",element:e.jsx($s,{})}),e.jsx(H,{path:"groups",element:e.jsx(Zs,{})}),e.jsx(H,{path:"groups/:groupName",element:e.jsx(et,{})}),e.jsx(H,{path:"runs",element:e.jsx(tt,{})}),e.jsx(H,{path:"runs/:runName",element:e.jsx(Tt,{})})]})})})}Z.createRoot(document.getElementById("root")).render(e.jsx(as.StrictMode,{children:e.jsx(yn,{})}));
diff --git a/src/helm/benchmark/static_build/index.html b/src/helm/benchmark/static_build/index.html
index 63214568b92..d8c8dbb7e4f 100644
--- a/src/helm/benchmark/static_build/index.html
+++ b/src/helm/benchmark/static_build/index.html
@@ -7,7 +7,7 @@