diff --git a/torchci/components/benchmark/llms/ModelGraphPanel.tsx b/torchci/components/benchmark/llms/ModelGraphPanel.tsx
index e1b48bd238..45d066a151 100644
--- a/torchci/components/benchmark/llms/ModelGraphPanel.tsx
+++ b/torchci/components/benchmark/llms/ModelGraphPanel.tsx
@@ -18,6 +18,7 @@ import {
   TimeSeriesPanelWithData,
 } from "components/metrics/panels/TimeSeriesPanel";
 import dayjs from "dayjs";
+import { computeSpeedup } from "lib/benchmark/aoUtils";
 import { useBenchmark } from "lib/benchmark/llmUtils";
 import { BranchAndCommit } from "lib/types";
 
@@ -26,6 +27,7 @@ const GRAPH_ROW_HEIGHT = 245;
 export function GraphPanel({
   queryParams,
   granularity,
+  repoName,
   modelName,
   backendName,
   dtypeName,
@@ -36,6 +38,7 @@ export function GraphPanel({
 }: {
   queryParams: { [key: string]: any };
   granularity: Granularity;
+  repoName: string;
   modelName: string;
   backendName: string;
   dtypeName: string;
@@ -65,6 +68,8 @@ export function GraphPanel({
     return <></>;
   }
 
+  const dataWithSpeedup = computeSpeedup(repoName, data);
+
   // Clamp to the nearest granularity (e.g. nearest hour) so that the times will
   // align with the data we get from the database
   const startTime = dayjs(queryParams["startTime"]).startOf(granularity);
@@ -79,7 +84,7 @@
   const chartData: { [k: string]: any } = {};
   const graphSeries: { [k: string]: any } = {};
   metricNames.forEach((metric: string) => {
-    chartData[metric] = data
+    chartData[metric] = dataWithSpeedup
       .filter((record: LLMsBenchmarkData) => {
         return (
           record.model === modelName &&
diff --git a/torchci/components/benchmark/llms/SummaryPanel.tsx b/torchci/components/benchmark/llms/SummaryPanel.tsx
index 74c00b8cf1..dffaa15187 100644
--- a/torchci/components/benchmark/llms/SummaryPanel.tsx
+++ b/torchci/components/benchmark/llms/SummaryPanel.tsx
@@ -63,43 +63,51 @@ export function SummaryPanel({
       },
       renderCell: (params: GridRenderCellParams) => {
         const model = params.value.model;
-        const dtype = params.value.dtype;
-        const deviceArch = `${params.value.device} (${params.value.arch})`;
         if (model === undefined) {
           return `Invalid model name`;
         }
-        if (dtype === undefined) {
-          return `Invalid dtype for model ${model}`;
-        }
+        const dtype =
+          params.value.dtype !== undefined
+            ? `&dtypeName=${encodeURIComponent(params.value.dtype)}`
+            : "";
         const backend =
           params.value.backend !== undefined
-            ? `&${encodeURIComponent(params.value.backend)}`
+            ? `&backendName=${encodeURIComponent(params.value.backend)}`
             : "";
+        const deviceArch = `${params.value.device} (${params.value.arch})`;
+
         const url = `/benchmark/llms?startTime=${startTime}&stopTime=${stopTime}&granularity=${granularity}&repoName=${encodeURIComponent(
           repoName
         )}&modelName=${encodeURIComponent(
           model
-        )}${backend}&dtypeName=${encodeURIComponent(
-          dtype
-        )}&deviceName=${encodeURIComponent(deviceArch)}`;
+        )}${backend}${dtype}&deviceName=${encodeURIComponent(deviceArch)}`;
 
         const isNewModel = params.value.l === undefined ? "(NEW!) " : "";
         const isModelStopRunning = params.value.r === undefined ? "❌" : "";
 
-        const displayName = model.includes(dtype)
-          ? model
-          : `${model} (${dtype})`;
         return (
           <a href={url}>
             {isNewModel}
-            {isModelStopRunning} {displayName}
+            {isModelStopRunning} {model}
           </a>
         );
       },
     },
   ];
 
+  const hasDtype = data.length > 0 && "dtype" in data[0] ? true : false;
+  if (hasDtype) {
+    columns.push({
+      field: "dtype",
+      headerName: "Quantization",
+      flex: 1,
+      renderCell: (params: GridRenderCellParams) => {
+        return `${params.value}`;
+      },
+    });
+  }
+
   const hasBackend = data.length > 0 && "backend" in data[0] ? true : false;
   if (hasBackend) {
     columns.push({
@@ -155,18 +163,23 @@ export function SummaryPanel({
           return styles.error;
         }
 
-        // Higher value
-        if (r - l > RELATIVE_THRESHOLD * l) {
-          return IS_INCREASING_METRIC_VALUE_GOOD[metric]
-            ? styles.ok
-            : styles.error;
-        }
-
-        // Lower value
-        if (l - r > RELATIVE_THRESHOLD * r) {
-          return IS_INCREASING_METRIC_VALUE_GOOD[metric]
-            ? styles.error
-            : styles.ok;
+        if (metric in IS_INCREASING_METRIC_VALUE_GOOD) {
+          // Higher value
+          if (r - l > RELATIVE_THRESHOLD * l) {
+            return IS_INCREASING_METRIC_VALUE_GOOD[metric]
+              ? styles.ok
+              : styles.error;
+          }
+
+          // Lower value
+          if (l - r > RELATIVE_THRESHOLD * r) {
+            return IS_INCREASING_METRIC_VALUE_GOOD[metric]
+              ? styles.error
+              : styles.ok;
+          }
+        } else {
+          // No data
+          return "";
         }
       }
 
diff --git a/torchci/components/benchmark/llms/common.tsx b/torchci/components/benchmark/llms/common.tsx
index 53d131f2d2..2f95c11e24 100644
--- a/torchci/components/benchmark/llms/common.tsx
+++ b/torchci/components/benchmark/llms/common.tsx
@@ -1,6 +1,6 @@
 import { BranchAndCommit } from "lib/types";
 
-export const REPOS = ["pytorch/pytorch", "pytorch/executorch"];
+export const REPOS = ["pytorch/pytorch", "pytorch/executorch", "pytorch/ao"];
 export const REPO_TO_BENCHMARKS: { [k: string]: string[] } = {
   "pytorch/pytorch": ["PyTorch gpt-fast benchmark"],
   "pytorch/executorch": ["ExecuTorch"],
@@ -23,6 +23,7 @@ export const IS_INCREASING_METRIC_VALUE_GOOD: { [k: string]: boolean } = {
   token_per_sec: true,
   flops_utilization: true,
   "compilation_time(s)": false,
+  speedup: true,
 };
 export const METRIC_DISPLAY_SHORT_HEADERS: { [k: string]: string } = {
   "memory_bandwidth(GB/s)": "Bandwidth",
@@ -40,9 +41,9 @@ export const RELATIVE_THRESHOLD = 0.05;
 export interface LLMsBenchmarkData {
   granularity_bucket: string;
   model: string;
-  backend?: string;
+  backend: string;
   workflow_id: number;
-  job_id?: number;
+  job_id: number;
   metric: string;
   actual: number;
   target: number;
diff --git a/torchci/lib/benchmark/aoUtils.ts b/torchci/lib/benchmark/aoUtils.ts
index 923724ca7f..f8e9911198 100644
--- a/torchci/lib/benchmark/aoUtils.ts
+++ b/torchci/lib/benchmark/aoUtils.ts
@@ -1,5 +1,15 @@
+import { LLMsBenchmarkData } from "components/benchmark/llms/common";
 import { BenchmarkData, CompilerPerformanceData } from "lib/types";
 
+export const TORCHAO_REPO = "pytorch/ao";
+// TODO (huydhn): Find a better way to abstract this baseline concept, for example,
+// this could be dtype noquant for TorchAO, or eager config for inductor
+export const TORCHAO_BASELINE = "noquant";
+// TODO (huydhn): The following are TorchAO speedup metrics. Check with ao team to
+// see if this information could be codified on the benchmark instead of keeping it
+// here on the dashboard
+const SPEEDUP_METRICS = ["tok/s", "time_ms(avg)", "time_s(avg)", "img_s(avg)"];
+
 // TODO (huydhn): Use this function to convert the generic benchmark data to the old
 // CompilerPerformanceData format. This is needed until the TorchInductor dashboard
 // is migrated to the new format
@@ -43,3 +53,50 @@ export function convertToCompilerPerformanceData(data: BenchmarkData[]) {
 
   return Object.values(convertData);
 }
+
+export function computeSpeedup(repoName: string, data: LLMsBenchmarkData[]) {
+  if (repoName !== TORCHAO_REPO) {
+    return data;
+  }
+
+  const baselineMetrics: { [key: string]: LLMsBenchmarkData } = {};
+  data.forEach((r: LLMsBenchmarkData) => {
+    if (r.dtype !== TORCHAO_BASELINE) {
+      return;
+    }
+
+    const k = `${r.workflow_id} ${r.job_id} ${r.model} ${r.metric} ${r.device} ${r.arch}`;
+    baselineMetrics[k] = r;
+  });
+
+  const withSpeedup: LLMsBenchmarkData[] = [];
+  data.forEach((r: LLMsBenchmarkData) => {
+    if (r.dtype === TORCHAO_BASELINE) {
+      return;
+    }
+
+    if (SPEEDUP_METRICS.includes(r.metric)) {
+      const k = `${r.workflow_id} ${r.job_id} ${r.model} ${r.metric} ${r.device} ${r.arch}`;
+      if (
+        k in baselineMetrics &&
+        baselineMetrics[k].actual !== 0 &&
+        r.actual !== 0
+      ) {
+        const speedup = r.metric.includes("time")
+          ? baselineMetrics[k].actual / r.actual
+          : r.actual / baselineMetrics[k].actual;
+
+        withSpeedup.push({
+          ...r,
+          metric: "speedup",
+          actual: Number(speedup.toFixed(4)),
+          target: 0,
+        });
+      }
+    }
+
+    withSpeedup.push(r);
+  });
+
+  return withSpeedup;
+}
diff --git a/torchci/lib/benchmark/llmUtils.ts b/torchci/lib/benchmark/llmUtils.ts
index c804f6d1d5..bf5f638bfe 100644
--- a/torchci/lib/benchmark/llmUtils.ts
+++ b/torchci/lib/benchmark/llmUtils.ts
@@ -118,6 +118,10 @@ export function combineLeftAndRight(
         row["metadata"]["r"] ?? (hasR ? record["r"]["job_id"] : undefined);
     }
 
+    if (dtype !== "") {
+      row["dtype"] = dtype;
+    }
+
     if (backend !== "") {
       row["backend"] = backend;
     }
diff --git a/torchci/pages/benchmark/llms.tsx b/torchci/pages/benchmark/llms.tsx
index d610c00327..50db7da7d7 100644
--- a/torchci/pages/benchmark/llms.tsx
+++ b/torchci/pages/benchmark/llms.tsx
@@ -21,6 +21,7 @@ import CopyLink from "components/CopyLink";
 import GranularityPicker from "components/GranularityPicker";
 import { Granularity } from "components/metrics/panels/TimeSeriesPanel";
 import dayjs from "dayjs";
+import { computeSpeedup, TORCHAO_BASELINE } from "lib/benchmark/aoUtils";
 import { useBenchmark } from "lib/benchmark/llmUtils";
 import { fetcher } from "lib/GeneralUtils";
 import { BranchAndCommit } from "lib/types";
@@ -81,6 +82,13 @@ function Report({
     );
   }
 
+  const lDataWithSpeedup = computeSpeedup(repoName, lData);
+  const rDataWithSpeedup = computeSpeedup(repoName, rData);
+
+  if (repoName === "pytorch/ao") {
+    metricNames = ["speedup", ...metricNames];
+  }
+
   return (
@@ -237,7 +246,12 @@ export default function Page() {
   const queryName = "oss_ci_benchmark_names";
   const queryParams = {
     deviceArch: deviceName === DEFAULT_DEVICE_NAME ? "" : deviceName,
-    dtypes: dtypeName === DEFAULT_DTYPE_NAME ? [] : [dtypeName],
+    dtypes:
+      dtypeName === DEFAULT_DTYPE_NAME
+        ? []
+        : repoName !== "pytorch/ao"
+        ? [dtypeName]
+        : [dtypeName, TORCHAO_BASELINE],
     excludedMetrics: EXCLUDED_METRICS,
     benchmarks: REPO_TO_BENCHMARKS[repoName],
     granularity: granularity,
@@ -274,7 +288,10 @@ export default function Page() {
   ];
   const dtypeNames: string[] = _.compact([
     DEFAULT_DTYPE_NAME,
-    ...(_.uniq(data.map((r: any) => r.dtype)) as string[]),
+    ..._.filter(
+      _.uniq(data.map((r: any) => r.dtype)) as string[],
+      (r: string) => r !== TORCHAO_BASELINE
+    ),
   ]);
   const metricNames: string[] = _.uniq(data.map((r: any) => r.metric));
 
@@ -372,7 +389,6 @@ export default function Page() {
         useClickHouse={true}
       />
-
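
For reviewers, a minimal sketch (not part of the patch) of what the new `computeSpeedup` helper in `torchci/lib/benchmark/aoUtils.ts` does. The sample rows below are made up for illustration; only the pairing key (workflow_id, job_id, model, metric, device, arch) and the `noquant` baseline come from the patch itself:

```ts
import { computeSpeedup, TORCHAO_REPO } from "lib/benchmark/aoUtils";

// Two hypothetical rows from the same workflow/job: a noquant baseline and
// an int8 weight-only run of the same model, metric, device, and arch.
const rows: any[] = [
  { granularity_bucket: "2024-10-01", model: "llama", backend: "", dtype: "noquant",
    workflow_id: 1, job_id: 1, metric: "tok/s", actual: 12.5, target: 0,
    device: "cuda", arch: "A100" },
  { granularity_bucket: "2024-10-01", model: "llama", backend: "", dtype: "int8wo",
    workflow_id: 1, job_id: 1, metric: "tok/s", actual: 25.0, target: 0,
    device: "cuda", arch: "A100" },
];

// For pytorch/ao, the helper drops the noquant baseline row from the output
// and returns the int8wo row preceded by a derived row
// { metric: "speedup", actual: 2 }, since tok/s is a higher-is-better metric
// (25.0 / 12.5). For time-based metrics the ratio is inverted
// (baseline / quantized), so lower times still yield speedup > 1.
console.log(computeSpeedup(TORCHAO_REPO, rows));
```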