Add speedup metric for TorchAO (#6118)
This is my initial attempt to add the speedup metric for TorchAO, computed by comparing `autoquant` against the `noquant` baseline. This is by no means the best approach because it requires custom logic for TorchAO on the dashboard. On the other hand, it's easy to implement, and I think it's better to get the UX done first to gather early feedback from @jerryzh168 and the rest of the ao team.

IMO, better approaches would be to either 1) set the speedup metric on the TorchAO side or 2) compute the speedup metric in ClickHouse. Both are more involved and require further design discussion.
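
For reference, the speedup here is the ratio between the `autoquant` and `noquant` numbers for the same model, metric, device, and arch, with the direction flipped for time-based metrics. A minimal sketch of the formula (names are illustrative, not from the codebase):

```ts
// Sketch of the speedup formula; `speedupOf` is a hypothetical helper.
// For time-based metrics (e.g. time_ms(avg)) lower is better, so
// speedup = baseline / candidate; for throughput metrics (e.g. tok/s)
// higher is better, so speedup = candidate / baseline.
function speedupOf(metric: string, baseline: number, candidate: number): number {
  return metric.includes("time") ? baseline / candidate : candidate / baseline;
}

speedupOf("tok/s", 100, 150); // 1.5 — autoquant generates tokens 1.5x faster
speedupOf("time_ms(avg)", 20, 10); // 2.0 — autoquant halves the latency
```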

### Testing

https://torchci-git-fork-huydhn-add-speedup-llm-dashboard-fbopensource.vercel.app/benchmark/llms?repoName=pytorch%2Fao
huydhn authored Dec 28, 2024
1 parent 6f108ab commit d407580
Showing 6 changed files with 134 additions and 38 deletions.
7 changes: 6 additions & 1 deletion torchci/components/benchmark/llms/ModelGraphPanel.tsx
@@ -18,6 +18,7 @@ import {
TimeSeriesPanelWithData,
} from "components/metrics/panels/TimeSeriesPanel";
import dayjs from "dayjs";
import { computeSpeedup } from "lib/benchmark/aoUtils";
import { useBenchmark } from "lib/benchmark/llmUtils";
import { BranchAndCommit } from "lib/types";

@@ -26,6 +27,7 @@ const GRAPH_ROW_HEIGHT = 245;
export function GraphPanel({
queryParams,
granularity,
repoName,
modelName,
backendName,
dtypeName,
@@ -36,6 +38,7 @@ export function GraphPanel({
}: {
queryParams: { [key: string]: any };
granularity: Granularity;
repoName: string;
modelName: string;
backendName: string;
dtypeName: string;
@@ -65,6 +68,8 @@
return <></>;
}

const dataWithSpeedup = computeSpeedup(repoName, data);

// Clamp to the nearest granularity (e.g. nearest hour) so that the times will
// align with the data we get from the database
const startTime = dayjs(queryParams["startTime"]).startOf(granularity);
@@ -79,7 +84,7 @@
const chartData: { [k: string]: any } = {};
const graphSeries: { [k: string]: any } = {};
metricNames.forEach((metric: string) => {
chartData[metric] = data
chartData[metric] = dataWithSpeedup
.filter((record: LLMsBenchmarkData) => {
return (
record.model === modelName &&
63 changes: 38 additions & 25 deletions torchci/components/benchmark/llms/SummaryPanel.tsx
@@ -63,43 +63,51 @@ export function SummaryPanel({
},
renderCell: (params: GridRenderCellParams<any>) => {
const model = params.value.model;
const dtype = params.value.dtype;
const deviceArch = `${params.value.device} (${params.value.arch})`;
if (model === undefined) {
return `Invalid model name`;
}
if (dtype === undefined) {
return `Invalid dtype for model ${model}`;
}

const dtype =
params.value.dtype !== undefined
? `&dtypeName=${encodeURIComponent(params.value.dtype)}`
: "";
const backend =
params.value.backend !== undefined
? `&${encodeURIComponent(params.value.backend)}`
? `&backendName=${encodeURIComponent(params.value.backend)}`
: "";
const deviceArch = `${params.value.device} (${params.value.arch})`;

const url = `/benchmark/llms?startTime=${startTime}&stopTime=${stopTime}&granularity=${granularity}&repoName=${encodeURIComponent(
repoName
)}&modelName=${encodeURIComponent(
model
)}${backend}&dtypeName=${encodeURIComponent(
dtype
)}&deviceName=${encodeURIComponent(deviceArch)}`;
)}${backend}${dtype}&deviceName=${encodeURIComponent(deviceArch)}`;

const isNewModel = params.value.l === undefined ? "(NEW!) " : "";
const isModelStopRunning = params.value.r === undefined ? "❌" : "";

const displayName = model.includes(dtype)
? model
: `${model} (${dtype})`;
return (
<a href={url}>
{isNewModel}
{isModelStopRunning}&nbsp;<b>{displayName}</b>
{isModelStopRunning}&nbsp;<b>{model}</b>
</a>
);
},
},
];

const hasDtype = data.length > 0 && "dtype" in data[0] ? true : false;
if (hasDtype) {
columns.push({
field: "dtype",
headerName: "Quantization",
flex: 1,
renderCell: (params: GridRenderCellParams<any>) => {
return `${params.value}`;
},
});
}

const hasBackend = data.length > 0 && "backend" in data[0] ? true : false;
if (hasBackend) {
columns.push({
@@ -155,18 +163,23 @@
return styles.error;
}

// Higher value
if (r - l > RELATIVE_THRESHOLD * l) {
return IS_INCREASING_METRIC_VALUE_GOOD[metric]
? styles.ok
: styles.error;
}

// Lower value
if (l - r > RELATIVE_THRESHOLD * r) {
return IS_INCREASING_METRIC_VALUE_GOOD[metric]
? styles.error
: styles.ok;
if (metric in IS_INCREASING_METRIC_VALUE_GOOD) {
// Higher value
if (r - l > RELATIVE_THRESHOLD * l) {
return IS_INCREASING_METRIC_VALUE_GOOD[metric]
? styles.ok
: styles.error;
}

// Lower value
if (l - r > RELATIVE_THRESHOLD * r) {
return IS_INCREASING_METRIC_VALUE_GOOD[metric]
? styles.error
: styles.ok;
}
} else {
// No data
return "";
}
}

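For reference, the cell-coloring rule above only styles metrics registered in `IS_INCREASING_METRIC_VALUE_GOOD`, which is why `speedup` is added to that map in `common.tsx` below. A standalone sketch of the rule, assuming a hypothetical `cellClass` helper with plain strings standing in for the CSS classes:

```ts
const RELATIVE_THRESHOLD = 0.05;
// Subset of the real map; see common.tsx below.
const IS_INCREASING_METRIC_VALUE_GOOD: { [k: string]: boolean } = {
  speedup: true,
  "compilation_time(s)": false,
};

// Hypothetical standalone version of the styling logic above.
function cellClass(metric: string, l: number, r: number): string {
  if (!(metric in IS_INCREASING_METRIC_VALUE_GOOD)) {
    return ""; // unknown metric: leave the cell unstyled
  }
  if (r - l > RELATIVE_THRESHOLD * l) {
    // value increased by more than 5%
    return IS_INCREASING_METRIC_VALUE_GOOD[metric] ? "ok" : "error";
  }
  if (l - r > RELATIVE_THRESHOLD * r) {
    // value decreased by more than 5%
    return IS_INCREASING_METRIC_VALUE_GOOD[metric] ? "error" : "ok";
  }
  return ""; // within the 5% noise band
}

cellClass("speedup", 1.0, 1.2); // "ok" — a higher speedup is an improvement
```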
7 changes: 4 additions & 3 deletions torchci/components/benchmark/llms/common.tsx
@@ -1,6 +1,6 @@
import { BranchAndCommit } from "lib/types";

export const REPOS = ["pytorch/pytorch", "pytorch/executorch"];
export const REPOS = ["pytorch/pytorch", "pytorch/executorch", "pytorch/ao"];
export const REPO_TO_BENCHMARKS: { [k: string]: string[] } = {
"pytorch/pytorch": ["PyTorch gpt-fast benchmark"],
"pytorch/executorch": ["ExecuTorch"],
@@ -23,6 +23,7 @@ export const IS_INCREASING_METRIC_VALUE_GOOD: { [k: string]: boolean } = {
token_per_sec: true,
flops_utilization: true,
"compilation_time(s)": false,
speedup: true,
};
export const METRIC_DISPLAY_SHORT_HEADERS: { [k: string]: string } = {
"memory_bandwidth(GB/s)": "Bandwidth",
@@ -40,9 +41,9 @@ export const RELATIVE_THRESHOLD = 0.05;
export interface LLMsBenchmarkData {
granularity_bucket: string;
model: string;
backend?: string;
backend: string;
workflow_id: number;
job_id?: number;
job_id: number;
metric: string;
actual: number;
target: number;
57 changes: 57 additions & 0 deletions torchci/lib/benchmark/aoUtils.ts
@@ -1,5 +1,15 @@
import { LLMsBenchmarkData } from "components/benchmark/llms/common";
import { BenchmarkData, CompilerPerformanceData } from "lib/types";

export const TORCHAO_REPO = "pytorch/ao";
// TODO (huydhn): Find a better way to abstract this baseline concept, for example,
// this could be dtype noquant for TorchAO, or eager config for inductor
export const TORCHAO_BASELINE = "noquant";
// TODO (huydhn): The following are TorchAO speedup metrics. Check with ao team to
// see if this information could be codified on the benchmark instead of keeping it
// here on the dashboard
const SPEEDUP_METRICS = ["tok/s", "time_ms(avg)", "time_s(avg)", "img_s(avg)"];

// TODO (huydhn): Use this function to convert the generic benchmark data to the old
// CompilerPerformanceData format. This is needed until the TorchInductor dashboard
// is migrated to the new format
@@ -43,3 +53,50 @@ export function convertToCompilerPerformanceData(data: BenchmarkData[]) {

return Object.values(convertData);
}

export function computeSpeedup(repoName: string, data: LLMsBenchmarkData[]) {
if (repoName !== TORCHAO_REPO) {
return data;
}

const baselineMetrics: { [key: string]: LLMsBenchmarkData } = {};
data.forEach((r: LLMsBenchmarkData) => {
if (r.dtype !== TORCHAO_BASELINE) {
return;
}

const k = `${r.workflow_id} ${r.job_id} ${r.model} ${r.metric} ${r.device} ${r.arch}`;
baselineMetrics[k] = r;
});

const withSpeedup: LLMsBenchmarkData[] = [];
data.forEach((r: LLMsBenchmarkData) => {
if (r.dtype === TORCHAO_BASELINE) {
return;
}

if (SPEEDUP_METRICS.includes(r.metric)) {
const k = `${r.workflow_id} ${r.job_id} ${r.model} ${r.metric} ${r.device} ${r.arch}`;
if (
k in baselineMetrics &&
baselineMetrics[k].actual !== 0 &&
r.actual !== 0
) {
const speedup = r.metric.includes("time")
? baselineMetrics[k].actual / r.actual
: r.actual / baselineMetrics[k].actual;

withSpeedup.push({
...r,
metric: "speedup",
actual: Number(speedup.toFixed(4)),
target: 0,
});
}
}

withSpeedup.push(r);
});

return withSpeedup;
}
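
A minimal usage sketch of the new `computeSpeedup` helper, with two hypothetical records sharing the same (workflow, job, model, metric, device, arch) key; field values are illustrative and fields not needed here are elided:

```ts
import { LLMsBenchmarkData } from "components/benchmark/llms/common";
import { computeSpeedup, TORCHAO_REPO } from "lib/benchmark/aoUtils";

// Hypothetical noquant baseline record (remaining fields elided).
const baseline = {
  granularity_bucket: "2024-12-28T00:00:00Z", model: "llama-2-7b",
  backend: "", workflow_id: 1, job_id: 1, metric: "tok/s",
  actual: 100, target: 0, dtype: "noquant",
  device: "cuda", arch: "NVIDIA A100",
} as unknown as LLMsBenchmarkData;

// The autoquant run of the same model on the same job and device.
const autoquant = { ...baseline, dtype: "autoquant", actual: 150 };

// The noquant row is dropped; the output is a synthesized record with
// metric "speedup" and actual 1.5 (150 / 100) plus the autoquant row.
// Data from repos other than pytorch/ao passes through unchanged.
const withSpeedup = computeSpeedup(TORCHAO_REPO, [baseline, autoquant]);
```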
4 changes: 4 additions & 0 deletions torchci/lib/benchmark/llmUtils.ts
@@ -118,6 +118,10 @@ export function combineLeftAndRight(
row["metadata"]["r"] ?? (hasR ? record["r"]["job_id"] : undefined);
}

if (dtype !== "") {
row["dtype"] = dtype;
}

if (backend !== "") {
row["backend"] = backend;
}
34 changes: 25 additions & 9 deletions torchci/pages/benchmark/llms.tsx
@@ -21,6 +21,7 @@ import CopyLink from "components/CopyLink";
import GranularityPicker from "components/GranularityPicker";
import { Granularity } from "components/metrics/panels/TimeSeriesPanel";
import dayjs from "dayjs";
import { computeSpeedup, TORCHAO_BASELINE } from "lib/benchmark/aoUtils";
import { useBenchmark } from "lib/benchmark/llmUtils";
import { fetcher } from "lib/GeneralUtils";
import { BranchAndCommit } from "lib/types";
@@ -81,22 +82,29 @@ function Report({
);
}

const lDataWithSpeedup = computeSpeedup(repoName, lData);
const rDataWithSpeedup = computeSpeedup(repoName, rData);

if (repoName === "pytorch/ao") {
metricNames = ["speedup", ...metricNames];
}

return (
<div>
<CommitPanel
repoName={repoName}
lBranchAndCommit={{
...rBranchAndCommit,
date:
rData !== undefined && rData.length !== 0
? rData[0].granularity_bucket
rDataWithSpeedup !== undefined && rDataWithSpeedup.length !== 0
? rDataWithSpeedup[0].granularity_bucket
: undefined,
}}
rBranchAndCommit={{
...lBranchAndCommit,
date:
lData !== undefined && lData.length !== 0
? lData[0].granularity_bucket
lDataWithSpeedup !== undefined && lDataWithSpeedup.length !== 0
? lDataWithSpeedup[0].granularity_bucket
: undefined,
}}
workflowName={"inductor-micro-benchmark"}
@@ -106,6 +114,7 @@
<GraphPanel
queryParams={queryParams}
granularity={granularity}
repoName={repoName}
modelName={modelName}
backendName={backendName}
dtypeName={dtypeName}
@@ -124,11 +133,11 @@
metricNames={metricNames}
lPerfData={{
...lBranchAndCommit,
data: lData,
data: lDataWithSpeedup,
}}
rPerfData={{
...rBranchAndCommit,
data: rData,
data: rDataWithSpeedup,
}}
/>
</div>
@@ -237,7 +246,12 @@ export default function Page() {
const queryName = "oss_ci_benchmark_names";
const queryParams = {
deviceArch: deviceName === DEFAULT_DEVICE_NAME ? "" : deviceName,
dtypes: dtypeName === DEFAULT_DTYPE_NAME ? [] : [dtypeName],
dtypes:
dtypeName === DEFAULT_DTYPE_NAME
? []
: repoName !== "pytorch/ao"
? [dtypeName]
: [dtypeName, TORCHAO_BASELINE],
excludedMetrics: EXCLUDED_METRICS,
benchmarks: REPO_TO_BENCHMARKS[repoName],
granularity: granularity,
@@ -274,7 +288,10 @@
];
const dtypeNames: string[] = _.compact([
DEFAULT_DTYPE_NAME,
...(_.uniq(data.map((r: any) => r.dtype)) as string[]),
..._.filter(
_.uniq(data.map((r: any) => r.dtype)) as string[],
(r: string) => r !== TORCHAO_BASELINE
),
]);
const metricNames: string[] = _.uniq(data.map((r: any) => r.metric));

@@ -372,7 +389,6 @@
useClickHouse={true}
/>
</Stack>

<Report
queryParams={queryParams}
startTime={startTime}
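
Note that the dtype filter above always fetches the `noquant` baseline for pytorch/ao, so `computeSpeedup` has a reference point even when the user selects a single quantization dtype. A condensed restatement of that selection (hypothetical helper name):

```ts
import { TORCHAO_BASELINE } from "lib/benchmark/aoUtils";

// Condensed sketch of the dtypes query-param logic in llms.tsx above.
function dtypesForQuery(
  repoName: string,
  dtypeName: string,
  defaultDtypeName: string
): string[] {
  if (dtypeName === defaultDtypeName) {
    return []; // no dtype filter selected
  }
  // For TorchAO, also fetch the noquant baseline for speedup computation.
  return repoName === "pytorch/ao" ? [dtypeName, TORCHAO_BASELINE] : [dtypeName];
}
```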
