SqueezeAILab · abhilash1910 · Sep 6, 2024
diff --git a/llama.py b/llama.py
@@ -12,8 +12,13 @@
     get_embedding,
     get_norm,
 )
-
-
+try:  # optional; per the message below, PyTorch>=2.4 or IPEX provides XPU support
+    import intel_extension_for_pytorch  # noqa: F401
+except Exception:
+    print("Building SYCL kernels requires either Pytorch>=2.4 or installation of IPEX to run on Intel GPUs")
+def is_xpu_available():
+    """Return True if an Intel XPU is usable; safe to call when IPEX is missing."""
+    return hasattr(torch, "xpu") and torch.xpu.is_available()
 def get_model(model):
     import torch
 
@@ -78,7 +83,10 @@ def forward(self, inp, **kwargs):
     layers[0] = layers[0].cpu()
     for i in range(len(embeddings)):
         embeddings[i] = embeddings[i].cpu()
-    torch.cuda.empty_cache()
+    # The first layer and the embeddings were just moved back to the CPU, so
+    # release the cached device memory on whichever backend is in use.
+    accel = torch.xpu if is_xpu_available() else torch.cuda
+    accel.empty_cache()
 
     outs = torch.zeros_like(inps)
     attention_mask = cache["attention_mask"]
@@ -103,7 +111,10 @@ def forward(self, inp, **kwargs):
                 )[0]
         layers[i] = layer.cpu()
         del layer
-        torch.cuda.empty_cache()
+        # The layer is back on the CPU; free cached memory on the active
+        # backend. The probe is called: a bare function is always truthy.
+        accel = torch.xpu if is_xpu_available() else torch.cuda
+        accel.empty_cache()
         inps, outs = outs, inps
 
     norm = get_norm(model, model_type)
@@ -192,7 +203,10 @@ def benchmark(model, input_ids, check=False):
     layers = get_layers(model, model_type)
 
     input_ids = input_ids.to(model.gpus[0] if hasattr(model, "gpus") else DEV)
-    torch.cuda.synchronize()
+    # Drain pending device work (e.g. the input copy above) before timing.
+    # Synchronizes the backend's current device, as the pre-XPU code did.
+    accel = torch.xpu if is_xpu_available() else torch.cuda
+    accel.synchronize()
 
     cache = {"past": None}
 
@@ -215,9 +229,16 @@ def tmp(layer, inp, out):
     def sync():
+        """Block until pending device work is done so timings are accurate."""
         if hasattr(model, "gpus"):
             for gpu in model.gpus:
-                torch.cuda.synchronize(gpu)
+                # NOTE(review): the XPU call ignores `gpu` and only syncs the
+                # current device -- confirm whether it should be passed.
+                if is_xpu_available():
+                    torch.xpu.synchronize()
+                else:
+                    torch.cuda.synchronize(gpu)
         else:
-            torch.cuda.synchronize()
+            accel = torch.xpu if is_xpu_available() else torch.cuda
+            accel.synchronize()
 
     max_memory = 0
     with torch.no_grad():
@@ -233,7 +254,10 @@ def sync():
             sync()
             times.append(time.time() - tick)
             print(i, times[-1])
-            max_memory = max(max_memory, torch.cuda.memory_allocated() / 1024 / 1024)
+            # Track the peak allocated memory on the active backend; dividing
+            # by 1024 twice converts the byte count to MiB.
+            accel = torch.xpu if is_xpu_available() else torch.cuda
+            max_memory = max(max_memory, accel.memory_allocated() / 1024 / 1024)
             if check and i != input_ids.numel() - 1:
                 tot += loss(
                     out.logits[0].to(DEV), input_ids[:, (i + 1)].to(DEV)
@@ -252,7 +276,7 @@ def sync():
 if __name__ == "__main__":
     import argparse
     from squeezellm.datautils import *
-
+
     parser = argparse.ArgumentParser()
 
     parser.add_argument("model", type=str, help="llama model to load")
@@ -304,8 +328,10 @@ def sync():
         default=10,
         help="Number of dense channel used for hybrid kernel.",
     )
-
-    DEV = torch.device("cuda:0")
+    # Default device for the run: an Intel XPU when one is usable, otherwise
+    # the first CUDA GPU. The availability probe has to be called here -- a
+    # bare function object is always truthy and would always select "xpu".
+    DEV = torch.device("xpu" if is_xpu_available() else "cuda:0")
 
     args = parser.parse_args()
 
@@ -344,7 +370,9 @@ def sync():
                 with torch.profiler.profile(
                     activities=[
                         torch.profiler.ProfilerActivity.CPU,
-                        torch.profiler.ProfilerActivity.CUDA,
+                        # NOTE(review): still CUDA-only; profiling on XPU may
+                        # need ProfilerActivity.XPU -- confirm before relying on it.
+                        torch.profiler.ProfilerActivity.CUDA,
                     ]
                 ) as p:
                     benchmark(model, input_ids, check=args.check)

diff --git a/squeezellm/quant.py b/squeezellm/quant.py
@@ -2,8 +2,15 @@
 import torch
 import torch.nn as nn
 import math
-import quant_cuda
+try:
+    import intel_extension_for_pytorch  # noqa: F401
+except Exception:
+    print("Building SYCL kernels requires either Pytorch>=2.4 or installation of IPEX to run on Intel GPUs")
+import quant_cuda
 
+def is_xpu_available():
+    """Return True if an Intel XPU is usable; safe to call when IPEX is missing."""
+    return hasattr(torch, "xpu") and torch.xpu.is_available()
 
 def round_to_nearest_pole_sim(w, poles):
     """
@@ -215,7 +222,10 @@ def forward(self, x):
                 y = self.bias.clone()
                 outshape[-1] = self.bias.numel()
             else:
-                y = torch.zeros((self.outfeatures), device="cuda", dtype=torch.float32)
+                # No bias: start from a zeroed float32 buffer on the active
+                # backend. The probe is called; a bare function is always truthy.
+                alloc_device = "xpu" if is_xpu_available() else "cuda"
+                y = torch.zeros((self.outfeatures), device=alloc_device, dtype=torch.float32)
                 outshape[-1] = self.outfeatures
             dtype = x.dtype
 
@@ -313,9 +323,14 @@ def forward(self, x):
         else:
             out_shape = x.shape[:-1] + (self.outfeatures,)
             x = x.reshape(-1, x.shape[-1])
-            out = torch.zeros(
-                (x.shape[0], self.outfeatures), device="cuda", dtype=torch.float32
-            )
+            # Zeroed float32 output buffer, one row per row of the flattened
+            # input, allocated on the active backend (XPU if usable, else CUDA).
+            alloc_device = "xpu" if is_xpu_available() else "cuda"
+            out = torch.zeros(
+                (x.shape[0], self.outfeatures),
+                device=alloc_device,
+                dtype=torch.float32,
+            )
             dtype = x.dtype
             if self.bits == 3:
                 x = x.float()