diff --git a/topaz/commands/denoise.py b/topaz/commands/denoise.py
index 5c81d6c..0434929 100644
--- a/topaz/commands/denoise.py
+++ b/topaz/commands/denoise.py
@@ -16,7 +16,11 @@
 from topaz.utils.data.loader import load_image
 from topaz.utils.image import downsample
 import topaz.mrc as mrc
-import topaz.cuda
+import topaz.gpu
+try:
+    import intel_extension_for_pytorch as ipex
+except:
+    pass
 
 name = 'denoise'
 help = 'denoise micrographs with various denoising algorithms'
@@ -265,13 +269,12 @@ def make_hdf5_datasets(path, paired=True, preload=False, holdout=0.1, cutoff=0):
 
 def denoise_image(mic, models, lowpass=1, cutoff=0, gaus=None, inv_gaus=None, deconvolve=False
                  , deconv_patch=1, patch_size=-1, padding=0, normalize=False
-                 , use_cuda=False):
+                 , device='cpu'):
     if lowpass > 1:
         mic = dn.lowpass(mic, lowpass)
 
     mic = torch.from_numpy(mic)
-    if use_cuda:
-        mic = mic.cuda()
+    mic = mic.to(device)
 
     # normalize and remove outliers
     mu = mic.mean()
@@ -316,8 +319,8 @@ def main(args):
     set_num_threads(num_threads)
 
     ## set the device
-    use_cuda = topaz.cuda.set_device(args.device)
-    print('# using device={} with cuda={}'.format(args.device, use_cuda), file=sys.stderr)
+    device = topaz.gpu.set_device(args.device)
+    print('# Using device={} with GPU={}'.format(args.device, device), file=sys.stderr)
 
     cutoff = args.pixel_cutoff # pixel truncation limit
 
@@ -393,8 +396,7 @@ def main(args):
         else:
             raise Exception('Unknown architecture: ' + arch)
 
-        if use_cuda:
-            model = model.cuda()
+        model = model.to(device)
 
         # train
         optim = args.optim
@@ -417,7 +419,7 @@ def main(args):
                                            , criteria=criteria
                                            , num_epochs=num_epochs
                                            , dataset_val=dataset_val
-                                           , use_cuda=use_cuda
+                                           , device=device
                                            , num_workers=num_workers
                                            , shuffle=shuffle
                                            )
@@ -428,7 +430,7 @@ def main(args):
                                             , criteria=criteria
                                             , num_epochs=num_epochs
                                             , dataset_val=dataset_val
-                                            , use_cuda=use_cuda
+                                            , device=device
                                             , num_workers=num_workers
                                             , shuffle=shuffle
                                             )
@@ -446,8 +448,7 @@ def main(args):
                 model.cpu()
                 model.eval()
                 torch.save(model, path)
-                if use_cuda:
-                    model.cuda()
+                model.to(device)
                     
         models = [model]
 
@@ -461,8 +462,7 @@ def main(args):
             model = dn.load_model(arg)
 
             model.eval()
-            if use_cuda:
-                model.cuda()
+            model.to(device)
 
             models.append(model)
 
@@ -481,15 +481,13 @@ def main(args):
     gaus = args.gaussian
     if gaus > 0:
         gaus = dn.GaussianDenoise(gaus)
-        if use_cuda:
-            gaus.cuda()
+        gaus.to(device)
     else:
         gaus = None
     inv_gaus = args.inv_gaussian
     if inv_gaus > 0:
         inv_gaus = dn.InvGaussianFilter(inv_gaus)
-        if use_cuda:
-            inv_gaus.cuda()
+        inv_gaus.to(device)
     else:
         inv_gaus = None
     deconvolve = args.deconvolve
@@ -516,7 +514,7 @@ def main(args):
                                , inv_gaus=inv_gaus, deconvolve=deconvolve
                                , deconv_patch=deconv_patch
                                , patch_size=ps, padding=padding, normalize=normalize
-                               , use_cuda=use_cuda
+                               , device=device
                                )
             denoised[i] = mic
 
@@ -538,7 +536,7 @@ def main(args):
             return
 
         # make the output directory if it doesn't exist
-        if not os.path.exists(args.output):
+        if args.output and (not os.path.exists(args.output)):
             os.makedirs(args.output)
 
         for path in args.micrographs:
@@ -550,7 +548,7 @@ def main(args):
                                , inv_gaus=inv_gaus, deconvolve=deconvolve
                                , deconv_patch=deconv_patch
                                , patch_size=ps, padding=padding, normalize=normalize
-                               , use_cuda=use_cuda
+                               , device=device
                                )
 
             # write the micrograph
diff --git a/topaz/commands/denoise3d.py b/topaz/commands/denoise3d.py
index b47cb47..8e7077b 100644
--- a/topaz/commands/denoise3d.py
+++ b/topaz/commands/denoise3d.py
@@ -19,7 +19,11 @@
 from topaz.utils.data.loader import load_image
 from topaz.utils.image import downsample
 import topaz.mrc as mrc
-import topaz.cuda
+import topaz.gpu
+try:
+    import intel_extension_for_pytorch as ipex
+except:
+    pass
 
 from topaz.denoise import UDenoiseNet3D
 from topaz.filters import GaussianDenoise
@@ -73,19 +77,20 @@ def add_arguments(parser=None):
 
     return parser
 
-def train_epoch(iterator, model, cost_func, optim, epoch=1, num_epochs=1, N=1, use_cuda=False):
+def train_epoch(iterator, model, cost_func, optim, epoch=1, num_epochs=1, N=1, device='cpu'):
     
     c = 0
     loss_accum = 0    
     model.train()
+#    if 'ipex' in globals():
+#        model, optim = ipex.optimize(model, optimizer=optim)
 
     for batch_idx , (source,target), in enumerate(iterator):
         
         b = source.size(0)        
         loss_mb = 0
-        if use_cuda:
-            source = source.cuda()
-            target = target.cuda()
+        source = source.to(device)
+        target = target.to(device)
             
         denoised_source = model(source)
         loss = cost_func(denoised_source,target)
@@ -108,7 +113,7 @@ def train_epoch(iterator, model, cost_func, optim, epoch=1, num_epochs=1, N=1, u
     return loss_accum
 
 
-def eval_model(iterator, model, cost_func, epoch=1, num_epochs=1, N=1, use_cuda=False):
+def eval_model(iterator, model, cost_func, epoch=1, num_epochs=1, N=1, device='cpu'):
     
     c = 0
     loss_accum = 0
@@ -119,9 +124,8 @@ def eval_model(iterator, model, cost_func, epoch=1, num_epochs=1, N=1, use_cuda=
             
             b = source.size(0)        
             loss_mb = 0
-            if use_cuda:
-                source = source.cuda()
-                target = target.cuda()
+            source = source.to(device)
+            target = target.to(device)
                 
             denoised_source = model(source)
             loss = cost_func(denoised_source,target)
@@ -408,7 +412,7 @@ def train_model(even_path, odd_path, save_prefix, save_interval, device
     # initialize the model
     print('# initializing model...', file=log)
     model_base = UDenoiseNet3D(base_width=base_kernel_width)
-    model,use_cuda,num_devices = set_device(model_base, device)
+    model,use_device,num_devices = set_device(model_base, device)
     
     if cost_func == 'L2':
         cost_func = nn.MSELoss()
@@ -469,7 +473,7 @@ def train_model(even_path, odd_path, save_prefix, save_interval, device
                                        epoch=epoch,
                                        num_epochs=num_epochs,
                                        N=N_train,
-                                       use_cuda=use_cuda)
+                                       device=use_device)
 
         line = '\t'.join([str(epoch+1), 'train', str(epoch_loss_accum)])
         print(line, file=output)
@@ -482,7 +486,7 @@ def train_model(even_path, odd_path, save_prefix, save_interval, device
                                    epoch=epoch,
                                    num_epochs=num_epochs,
                                    N=N_test,
-                                   use_cuda=use_cuda)
+                                   device=use_device)
     
         line = '\t'.join([str(epoch+1), 'test', str(epoch_loss_accum)])
         print(line, file=output)
@@ -491,8 +495,7 @@ def train_model(even_path, odd_path, save_prefix, save_interval, device
         if save_prefix is not None and (epoch+1)%save_interval == 0:
             model.eval().cpu()
             save_model(model, epoch+1, save_prefix, digits=digits)
-            if use_cuda:
-                model.cuda()
+            model.to(use_device)
 
     print('# training completed!', file=log)
 
@@ -557,17 +560,29 @@ def load_model(path, base_kernel_width=11):
 def set_device(model, device, log=sys.stderr):
     # set the device or devices
     d = device
-    use_cuda = (d != -1) and torch.cuda.is_available()
+    use_device = 'cpu'
+    if d != -1:
+        if torch.cuda.is_available():
+            import torch.cuda as acc
+            use_device = 'cuda'
+        elif hasattr(torch,'xpu'):
+            if torch.xpu.is_available():
+                import torch.xpu as acc
+                use_device = 'xpu'
+            else:
+                import torch.cpu as acc
+        else:
+            import torch.cpu as acc
     num_devices = 1
-    if use_cuda:
-        device_count = torch.cuda.device_count()
+    if use_device != 'cpu':
+        device_count = acc.device_count()
         try:
             if d >= 0:
                 assert d < device_count
-                torch.cuda.set_device(d)
-                print('# using CUDA device:', d, file=log)
+                acc.set_device(d)
+                print('# using GPU device:', d, file=log)
             elif d == -2:
-                print('# using all available CUDA devices:', device_count, file=log)
+                print('# using all available GPU devices:', device_count, file=log)
                 num_devices = device_count
                 model = nn.DataParallel(model)
             else:
@@ -579,10 +594,9 @@ def set_device(model, device, log=sys.stderr):
             print('ERROR: Something went wrong with setting the compute device', file=log)
             sys.exit(2)
 
-    if use_cuda:
-        model.cuda()
+        model.to(use_device)
 
-    return model, use_cuda, num_devices
+    return model, use_device, num_devices
 
 
 class PatchDataset:
@@ -756,7 +770,9 @@ def main(args):
             model = nn.Sequential(model, GaussianDenoise(gaussian_sigma, dims=3))
         model.eval()
         
-        model, use_cuda, num_devices = set_device(model, args.device)
+        model, use_device, num_devices = set_device(model, args.device)
+#        if 'ipex' in globals():
+#            model = ipex.optimize(model)
 
         #batch_size = args.batch_size
         #batch_size *= num_devices
@@ -783,4 +799,4 @@ def main(args):
 if __name__ == '__main__':
     parser = add_arguments()
     args = parser.parse_args()
-    main(args)
\ No newline at end of file
+    main(args)
diff --git a/topaz/commands/extract.py b/topaz/commands/extract.py
index fdd13ff..0cd34ec 100644
--- a/topaz/commands/extract.py
+++ b/topaz/commands/extract.py
@@ -18,7 +18,11 @@
 from topaz.algorithms import non_maximum_suppression, match_coordinates
 from topaz.metrics import average_precision
 import topaz.predict
-import topaz.cuda
+import topaz.gpu
+try:
+    import intel_extension_for_pytorch as ipex
+except:
+    pass
 
 name = 'extract'
 help = 'extract particles from segmented images or segment and extract in one step with a trained classifier'
@@ -187,15 +191,16 @@ def stream_images(paths):
 def score_images(model, paths, device=-1, batch_size=1):
     if model is not None and model != 'none': # score each image with the model
         ## set the device
-        use_cuda = topaz.cuda.set_device(device)
+        device = topaz.gpu.set_device(device)
         ## load the model
         from topaz.model.factory import load_model
         model = load_model(model)
         model.eval()
         model.fill()
-        if use_cuda:
-            model.cuda()
-        scores = topaz.predict.score_stream(model, stream_images(paths), use_cuda=use_cuda
+        model.to(device)
+#        if 'ipex' in globals():
+#            model = ipex.optimize(model)
+        scores = topaz.predict.score_stream(model, stream_images(paths), device=device
                                            , batch_size=batch_size)
     else: # load scores directly
         scores = stream_images(paths)
@@ -311,4 +316,4 @@ def main(args):
 if __name__ == '__main__':
     parser = add_arguments()
     args = parser.parse_args()
-    main(args)
\ No newline at end of file
+    main(args)
diff --git a/topaz/commands/normalize.py b/topaz/commands/normalize.py
index 0fd5caa..7ff9fbc 100644
--- a/topaz/commands/normalize.py
+++ b/topaz/commands/normalize.py
@@ -12,7 +12,11 @@
 from topaz.stats import normalize
 from topaz.utils.data.loader import load_image
 from topaz.utils.image import downsample, save_image
-import topaz.cuda
+import topaz.gpu
+try:
+    import intel_extension_for_pytorch as ipex
+except:
+    pass
 
 name = 'normalize'
 help = 'normalize a set of images using the 2-component Gaussian mixture model'
@@ -48,7 +52,7 @@ def add_arguments(parser=None):
 
 class Normalize:
     def __init__(self, dest, scale, affine, num_iters, alpha, beta
-                , sample, metadata, formats, use_cuda):
+                , sample, metadata, formats, device):
         self.dest = dest
         self.scale = scale
         self.affine = affine
@@ -58,7 +62,7 @@ def __init__(self, dest, scale, affine, num_iters, alpha, beta
         self.sample = sample
         self.metadata = metadata
         self.formats = formats
-        self.use_cuda = use_cuda
+        self.device = device
 
     def __call__(self, path):
         # load the image
@@ -72,7 +76,7 @@ def __call__(self, path):
         if self.affine:
             method = 'affine'
         x,metadata = normalize(x, alpha=self.alpha, beta=self.beta, num_iters=self.num_iters
-                              , method=method, sample=self.sample, use_cuda=self.use_cuda)
+                              , method=method, sample=self.sample, device=self.device)
 
         # save the image and the metadata
         name,_ = os.path.splitext(os.path.basename(path))
@@ -116,9 +120,9 @@ def main(args):
     from topaz.torch import set_num_threads
     set_num_threads(num_threads)
 
-    # set CUDA device
-    use_cuda = topaz.cuda.set_device(args.device)
-    if use_cuda:
+    # set GPU device
+    device = topaz.gpu.set_device(args.device)
+    if device != 'cpu':
         # when using GPU, turn off multiple processes
         num_workers = 0
 
@@ -126,7 +130,7 @@ def main(args):
         os.makedirs(dest)
 
     process = Normalize(dest, scale, affine, num_iters, alpha, beta
-                       , sample, metadata, formats, use_cuda)
+                       , sample, metadata, formats, device)
 
     if num_workers > 1:
         pool = mp.Pool(num_workers)
diff --git a/topaz/commands/segment.py b/topaz/commands/segment.py
index 66471d3..ff4762e 100644
--- a/topaz/commands/segment.py
+++ b/topaz/commands/segment.py
@@ -12,7 +12,11 @@
 import torch
 
 from topaz.utils.data.loader import load_image
-import topaz.cuda
+import topaz.gpu
+try:
+    import intel_extension_for_pytorch as ipex
+except:
+    pass
 
 name = 'segment'
 help = 'segment images using a trained region classifier'
@@ -43,7 +47,7 @@ def main(args):
     set_num_threads(num_threads)
 
     ## set the device
-    use_cuda = topaz.cuda.set_device(args.device)
+    device = topaz.gpu.set_device(args.device)
 
     ## load the model
     from topaz.model.factory import load_model
@@ -51,8 +55,9 @@ def main(args):
     model.eval()
     model.fill()
 
-    if use_cuda:
-        model.cuda()
+    model.to(device)
+#    if 'ipex' in globals():
+#        model = ipex.optimize(model)
 
     ## make output directory if doesn't exist
     destdir = args.destdir 
@@ -68,8 +73,7 @@ def main(args):
         ## process image with the model
         with torch.no_grad():
             X = torch.from_numpy(np.array(image, copy=False)).unsqueeze(0).unsqueeze(0)
-            if use_cuda:
-                X = X.cuda()
+            X = X.to(device)
             score = model(X).data[0,0].cpu().numpy()
         
         im = Image.fromarray(score) 
@@ -83,4 +87,4 @@ def main(args):
 if __name__ == '__main__':
     parser = add_arguments()
     args = parser.parse_args()
-    main(args)
\ No newline at end of file
+    main(args)
diff --git a/topaz/commands/train.py b/topaz/commands/train.py
index 7b83596..1acaabe 100644
--- a/topaz/commands/train.py
+++ b/topaz/commands/train.py
@@ -19,7 +19,11 @@
 from topaz.utils.printing import report
 from topaz.utils.data.loader import load_images_from_list
 from topaz.utils.data.coordinates import match_coordinates_to_images
-import topaz.cuda
+import topaz.gpu
+try:
+    import intel_extension_for_pytorch as ipex
+except:
+    pass
 
 name = 'train'
 help = 'train region classifier from images with labeled coordinates'
@@ -508,7 +512,7 @@ def make_data_iterators(train_images, train_targets, test_images, test_targets
     return train_iterator, test_iterator
 
 
-def evaluate_model(classifier, criteria, data_iterator, use_cuda=False):
+def evaluate_model(classifier, criteria, data_iterator, device='cpu'):
     from topaz.metrics import average_precision
 
     classifier.eval()
@@ -523,9 +527,8 @@ def evaluate_model(classifier, criteria, data_iterator, use_cuda=False):
         for X,Y in data_iterator:
             Y = Y.view(-1)
             Y_true.append(Y.numpy())
-            if use_cuda:
-                X = X.cuda()
-                Y = Y.cuda()
+            X = X.to(device)
+            Y = Y.to(device)
 
             score = classifier(X).view(-1)
 
@@ -551,12 +554,11 @@ def evaluate_model(classifier, criteria, data_iterator, use_cuda=False):
     return loss, precision, tpr, fpr, auprc
 
 
-def fit_epoch(step_method, data_iterator, epoch=1, it=1, use_cuda=False, output=sys.stdout):
+def fit_epoch(step_method, data_iterator, epoch=1, it=1, device='cpu', output=sys.stdout):
     for X,Y in data_iterator:
         Y = Y.view(-1)
-        if use_cuda:
-            X = X.cuda()
-            Y = Y.cuda()
+        X = X.to(device)
+        Y = Y.to(device)
         metrics = step_method.step(X, Y)
         line = '\t'.join([str(epoch), str(it), 'train'] + [str(metric) for metric in metrics] + ['-'])
         print(line, file=output)
@@ -566,7 +568,7 @@ def fit_epoch(step_method, data_iterator, epoch=1, it=1, use_cuda=False, output=
 
 
 def fit_epochs(classifier, criteria, step_method, train_iterator, test_iterator, num_epochs
-              , save_prefix=None, use_cuda=False, output=sys.stdout):
+              , save_prefix=None, device='cpu', output=sys.stdout):
     ## fit the model, report train/test stats, save model if required
     header = step_method.header
     line = '\t'.join(['epoch', 'iter', 'split'] + header + ['auprc'])
@@ -576,13 +578,15 @@ def fit_epochs(classifier, criteria, step_method, train_iterator, test_iterator,
     for epoch in range(1,num_epochs+1):
         ## update the model
         classifier.train()
+#        if 'ipex' in globals():
+#            classifier, step_method.optim = ipex.optimize(classifier, optimizer=step_method.optim)
         it = fit_epoch(step_method, train_iterator, epoch=epoch, it=it
-                      , use_cuda=use_cuda, output=output)
+                      , device=device, output=output)
 
         ## measure validation performance
         if test_iterator is not None:
             loss,precision,tpr,fpr,auprc = evaluate_model(classifier, criteria, test_iterator
-                                                         , use_cuda=use_cuda)
+                                                         , device=device)
             line = '\t'.join([str(epoch), str(it), 'test', str(loss)] + ['-']*(len(header)-4) + [str(precision), str(tpr), str(fpr), str(auprc)])
             print(line, file=output)
             output.flush()
@@ -594,8 +598,7 @@ def fit_epochs(classifier, criteria, step_method, train_iterator, test_iterator,
             path = prefix + ('_epoch{:0'+str(digits)+'}.sav').format(epoch) 
             classifier.cpu()
             torch.save(classifier, path)
-            if use_cuda:
-                classifier.cuda()
+            classifier.to(device)
 
 
 def main(args):
@@ -614,20 +617,18 @@ def main(args):
 
     ## set the device
     """
-    use_cuda = False
     if args.device >= 0:
-        use_cuda = torch.cuda.is_available()
-        if use_cuda:
-            torch.cuda.set_device(args.device)
+        use_gpu = torch.[cuda|xpu].is_available()
+        if use_gpu:
+            torch.[cuda|xpu].set_device(args.device)
         else:
-            print('WARNING: you specified GPU (device={}) but no GPUs were detected. This may mean there is a mismatch between your system CUDA version and your pytorch CUDA version.'.format(args.device), file=sys.stderr)
+            print('WARNING: you specified GPU (device={}) but no GPUs were detected. This may mean there is a mismatch between your system GPU driver/runtime version (CUDA or XPU) and the version your pytorch build was compiled against.'.format(args.device), file=sys.stderr)
     """
 
-    use_cuda = topaz.cuda.set_device(args.device)
-    report('Using device={} with cuda={}'.format(args.device, use_cuda))
+    device = topaz.gpu.set_device(args.device)
+    report('Using device={} with GPU={}'.format(args.device, device))
 
-    if use_cuda:
-        classifier.cuda()
+    classifier.to(device)
     
     ## load the data
     radius = args.radius # number of pixels around coordinates to label as positive
@@ -695,7 +696,7 @@ def main(args):
     #if not os.path.exists(os.path.dirname(save_prefix)):
     #    os.makedirs(os.path.dirname(save_prefix))
     fit_epochs(classifier, criteria, trainer, train_iterator, test_iterator, args.num_epochs
-              , save_prefix=save_prefix, use_cuda=use_cuda, output=output)
+              , save_prefix=save_prefix, device=device, output=output)
 
     report('Done!')
 
diff --git a/topaz/denoise.py b/topaz/denoise.py
index ba458b4..4deaf4f 100644
--- a/topaz/denoise.py
+++ b/topaz/denoise.py
@@ -11,6 +11,10 @@
 
 from topaz.utils.data.loader import load_image
 from topaz.filters import AffineFilter, AffineDenoise, GaussianDenoise, gaussian_filter, inverse_filter
+try:
+    import intel_extension_for_pytorch as ipex
+except:
+    pass
 
 
 def load_model(name):
@@ -100,15 +104,14 @@ def denoise_patches(model, x, patch_size, padding=128):
 
     return y
 
-def denoise_stack(model, stack, batch_size=20, use_cuda=False):
+def denoise_stack(model, stack, batch_size=20, device='cpu'):
     denoised = np.zeros_like(stack)
     with torch.no_grad():
         stack = torch.from_numpy(stack).float()
 
         for i in range(0, len(stack), batch_size):
             x = stack[i:i+batch_size]
-            if use_cuda:
-                x = x.cuda()
+            x = x.to(device)
             mu = x.view(x.size(0), -1).mean(1)
             std = x.view(x.size(0), -1).std(1)
             x = (x - mu.unsqueeze(1).unsqueeze(2))/std.unsqueeze(1).unsqueeze(2)
@@ -1017,7 +1020,7 @@ def __call__(self, x, y):
 
 
 def eval_noise2noise(model, dataset, criteria, batch_size=10
-                    , use_cuda=False, num_workers=0):
+                    , device='cpu', num_workers=0):
     data_iterator = torch.utils.data.DataLoader(dataset, batch_size=batch_size
                                                , num_workers=num_workers)
 
@@ -1025,12 +1028,13 @@ def eval_noise2noise(model, dataset, criteria, batch_size=10
     loss = 0
 
     model.eval()
+#    if 'ipex' in globals():
+#        model = ipex.optimize(model)
         
     with torch.no_grad():
         for x1,x2 in data_iterator:
-            if use_cuda:
-                x1 = x1.cuda()
-                x2 = x2.cuda()
+            x1 = x1.to(device)
+            x2 = x2.to(device)
 
             x1 = x1.unsqueeze(1)
             y = model(x1).squeeze(1)
@@ -1047,7 +1051,7 @@ def eval_noise2noise(model, dataset, criteria, batch_size=10
 
 def train_noise2noise(model, dataset, lr=0.001, optim='adagrad', batch_size=10, num_epochs=100
                      , criteria=nn.MSELoss(), dataset_val=None
-                     , use_cuda=False, num_workers=0, shuffle=True):
+                     , device='cpu', num_workers=0, shuffle=True):
 
     gamma = None
     if criteria == 'L0':
@@ -1072,6 +1076,8 @@ def train_noise2noise(model, dataset, lr=0.001, optim='adagrad', batch_size=10,
 
     for epoch in range(1, num_epochs+1):
         model.train()
+#        if 'ipex' in globals():
+#            model, optim = ipex.optimize(model, optimizer=optim)
         
         n = 0
         loss_accum = 0
@@ -1081,9 +1087,8 @@ def train_noise2noise(model, dataset, lr=0.001, optim='adagrad', batch_size=10,
             criteria.gamma = 2 - (epoch-1)*2/num_epochs
 
         for x1,x2 in data_iterator:
-            if use_cuda:
-                x1 = x1.cuda()
-                x2 = x2.cuda()
+            x1 = x1.to(device)
+            x2 = x2.to(device)
 
             x1 = x1.unsqueeze(1)
             y = model(x1).squeeze(1)
@@ -1109,7 +1114,7 @@ def train_noise2noise(model, dataset, lr=0.001, optim='adagrad', batch_size=10,
             loss_val = eval_noise2noise(model, dataset_val, criteria
                                        , batch_size=batch_size
                                        , num_workers=num_workers
-                                       , use_cuda=use_cuda
+                                       , device=device
                                        )
             yield epoch, loss_accum, loss_val
         else:
@@ -1117,7 +1122,7 @@ def train_noise2noise(model, dataset, lr=0.001, optim='adagrad', batch_size=10,
 
 
 def eval_mask_denoise(model, dataset, criteria, p=0.01 # masking rate
-                     , batch_size=10, use_cuda=False, num_workers=0):
+                     , batch_size=10, device='cpu', num_workers=0):
     data_iterator = torch.utils.data.DataLoader(dataset, batch_size=batch_size
                                                , num_workers=num_workers)
 
@@ -1125,6 +1130,8 @@ def eval_mask_denoise(model, dataset, criteria, p=0.01 # masking rate
     loss = 0
 
     model.eval()
+#    if 'ipex' in globals():
+#        model = ipex.optimize(model)
         
     with torch.no_grad():
         for x in data_iterator:
@@ -1132,10 +1139,9 @@ def eval_mask_denoise(model, dataset, criteria, p=0.01 # masking rate
             mask = (torch.rand(x.size()) < p)
             r = torch.randn(x.size())
 
-            if use_cuda:
-                x = x.cuda()
-                mask = mask.cuda()
-                r = r.cuda()
+            x = x.to(device)
+            mask = mask.to(device)
+            r = r.to(device)
 
             # mask out x by replacing from N(0,1)
             x_ = mask.float()*r + (1-mask.float())*x
@@ -1159,7 +1165,7 @@ def eval_mask_denoise(model, dataset, criteria, p=0.01 # masking rate
 
 def train_mask_denoise(model, dataset, p=0.01, lr=0.001, optim='adagrad', batch_size=10, num_epochs=100
                       , criteria=nn.MSELoss(), dataset_val=None
-                      , use_cuda=False, num_workers=0, shuffle=True):
+                      , device='cpu', num_workers=0, shuffle=True):
 
     gamma = None
     if criteria == 'L0':
@@ -1184,6 +1190,8 @@ def train_mask_denoise(model, dataset, p=0.01, lr=0.001, optim='adagrad', batch_
 
     for epoch in range(1, num_epochs+1):
         model.train()
+#        if 'ipex' in globals():
+#            model, optim = ipex.optimize(model, optimizer=optim)
         
         n = 0
         loss_accum = 0
@@ -1199,10 +1207,9 @@ def train_mask_denoise(model, dataset, p=0.01, lr=0.001, optim='adagrad', batch_
             mask = (torch.rand(x.size()) < p)
             r = torch.randn(x.size())
 
-            if use_cuda:
-                x = x.cuda()
-                mask = mask.cuda()
-                r = r.cuda()
+            x = x.to(device)
+            mask = mask.to(device)
+            r = r.to(device)
 
             # mask out x by replacing from N(0,1)
             x_ = mask.float()*r + (1-mask.float())*x
@@ -1233,7 +1240,7 @@ def train_mask_denoise(model, dataset, p=0.01, lr=0.001, optim='adagrad', batch_
             loss_val = eval_mask_denoise(model, dataset_val, criteria, p=p
                                         , batch_size=batch_size
                                         , num_workers=num_workers
-                                        , use_cuda=use_cuda
+                                        , device=device
                                         )
             yield epoch, loss_accum, loss_val
         else:
@@ -1266,18 +1273,16 @@ def lowpass(x, factor=1, dims=2):
     return f
 
 
-def gaussian(x, sigma=1, scale=5, use_cuda=False, dims=2):
+def gaussian(x, sigma=1, scale=5, device='cpu', dims=2):
     """
     Apply Gaussian filter with sigma to image. Truncates the kernel at scale times sigma pixels
     """
 
     f = GaussianDenoise(sigma, scale=scale, dims=dims)
-    if use_cuda:
-        f.cuda()
+    f.to(device)
 
     with torch.no_grad():
         x = torch.from_numpy(x).unsqueeze(0).unsqueeze(0)
-        if use_cuda:
-            x = x.cuda()
+        x = x.to(device)
         y = f(x).squeeze().cpu().numpy()
-    return y
\ No newline at end of file
+    return y
diff --git a/topaz/cuda.py b/topaz/gpu.py
similarity index 50%
rename from topaz/cuda.py
rename to topaz/gpu.py
index b9cfd65..72aab2b 100644
--- a/topaz/cuda.py
+++ b/topaz/gpu.py
@@ -2,6 +2,10 @@
 
 import warnings
 import torch
+try:
+    import intel_extension_for_pytorch as ipex
+except:
+    pass
 
 def _format(message, category, filename, lineno, line=None):
     w = '{}: {}\n'.format(category.__name__, message)
@@ -9,16 +13,26 @@ def _format(message, category, filename, lineno, line=None):
 warnings.formatwarning = _format
 
 
-class CudaWarning(UserWarning):
+class GpuWarning(UserWarning):
     pass
 
 
 def set_device(device, error=False, warn=True):
-    use_cuda = False
+    use_device = 'cpu'
     if device >= 0: # try to set GPU when device >= 0
-        use_cuda = torch.cuda.is_available()
+        if torch.cuda.is_available():
+            import torch.cuda as acc
+            use_device = 'cuda'
+        elif hasattr(torch,'xpu'):
+            if torch.xpu.is_available():
+                import torch.xpu as acc
+                use_device = 'xpu'
+            else:
+                import torch.cpu as acc
+        else:
+            import torch.cpu as acc
         try:
-            torch.cuda.set_device(device)
+            acc.set_device(device)
         except Exception as e:
             ## setting the device failed
             if error:
@@ -26,7 +40,8 @@ def set_device(device, error=False, warn=True):
             if warn:
                 # warn the user
                 message = str(e) + '\nFalling back to CPU.'
-                warnings.warn(message, CudaWarning)
+                warnings.warn(message, GpuWarning)
             # fallback to CPU
-            use_cuda = False
-    return use_cuda
+            use_device = 'cpu'
+    return use_device
+
diff --git a/topaz/methods.py b/topaz/methods.py
index 7e72c91..1f5432f 100644
--- a/topaz/methods.py
+++ b/topaz/methods.py
@@ -7,6 +7,10 @@
 import torch.nn as nn
 import torch.nn.functional as F
 from torch.autograd import Variable
+try:
+    import intel_extension_for_pytorch as ipex
+except:
+    pass
 
 def autoencoder_loss(model, X):
     X = X.unsqueeze(1)
@@ -123,8 +127,7 @@ def step(self, X, Y):
         ## KL of w from the binomial distribution with pi
         log_binom = scipy.stats.binom.logpmf(np.arange(0,N+1),N,self.pi)
         log_binom = torch.from_numpy(log_binom).float()
-        if q_var.is_cuda:
-            log_binom = log_binom.cuda()
+        log_binom = log_binom.to(q_var.device)
         log_binom = Variable(log_binom)
 
         ge_penalty = -torch.sum(log_binom*q_discrete)
diff --git a/topaz/predict.py b/topaz/predict.py
index 580209f..816bc5a 100644
--- a/topaz/predict.py
+++ b/topaz/predict.py
@@ -1,7 +1,10 @@
 from __future__ import absolute_import, print_function, division
 
 import torch
-
+try:
+    import intel_extension_for_pytorch as ipex
+except:
+    pass
 
 def batches(X, batch_size=1):
     batch = []
@@ -16,20 +19,19 @@ def batches(X, batch_size=1):
         yield batch
 
 
-def score_stream(model, images, use_cuda=False, batch_size=1):
+def score_stream(model, images, device='cpu', batch_size=1):
     with torch.no_grad():
         for x in batches(images, batch_size=batch_size):
             x = x.unsqueeze(1)
-            if use_cuda:
-                x = x.cuda()
+            x = x.to(device)
             logits = model(x).squeeze(1).cpu().numpy()
             for i in range(len(logits)):
                 yield logits[i]
 
 
-def score(model, images, use_cuda=False, batch_size=1):
+def score(model, images, device='cpu', batch_size=1):
     scores = []
-    for y in score_stream(model, images, use_cuda=use_cuda, batch_size=batch_size):
+    for y in score_stream(model, images, device=device, batch_size=batch_size):
         scores.append(y)
     return scores
 
diff --git a/topaz/stats.py b/topaz/stats.py
index 6299adf..1c482f0 100644
--- a/topaz/stats.py
+++ b/topaz/stats.py
@@ -4,10 +4,13 @@
 import scipy.stats
 
 import torch
-
+try:
+    import intel_extension_for_pytorch as ipex
+except:
+    pass
 
 def normalize(x, alpha=900, beta=1, num_iters=100, sample=1
-             , method='gmm', use_cuda=False, verbose=False):
+             , method='gmm', device='cpu', verbose=False):
     if method == 'affine':
         mu = x.mean()
         std = x.std()
@@ -32,7 +35,7 @@ def normalize(x, alpha=900, beta=1, num_iters=100, sample=1
 
     mu, std, pi, logp, mus, stds, pis, logps = norm_fit(x_sample, alpha=alpha, beta=beta
                                                        , scale=scale
-                                                       , num_iters=num_iters, use_cuda=use_cuda
+                                                       , num_iters=num_iters, device=device
                                                        , verbose=verbose)
 
     # normalize the data
@@ -57,7 +60,7 @@ def normalize(x, alpha=900, beta=1, num_iters=100, sample=1
 
 
 def norm_fit(x, alpha=900, beta=1, scale=1
-            , num_iters=100, use_cuda=False, verbose=False):
+            , num_iters=100, device='cpu', verbose=False):
     
     # try multiple initializations of pi
     pis = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.98, 1])
@@ -68,8 +71,7 @@ def norm_fit(x, alpha=900, beta=1, scale=1
     stds = np.zeros(len(pis))
 
     x = torch.from_numpy(x)
-    if use_cuda:
-        x = x.cuda()
+    x = x.to(device)
     
     for i in range(len(pis)):
         pi = pis[i]