diff --git a/.github/workflows/build_and_test_compiler_zoo.yml b/.github/workflows/build_and_test_compiler_zoo.yml index 430e3f37..38ea93ae 100644 --- a/.github/workflows/build_and_test_compiler_zoo.yml +++ b/.github/workflows/build_and_test_compiler_zoo.yml @@ -14,7 +14,7 @@ jobs: image: dbwy/chemistry strategy: matrix: - compiler: [ {suite: gnu, version: 12}, {suite: llvm, version: 14} ] + compiler: [ {suite: gnu, version: 12} ] mpi_flag: [ON, OFF] openmp_flag: [ON, OFF] exclude: diff --git a/.gitignore b/.gitignore index d92d4623..65b35b0d 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ src/xc_integrator/local_work_driver/host/obara_saika/test/*.x src/xc_integrator/local_work_driver/host/obara_saika/generator/integral* src/xc_integrator/local_work_driver/host/obara_saika/generator/obara* src/xc_integrator/local_work_driver/host/obara_saika/generator/*.x +*.swp diff --git a/CMakeLists.txt b/CMakeLists.txt index 57cd29f8..94efc973 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required( VERSION 3.20 FATAL_ERROR ) include(FetchContent) set( FETCHCONTENT_UPDATES_DISCONNECTED ON CACHE BOOL "Disable FC Updates" ) -project( GauXC VERSION 0.0.1 LANGUAGES C CXX ) +project( GauXC VERSION 1.0.0 LANGUAGES C CXX ) # Place local modules in the path list( PREPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake ) diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 00000000..686e5e7a --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,10 @@ +# Microsoft Open Source Code of Conduct + +This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). + +Resources: + +- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) +- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) +- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns +- Employees can reach out at [aka.ms/opensource/moderation-support](https://aka.ms/opensource/moderation-support) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..ebf23aca --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,14 @@ +# Contributing + +This project welcomes contributions and suggestions. Most contributions require you to +agree to a Contributor License Agreement (CLA) declaring that you have the right to, +and actually do, grant us the rights to use your contribution. For details, visit +https://cla.microsoft.com. + +When you submit a pull request, a CLA-bot will automatically determine whether you need +to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the +instructions provided by the bot. You will only need to do this once across all repositories using our CLA. + +This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). +For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) +or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md new file mode 100644 index 00000000..689d4e67 --- /dev/null +++ b/CONTRIBUTORS.md @@ -0,0 +1,17 @@ +# This is the list of GauXC's significant contributors. +# +# This does not necessarily list everyone who has contributed code. +# To see the full list of contributors, see the revision history in +# source control. 
+ +Primary Developer and Maintainer: David Williams--Young - Microsoft (davidwillia at microsoft dot com) + +* Thom Popovici (LBNL) +* Teri Lambros (UW) +* Mikael Kovtun (UW) +* Daniel Mejia-Rodriguez (PNNL) + +* Yingrong Chen (Microsoft) +* Jiashu Liang (Microsoft) +* David Clark (NVIDIA) +* Damon McDougall (AMD) diff --git a/LICENSE.txt b/LICENSE.txt index c4c69413..f2904dad 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,6 +1,10 @@ -GauXC Copyright (c) 2020, The Regents of the University of California, +GauXC Copyright (c) 2020-2024, The Regents of the University of California, through Lawrence Berkeley National Laboratory (subject to receipt of -any required approvals from the U.S. Dept. of Energy). All rights reserved. +any required approvals from the U.S. Dept. of Energy). + +(c) 2024-2025, Microsoft Corporation + +All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/NOTICE.md b/NOTICE.md new file mode 100644 index 00000000..4fcbf5de --- /dev/null +++ b/NOTICE.md @@ -0,0 +1,38 @@ +# NOTICES + +This repository incorporates material as listed below or described in the code. + +------------------------------------------------------------------------------- +gau2grid. + +BSD 3-Clause License + +Copyright (c) 2017, Daniel Smith +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +------------------------------------------------------------------------------- + diff --git a/README.md b/README.md index 35fa5f97..082ac6cd 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,15 @@ # About -GauXC Copyright (c) 2020-2024, The Regents of the University of California, +GauXC + +Copyright (c) 2020-2024, The Regents of the University of California, through Lawrence Berkeley National Laboratory (subject to receipt of -any required approvals from the U.S. Dept. of Energy). All rights reserved. +any required approvals from the U.S. Dept. of Energy). + +(c) 2024-2025, Microsoft Corporation + +All rights reserved. 
-If you have questions about your rights to use or distribute this software, -please contact Berkeley Lab's Intellectual Property Office at -IPO@lbl.gov. NOTICE. This Software was developed under funding from the U.S. Department of Energy and the U.S. Government consequently retains certain rights. As @@ -29,12 +32,7 @@ frameworks to target NVIDIA and AMD GPUs, respectively. Evaluation of the XC functional CPU/accelerator architectures is provided by the [ExchCXX](https://github.com/wavefunction91/ExchCXX) library. Quadratures are generated -by the [IntegratorXX](https://github.com/wavefunction91/IntegratorXX). - -GauXC is a work in progress. Its development has been funded by the U.S. -Department of Energy Exascale Computing Project -([NWChemEx](https://github.com/NWChemEx-Project)). - +by the [IntegratorXX](https://github.com/wavefunction91/IntegratorXX) library. # Design Goals @@ -62,17 +60,7 @@ for flexible and agile development in the field of KS-DFT. # Major Contributors -Primary Developer and Maintainer: David Williams--Young - LBNL (dbwy at lbl dot gov) - -GauXC has received major contributions from the following developers (in no particular order): -* Thom Popovici (LBNL) - Optimized sn-K kernels for CPU and GPU architectures -* Teri Lambros (UW) - Unrestricted (UKS) and Generalized (GKS) DFT -* Daniel Mejia-Rodriguez (PNNL) - Meta-GGA DFT - -We have also receieved significant support from industry collaborators: -* David Clark (NVIDIA) - Optimization of critical kernels for NVIDIA architectures -* Damon McDougall (AMD) - Optimization of critical kernels for AMDGPU architectures - +See CONTRIBUTORS.md for a list of major contributors to GauXC. # Publications @@ -229,7 +217,7 @@ target_link_libraries( my_target PUBLIC gauxc::gauxc ) # Example Usage -Coming Soon.... See `test/standalone_driver.cxx` for an example end-to-end invocation of GauXC for various integrands. +See `test/standalone_driver.cxx` for an example end-to-end invocation of GauXC for various integrands. # License @@ -239,6 +227,15 @@ LICENSE.txt for details. # Acknowledgments -The development of GauXC is supported by the Exascale Computing Project +The development of GauXC was previously supported by the Exascale Computing Project (17-SC-20-SC), a collaborative effort of the U.S. Department of Energy Office of Science and the National Nuclear Security Administration. + +## Trademarks + +This project may contain trademarks or logos for projects, products, or +services. Authorized use of Microsoft trademarks or logos is subject to and +must follow Microsoft’s Trademark & Brand Guidelines. Use of Microsoft +trademarks or logos in modified versions of this project must not cause +confusion or imply Microsoft sponsorship. Any use of third-party trademarks or +logos is subject to those third parties’ policies. diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 00000000..656f7918 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,14 @@ + + +## Security + +Microsoft takes the security of our software products and services seriously, which +includes all source code repositories in our GitHub organizations. + +**Please do not report security vulnerabilities through public GitHub issues.** + +For security reporting information, locations, contact information, and policies, +please review the latest guidance for Microsoft repositories at +[https://aka.ms/SECURITY.md](https://aka.ms/SECURITY.md). 
+ + diff --git a/cmake/gauxc-dep-versions.cmake b/cmake/gauxc-dep-versions.cmake index cd3969d8..8ab0aa11 100644 --- a/cmake/gauxc-dep-versions.cmake +++ b/cmake/gauxc-dep-versions.cmake @@ -11,13 +11,13 @@ set( GAUXC_CUTLASS_REPOSITORY https://github.com/NVIDIA/cutlass.git ) set( GAUXC_CUTLASS_REVISION v2.10.0 ) set( GAUXC_EXCHCXX_REPOSITORY https://github.com/wavefunction91/ExchCXX.git ) -set( GAUXC_EXCHCXX_REVISION 21a4700a826ec0beae1311a1d59677393bcb168f ) +set( GAUXC_EXCHCXX_REVISION v1.0.0 ) set( GAUXC_GAU2GRID_REPOSITORY https://github.com/dgasmith/gau2grid.git ) set( GAUXC_GAU2GRID_REVISION v2.0.6 ) set( GAUXC_INTEGRATORXX_REPOSITORY https://github.com/wavefunction91/IntegratorXX.git ) -set( GAUXC_INTEGRATORXX_REVISION ea07dedd37e7bd49ea06394eb811599002b34b49 ) +set( GAUXC_INTEGRATORXX_REVISION cf2917c64916583cef1081011beab3085b66e352 ) set( GAUXC_HIGHFIVE_REPOSITORY https://github.com/BlueBrain/HighFive.git ) set( GAUXC_HIGHFIVE_REVISION 805f0e13d09b47c4b01d40682621904aa3b31bb8 ) diff --git a/include/gauxc/atom.hpp b/include/gauxc/atom.hpp index 3f9771c0..72b0673b 100644 --- a/include/gauxc/atom.hpp +++ b/include/gauxc/atom.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/basisset.hpp b/include/gauxc/basisset.hpp index 9cef7ee7..c0c0f839 100644 --- a/include/gauxc/basisset.hpp +++ b/include/gauxc/basisset.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/basisset_map.hpp b/include/gauxc/basisset_map.hpp index ad9acb94..53f6d9d8 100644 --- a/include/gauxc/basisset_map.hpp +++ b/include/gauxc/basisset_map.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/enums.hpp b/include/gauxc/enums.hpp index ce7c19e1..76d4500c 100644 --- a/include/gauxc/enums.hpp +++ b/include/gauxc/enums.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -15,9 +19,10 @@ namespace GauXC { * Generally mapped to equivalent enums in IntegratorXX */ enum class RadialQuad { + Becke, ///< Becke radial quadrature MuraKnowles, ///< Mura-Knowles radial quadrature MurrayHandyLaming, ///< Murray-Handy-Laming radial quadrature - TreutlerAldrichs ///< Treutler-Aldrichs radial quadrature + TreutlerAhlrichs ///< Treutler-Ahlrichs radial quadrature }; /** * @@ -29,8 +34,8 @@ enum class AtomicGridSizeDefault { FineGrid, ///< Fine grid (least accurate) UltraFineGrid, ///< Ultrafine grid (appropriate accuracy) SuperFineGrid, ///< Superfine grid (most accurate) - GM3, ///< Treutler-Aldrichs GM3 - GM5 ///< Treutlet-Aldrichs GM5 + GM3, ///< Treutler-Ahlrichs GM3 + GM5 ///< Treutler-Ahlrichs GM5 }; /** * @@ -38,6 +43,7 @@ enum class XCWeightAlg { + NOTPARTITIONED, ///< Not partitioned Becke, ///< The original Becke weighting scheme SSF, ///< The Stratmann-Scuseria-Frisch weighting scheme LKO ///< The Lauqua-Kuessman-Ochsenfeld weighting scheme diff --git a/include/gauxc/exceptions.hpp b/include/gauxc/exceptions.hpp index ac16bfbd..84b9b489 100644 --- a/include/gauxc/exceptions.hpp +++ b/include/gauxc/exceptions.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/external/cereal.hpp b/include/gauxc/external/cereal.hpp index c4cd1a90..ba0b6ef9 100644 --- a/include/gauxc/external/cereal.hpp +++ b/include/gauxc/external/cereal.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/external/hdf5.hpp b/include/gauxc/external/hdf5.hpp index 8d7ad01e..434d0893 100644 --- a/include/gauxc/external/hdf5.hpp +++ b/include/gauxc/external/hdf5.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/gauxc_config.hpp.in b/include/gauxc/gauxc_config.hpp.in index a7f0ce69..86fe7485 100644 --- a/include/gauxc/gauxc_config.hpp.in +++ b/include/gauxc/gauxc_config.hpp.in @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/include/gauxc/grid.hpp b/include/gauxc/grid.hpp index 8a45e2f3..af7f8f2a 100644 --- a/include/gauxc/grid.hpp +++ b/include/gauxc/grid.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/grid_factory.hpp b/include/gauxc/grid_factory.hpp index 70ecbb88..ecf65526 100644 --- a/include/gauxc/grid_factory.hpp +++ b/include/gauxc/grid_factory.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -52,7 +56,7 @@ PrunedAtomicGridSpecification robust_psi4_pruning_scheme( UnprunedAtomicGridSpecification ); -/// Generate a Pruning specification according to the Treutler-Aldrichs scheme from an unpruned specification +/// Generate a Pruning specification according to the Treutler-Ahlrichs scheme from an unpruned specification PrunedAtomicGridSpecification treutler_pruning_scheme( UnprunedAtomicGridSpecification ); @@ -61,7 +65,7 @@ PrunedAtomicGridSpecification treutler_pruning_scheme( enum class PruningScheme { Unpruned, /// Unpruned atomic quadrature Robust, /// The "Robust" scheme of Psi4 - Treutler /// The Treutler-Aldrichs scheme + Treutler /// The Treutler-Ahlrichs scheme }; /// Generate a pruning specification from a specificed pruning scheme and diff --git a/include/gauxc/load_balancer.hpp b/include/gauxc/load_balancer.hpp index 738464f2..d420656a 100644 --- a/include/gauxc/load_balancer.hpp +++ b/include/gauxc/load_balancer.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -15,6 +19,7 @@ #include #include #include +#include namespace GauXC { @@ -27,6 +32,8 @@ namespace detail { struct LoadBalancerState { bool modified_weights_are_stored = false; ///< Whether the load balancer currently stores partitioned weights + XCWeightAlg weight_alg = XCWeightAlg::NOTPARTITIONED; + ///< Weight partitioning scheme used by this LoadBalancer }; @@ -77,6 +84,9 @@ class LoadBalancer { /// Return internal timing tracker const util::Timer& get_timings() const; + /// Return the total number of points for local tasks + size_t total_npts() const; + /// Return the maximum number of points for local tasks size_t max_npts() const; diff --git a/include/gauxc/molecular_weights.hpp b/include/gauxc/molecular_weights.hpp index ed2e7d3f..74f1e922 100644 --- a/include/gauxc/molecular_weights.hpp +++ b/include/gauxc/molecular_weights.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/molecule.hpp b/include/gauxc/molecule.hpp index ce9aaa6a..9f4fe6a7 100644 --- a/include/gauxc/molecule.hpp +++ b/include/gauxc/molecule.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/molgrid.hpp b/include/gauxc/molgrid.hpp index 40bfd48d..d58dc494 100644 --- a/include/gauxc/molgrid.hpp +++ b/include/gauxc/molgrid.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/molgrid/defaults.hpp b/include/gauxc/molgrid/defaults.hpp index f3c3bf5e..0565647d 100644 --- a/include/gauxc/molgrid/defaults.hpp +++ b/include/gauxc/molgrid/defaults.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -14,6 +18,7 @@ namespace GauXC { double slater_radius_64(AtomicNumber); double slater_radius_30(AtomicNumber); double clementi_radius_67(AtomicNumber); + double uff_radius_103(AtomicNumber); double default_atomic_radius(AtomicNumber); RadialScale default_mk_radial_scaling_factor( AtomicNumber ); diff --git a/include/gauxc/molmeta.hpp b/include/gauxc/molmeta.hpp index cd6ee8d6..12918c6e 100644 --- a/include/gauxc/molmeta.hpp +++ b/include/gauxc/molmeta.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/named_type.hpp b/include/gauxc/named_type.hpp index 7034df8e..cf7a776c 100644 --- a/include/gauxc/named_type.hpp +++ b/include/gauxc/named_type.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/reduction_driver.hpp b/include/gauxc/reduction_driver.hpp index 1d9eaa96..f3bef188 100644 --- a/include/gauxc/reduction_driver.hpp +++ b/include/gauxc/reduction_driver.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/runtime_environment.hpp b/include/gauxc/runtime_environment.hpp index 84edd290..4b0b08f5 100644 --- a/include/gauxc/runtime_environment.hpp +++ b/include/gauxc/runtime_environment.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/runtime_environment/decl.hpp b/include/gauxc/runtime_environment/decl.hpp index 5cc63fb0..424f9d98 100644 --- a/include/gauxc/runtime_environment/decl.hpp +++ b/include/gauxc/runtime_environment/decl.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -77,6 +81,8 @@ class DeviceRuntimeEnvironment : public RuntimeEnvironment { bool owns_memory() const; DeviceBackend* device_backend() const; + void release_buffer(); + void set_buffer(void* m, size_t sz); }; #endif diff --git a/include/gauxc/runtime_environment/fwd.hpp b/include/gauxc/runtime_environment/fwd.hpp index 58910933..23f726e9 100644 --- a/include/gauxc/runtime_environment/fwd.hpp +++ b/include/gauxc/runtime_environment/fwd.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/shell.hpp b/include/gauxc/shell.hpp index f75a4949..7f27170c 100644 --- a/include/gauxc/shell.hpp +++ b/include/gauxc/shell.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -225,7 +229,6 @@ class alignas(256) Shell { }; -#if 0 template inline std::ostream& operator<<( std::ostream& os, const Shell& sh ) { os << "GauXC::Shell:( O={" @@ -234,9 +237,6 @@ inline std::ostream& operator<<( std::ostream& os, const Shell& sh ) { os << " "; os << " {l=" << sh.l() << ",sph=" << sh.pure() << "}"; os << std::endl; - os << " {cr=" << sh.cutoff_radius() << ",cv=" << sh.cutoff_val() - <<",mr=" << sh.max_radius() << ",mv=" << sh.max_val() << "}"; - os << std::endl; for(auto i=0ul; i& sh ) { return os; } -#endif } diff --git a/include/gauxc/shell_pair.hpp b/include/gauxc/shell_pair.hpp index e3288198..643ab8e3 100644 --- a/include/gauxc/shell_pair.hpp +++ b/include/gauxc/shell_pair.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/types.hpp b/include/gauxc/types.hpp index 2c1b7c8d..aad5d9ba 100644 --- a/include/gauxc/types.hpp +++ b/include/gauxc/types.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/include/gauxc/util/constexpr_math.hpp b/include/gauxc/util/constexpr_math.hpp index bfe3be4d..3d8e9d87 100644 --- a/include/gauxc/util/constexpr_math.hpp +++ b/include/gauxc/util/constexpr_math.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/util/contiguous_container_data.hpp b/include/gauxc/util/contiguous_container_data.hpp index d29a6131..f5d35fd1 100644 --- a/include/gauxc/util/contiguous_container_data.hpp +++ b/include/gauxc/util/contiguous_container_data.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/util/div_ceil.hpp b/include/gauxc/util/div_ceil.hpp index 772d6c74..8a39aa67 100644 --- a/include/gauxc/util/div_ceil.hpp +++ b/include/gauxc/util/div_ceil.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/util/environment.hpp b/include/gauxc/util/environment.hpp index 953c5334..2a0a98d5 100644 --- a/include/gauxc/util/environment.hpp +++ b/include/gauxc/util/environment.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/util/gau_rad_eval.hpp b/include/gauxc/util/gau_rad_eval.hpp index 47f896bf..b2aa7f91 100644 --- a/include/gauxc/util/gau_rad_eval.hpp +++ b/include/gauxc/util/gau_rad_eval.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/include/gauxc/util/geometry.hpp b/include/gauxc/util/geometry.hpp index 62992dca..97a8da2a 100644 --- a/include/gauxc/util/geometry.hpp +++ b/include/gauxc/util/geometry.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/util/misc.hpp b/include/gauxc/util/misc.hpp index 671f34c6..cf2ef8f0 100644 --- a/include/gauxc/util/misc.hpp +++ b/include/gauxc/util/misc.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/util/real_solid_harmonics.hpp b/include/gauxc/util/real_solid_harmonics.hpp index 9501e557..3394da02 100644 --- a/include/gauxc/util/real_solid_harmonics.hpp +++ b/include/gauxc/util/real_solid_harmonics.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/util/timer.hpp b/include/gauxc/util/timer.hpp index a70d9298..545fa35b 100644 --- a/include/gauxc/util/timer.hpp +++ b/include/gauxc/util/timer.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/util/unused.hpp b/include/gauxc/util/unused.hpp index cd993122..e6dd054a 100644 --- a/include/gauxc/util/unused.hpp +++ b/include/gauxc/util/unused.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/xc_integrator.hpp b/include/gauxc/xc_integrator.hpp index e08da39c..03feaf93 100644 --- a/include/gauxc/xc_integrator.hpp +++ b/include/gauxc/xc_integrator.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. 
Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -36,6 +40,10 @@ class XCIntegrator { using exc_vxc_type_gks = std::tuple< value_type, matrix_type, matrix_type, matrix_type, matrix_type >; using exc_grad_type = std::vector< value_type >; using exx_type = matrix_type; + using fxc_contraction_type_rks = matrix_type; + using fxc_contraction_type_uks = std::tuple< matrix_type, matrix_type >; + using dd_psi_type = std::vector< value_type >; + using dd_psi_potential_type = matrix_type; private: @@ -66,11 +74,19 @@ class XCIntegrator { exc_vxc_type_gks eval_exc_vxc ( const MatrixType&, const MatrixType&, const MatrixType&, const MatrixType&, const IntegratorSettingsXC& = IntegratorSettingsXC{}); - exc_grad_type eval_exc_grad( const MatrixType& ); + exc_grad_type eval_exc_grad( const MatrixType&, const IntegratorSettingsXC& = IntegratorSettingsXC{} ); + exc_grad_type eval_exc_grad( const MatrixType&, const MatrixType&, const IntegratorSettingsXC& = IntegratorSettingsXC{} ); exx_type eval_exx ( const MatrixType&, const IntegratorSettingsEXX& = IntegratorSettingsEXX{} ); + fxc_contraction_type_rks eval_fxc_contraction ( const MatrixType&, const MatrixType&, + const IntegratorSettingsXC& = IntegratorSettingsXC{} ); + fxc_contraction_type_uks eval_fxc_contraction ( const MatrixType&, const MatrixType&, const MatrixType&, const MatrixType&, + const IntegratorSettingsXC& = IntegratorSettingsXC{} ); + + dd_psi_type eval_dd_psi( const MatrixType&, unsigned ); + dd_psi_potential_type eval_dd_psi_potential( const MatrixType&, unsigned ); const util::Timer& get_timings() const; const LoadBalancer& load_balancer() const; diff --git a/include/gauxc/xc_integrator/impl.hpp b/include/gauxc/xc_integrator/impl.hpp index 85a655cc..400afb7c 100644 --- a/include/gauxc/xc_integrator/impl.hpp +++ b/include/gauxc/xc_integrator/impl.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -76,9 +80,16 @@ typename XCIntegrator::exc_vxc_type_gks template typename XCIntegrator::exc_grad_type - XCIntegrator::eval_exc_grad( const MatrixType& P ) { + XCIntegrator::eval_exc_grad( const MatrixType& P, const IntegratorSettingsXC& ks_settings ) { + if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); + return pimpl_->eval_exc_grad(P, ks_settings); +}; + +template +typename XCIntegrator::exc_grad_type + XCIntegrator::eval_exc_grad( const MatrixType& Ps, const MatrixType& Pz, const IntegratorSettingsXC& ks_settings ) { if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); - return pimpl_->eval_exc_grad(P); + return pimpl_->eval_exc_grad(Ps, Pz, ks_settings); }; template @@ -89,6 +100,37 @@ typename XCIntegrator::exx_type return pimpl_->eval_exx(P,settings); }; +template +typename XCIntegrator::fxc_contraction_type_rks + XCIntegrator::eval_fxc_contraction( const MatrixType& P, const MatrixType& tP, + const IntegratorSettingsXC& ks_settings ) { + if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); + return pimpl_->eval_fxc_contraction(P, tP, ks_settings); +}; + +template +typename XCIntegrator::fxc_contraction_type_uks + XCIntegrator::eval_fxc_contraction( const MatrixType& Ps, const MatrixType& Pz, + const MatrixType& tPs, const MatrixType& tPz, const IntegratorSettingsXC& ks_settings ) { + if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); + return pimpl_->eval_fxc_contraction(Ps, Pz, tPs, tPz, ks_settings); +}; + +template +typename XCIntegrator::dd_psi_type + XCIntegrator::eval_dd_psi(const MatrixType& P, unsigned max_Ylm) { + if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); + return pimpl_->eval_dd_psi(P, max_Ylm); +} + +template +typename XCIntegrator::dd_psi_potential_type + XCIntegrator::eval_dd_psi_potential(const MatrixType& X, unsigned max_Ylm) { + if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); + return pimpl_->eval_dd_psi_potential(X, max_Ylm); +} + + template const util::Timer& XCIntegrator::get_timings() const { if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); diff --git a/include/gauxc/xc_integrator/integrator_factory.hpp b/include/gauxc/xc_integrator/integrator_factory.hpp index d63d23be..54a1c4a3 100644 --- a/include/gauxc/xc_integrator/integrator_factory.hpp +++ b/include/gauxc/xc_integrator/integrator_factory.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/xc_integrator/local_work_driver.hpp b/include/gauxc/xc_integrator/local_work_driver.hpp index bb37b319..50eb3d32 100644 --- a/include/gauxc/xc_integrator/local_work_driver.hpp +++ b/include/gauxc/xc_integrator/local_work_driver.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/include/gauxc/xc_integrator/replicated/impl.hpp b/include/gauxc/xc_integrator/replicated/impl.hpp index a892f5e3..bfc95fc8 100644 --- a/include/gauxc/xc_integrator/replicated/impl.hpp +++ b/include/gauxc/xc_integrator/replicated/impl.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -159,13 +163,27 @@ typename ReplicatedXCIntegrator::exc_vxc_type_gks template typename ReplicatedXCIntegrator::exc_grad_type - ReplicatedXCIntegrator::eval_exc_grad_( const MatrixType& P ) { + ReplicatedXCIntegrator::eval_exc_grad_( const MatrixType& P, const IntegratorSettingsXC& ks_settings ) { if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); std::vector EXC_GRAD( 3*pimpl_->load_balancer().molecule().natoms() ); pimpl_->eval_exc_grad( P.rows(), P.cols(), P.data(), P.rows(), - EXC_GRAD.data() ); + EXC_GRAD.data(), ks_settings ); + + return EXC_GRAD; + +} + +template +typename ReplicatedXCIntegrator::exc_grad_type + ReplicatedXCIntegrator::eval_exc_grad_( const MatrixType& Ps, const MatrixType& Pz, const IntegratorSettingsXC& ks_settings ) { + + if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); + + std::vector EXC_GRAD( 3*pimpl_->load_balancer().molecule().natoms() ); + pimpl_->eval_exc_grad( Ps.rows(), Ps.cols(), Ps.data(), Ps.rows(), Pz.data(), Pz.rows(), + EXC_GRAD.data(), ks_settings ); return EXC_GRAD; @@ -184,6 +202,67 @@ typename ReplicatedXCIntegrator::exx_type return K; +} +template +typename ReplicatedXCIntegrator::fxc_contraction_type_rks + ReplicatedXCIntegrator::eval_fxc_contraction_( const MatrixType& P, + const MatrixType& tP, const IntegratorSettingsXC& ks_settings ) { + + if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); + matrix_type FXC( P.rows(), P.cols() ); + + pimpl_->eval_fxc_contraction( P.rows(), P.cols(), P.data(), P.rows(), + tP.data(), tP.rows(), + FXC.data(), FXC.rows(), ks_settings ); + + return FXC; +} + +template +typename ReplicatedXCIntegrator::fxc_contraction_type_uks + ReplicatedXCIntegrator::eval_fxc_contraction_( const MatrixType& Ps, const MatrixType& Pz, + const MatrixType& tPs, const MatrixType& tPz, const IntegratorSettingsXC& ks_settings ) { + + if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); + matrix_type FXCs( Ps.rows(), Ps.cols() ); + matrix_type FXCz( Pz.rows(), Pz.cols() ); + + pimpl_->eval_fxc_contraction( Ps.rows(), Ps.cols(), Ps.data(), Ps.rows(), + Pz.data(), Pz.rows(), + tPs.data(), tPs.rows(), + tPz.data(), tPz.rows(), + FXCs.data(), FXCs.rows(), + FXCz.data(), FXCz.rows(), ks_settings ); + + return std::make_tuple( FXCs, FXCz ); + +} + +template +typename ReplicatedXCIntegrator::dd_psi_type + ReplicatedXCIntegrator::eval_dd_psi_( const MatrixType& P, unsigned max_Ylm ) { + + if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); + + const size_t natoms = pimpl_->load_balancer().molecule().natoms(); + const size_t Ylm_sz = (max_Ylm + 1) * ( max_Ylm + 1); + std::vector ddPsi(natoms * Ylm_sz, 0.0); + pimpl_->eval_dd_psi(P.rows(), P.cols(), P.data(), P.rows(), max_Ylm, ddPsi.data(), Ylm_sz); + return ddPsi; +} + +template +typename ReplicatedXCIntegrator::dd_psi_potential_type + ReplicatedXCIntegrator::eval_dd_psi_potential_( const MatrixType& X, unsigned 
max_Ylm ) { + + if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); + + const size_t nbf = pimpl_->load_balancer().basis().nbf(); + matrix_type Vddx(nbf, nbf); + Vddx.setZero(); + pimpl_->eval_dd_psi_potential(X.rows(), X.cols(), X.data(), max_Ylm, Vddx.data()); + return Vddx; + } } diff --git a/include/gauxc/xc_integrator/replicated/replicated_xc_device_integrator.hpp b/include/gauxc/xc_integrator/replicated/replicated_xc_device_integrator.hpp index 4721243c..9454e60e 100644 --- a/include/gauxc/xc_integrator/replicated/replicated_xc_device_integrator.hpp +++ b/include/gauxc/xc_integrator/replicated/replicated_xc_device_integrator.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/xc_integrator/replicated/replicated_xc_host_integrator.hpp b/include/gauxc/xc_integrator/replicated/replicated_xc_host_integrator.hpp index 57685599..4f3476f1 100644 --- a/include/gauxc/xc_integrator/replicated/replicated_xc_host_integrator.hpp +++ b/include/gauxc/xc_integrator/replicated/replicated_xc_host_integrator.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/xc_integrator/replicated/replicated_xc_integrator_factory.hpp b/include/gauxc/xc_integrator/replicated/replicated_xc_integrator_factory.hpp index d75a92d8..dc881b1f 100644 --- a/include/gauxc/xc_integrator/replicated/replicated_xc_integrator_factory.hpp +++ b/include/gauxc/xc_integrator/replicated/replicated_xc_integrator_factory.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/xc_integrator/replicated/replicated_xc_integrator_impl.hpp b/include/gauxc/xc_integrator/replicated/replicated_xc_integrator_impl.hpp index 70c33db5..45731512 100644 --- a/include/gauxc/xc_integrator/replicated/replicated_xc_integrator_impl.hpp +++ b/include/gauxc/xc_integrator/replicated/replicated_xc_integrator_impl.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -74,11 +78,30 @@ class ReplicatedXCIntegratorImpl { value_type* VXCx, int64_t ldvxcx, value_type* EXC, const IntegratorSettingsXC& ks_settings ) = 0; - virtual void eval_exc_grad_( int64_t m, int64_t n, const value_type* P, - int64_t ldp, value_type* EXC_GRAD ) = 0; + virtual void eval_exc_grad_( int64_t m, int64_t n, const value_type* P, int64_t ldp, + value_type* EXC_GRAD, const IntegratorSettingsXC& ks_settings ) = 0; + virtual void eval_exc_grad_( int64_t m, int64_t n, const value_type* P, int64_t ldps, + const value_type* Pz, int64_t lpdz, value_type* EXC_GRAD, const IntegratorSettingsXC& ks_settings ) = 0; virtual void eval_exx_( int64_t m, int64_t n, const value_type* P, int64_t ldp, value_type* K, int64_t ldk, const IntegratorSettingsEXX& settings ) = 0; + virtual void eval_fxc_contraction_( int64_t m, int64_t n, + const value_type* P, int64_t ldp, + const value_type* tP, int64_t ldtp, + value_type* FXC, int64_t ldfxc, + const IntegratorSettingsXC& ks_settings )=0; + virtual void eval_fxc_contraction_( int64_t m, int64_t n, + const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, + const value_type* tPs, int64_t ldtps, + const value_type* tPz, int64_t ldtpz, + value_type* FXCs, int64_t ldfxcs, + value_type* FXCz, int64_t ldfxcz, + const IntegratorSettingsXC& ks_settings )=0; + virtual void eval_dd_psi_( int64_t m, int64_t n, const value_type* P, int64_t ldp, unsigned max_Ylm, + value_type* ddPsi, int64_t ldPsi ) = 0; + virtual void eval_dd_psi_potential_( int64_t m, int64_t n, const value_type* X, unsigned max_Ylm, + value_type* Vddx) = 0; public: @@ -130,13 +153,36 @@ class ReplicatedXCIntegratorImpl { value_type* EXC, const IntegratorSettingsXC& ks_settings ); - void eval_exc_grad( int64_t m, int64_t n, const value_type* P, - int64_t ldp, value_type* EXC_GRAD ); + void eval_exc_grad( int64_t m, int64_t n, const value_type* P, int64_t ldp, + value_type* EXC_GRAD, const IntegratorSettingsXC& ks_settings ); + void eval_exc_grad( int64_t m, int64_t n, const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, value_type* EXC_GRAD, const IntegratorSettingsXC& ks_settings ); void eval_exx( int64_t m, int64_t n, const value_type* P, int64_t ldp, value_type* K, int64_t ldk, const IntegratorSettingsEXX& settings ); + void eval_fxc_contraction( int64_t m, int64_t n, const value_type* P, + int64_t ldp, + const value_type* tP, int64_t ldtp, + value_type* FXC, int64_t ldfxc, + const IntegratorSettingsXC& ks_settings ); + + void eval_fxc_contraction( int64_t m, int64_t n, const value_type* Ps, + int64_t ldps, + const value_type* Pz, int64_t ldpz, + const value_type* tPs, int64_t ldtps, + const value_type* tPz, int64_t ldtpz, + value_type* FXCs, int64_t ldfxcs, + value_type* FXCz, int64_t ldfxcz, + const IntegratorSettingsXC& ks_settings ); + + void eval_dd_psi( int64_t m, int64_t n, const value_type* P, + int64_t ldp, unsigned max_Ylm, + value_type* ddPsi, int64_t ldPsi ); + void eval_dd_psi_potential( int64_t m, int64_t n, const value_type* X, unsigned max_Ylm, + value_type* Vddx ); + inline const util::Timer& get_timings() const { return timer_; } inline std::unique_ptr< LocalWorkDriver > release_local_work_driver() { diff --git a/include/gauxc/xc_integrator/replicated_xc_integrator.hpp b/include/gauxc/xc_integrator/replicated_xc_integrator.hpp index ac93a4f0..1ca53f91 100644 --- a/include/gauxc/xc_integrator/replicated_xc_integrator.hpp +++ b/include/gauxc/xc_integrator/replicated_xc_integrator.hpp @@ -1,7 +1,11 @@ /** 
* GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -33,6 +37,10 @@ class ReplicatedXCIntegrator : public XCIntegratorImpl { using exc_vxc_type_gks = typename XCIntegratorImpl::exc_vxc_type_gks; using exc_grad_type = typename XCIntegratorImpl::exc_grad_type; using exx_type = typename XCIntegratorImpl::exx_type; + using fxc_contraction_type_rks = typename XCIntegratorImpl::fxc_contraction_type_rks; + using fxc_contraction_type_uks = typename XCIntegratorImpl::fxc_contraction_type_uks; + using dd_psi_type = typename XCIntegratorImpl::dd_psi_type; + using dd_psi_potential_type = typename XCIntegratorImpl::dd_psi_potential_type; private: @@ -46,8 +54,13 @@ class ReplicatedXCIntegrator : public XCIntegratorImpl { exc_vxc_type_rks eval_exc_vxc_ ( const MatrixType&, const IntegratorSettingsXC& ) override; exc_vxc_type_uks eval_exc_vxc_ ( const MatrixType&, const MatrixType&, const IntegratorSettingsXC&) override; exc_vxc_type_gks eval_exc_vxc_ ( const MatrixType&, const MatrixType&, const MatrixType&, const MatrixType&, const IntegratorSettingsXC& ) override; - exc_grad_type eval_exc_grad_( const MatrixType& ) override; + exc_grad_type eval_exc_grad_( const MatrixType&, const IntegratorSettingsXC& ) override; + exc_grad_type eval_exc_grad_( const MatrixType&, const MatrixType&, const IntegratorSettingsXC& ) override; exx_type eval_exx_ ( const MatrixType&, const IntegratorSettingsEXX& ) override; + fxc_contraction_type_rks eval_fxc_contraction_ ( const MatrixType&, const MatrixType&, const IntegratorSettingsXC& ) override; + fxc_contraction_type_uks eval_fxc_contraction_ ( const MatrixType&, const MatrixType&, const MatrixType&, const MatrixType&, const IntegratorSettingsXC&) override; + dd_psi_type eval_dd_psi_( const MatrixType& , unsigned ) override; + dd_psi_potential_type eval_dd_psi_potential_( const MatrixType& , unsigned ) override; const util::Timer& get_timings_() const override; const LoadBalancer& get_load_balancer_() const override; LoadBalancer& get_load_balancer_() override; diff --git a/include/gauxc/xc_integrator/xc_integrator_impl.hpp b/include/gauxc/xc_integrator/xc_integrator_impl.hpp index 1406bf8e..ba7bebeb 100644 --- a/include/gauxc/xc_integrator/xc_integrator_impl.hpp +++ b/include/gauxc/xc_integrator/xc_integrator_impl.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -25,6 +29,10 @@ class XCIntegratorImpl { using exc_vxc_type_gks = typename XCIntegrator::exc_vxc_type_gks; using exc_grad_type = typename XCIntegrator::exc_grad_type; using exx_type = typename XCIntegrator::exx_type; + using fxc_contraction_type_rks = typename XCIntegrator::fxc_contraction_type_rks; + using fxc_contraction_type_uks = typename XCIntegrator::fxc_contraction_type_uks; + using dd_psi_type = typename XCIntegrator::dd_psi_type; + using dd_psi_potential_type = typename XCIntegrator::dd_psi_potential_type; protected: @@ -38,9 +46,18 @@ class XCIntegratorImpl { virtual exc_vxc_type_uks eval_exc_vxc_ ( const MatrixType& Ps, const MatrixType& Pz, const IntegratorSettingsXC& ks_settings ) = 0; virtual exc_vxc_type_gks eval_exc_vxc_ ( const MatrixType& Ps, const MatrixType& Pz, const MatrixType& Py, const MatrixType& Px, const IntegratorSettingsXC& ks_settings ) = 0; - virtual exc_grad_type eval_exc_grad_( const MatrixType& P ) = 0; + virtual exc_grad_type eval_exc_grad_( const MatrixType& P, const IntegratorSettingsXC& ks_settings ) = 0; + virtual exc_grad_type eval_exc_grad_( const MatrixType& Ps, const MatrixType& Pz, const IntegratorSettingsXC& ks_settings ) = 0; virtual exx_type eval_exx_ ( const MatrixType& P, const IntegratorSettingsEXX& settings ) = 0; + virtual fxc_contraction_type_rks eval_fxc_contraction_ ( const MatrixType& P, + const MatrixType& tP, const IntegratorSettingsXC& ks_settings ) = 0; + virtual fxc_contraction_type_uks eval_fxc_contraction_ ( const MatrixType& Ps, const MatrixType& Pz, + const MatrixType& tPs, const MatrixType& tPz, const IntegratorSettingsXC& ks_settings ) = 0; + + + virtual dd_psi_type eval_dd_psi_( const MatrixType& P, unsigned max_Ylm ) = 0; + virtual dd_psi_potential_type eval_dd_psi_potential_( const MatrixType& X, unsigned max_Ylm ) = 0; virtual const util::Timer& get_timings_() const = 0; virtual const LoadBalancer& get_load_balancer_() const = 0; virtual LoadBalancer& get_load_balancer_() = 0; @@ -108,14 +125,21 @@ class XCIntegratorImpl { } /** Integrate EXC gradient for RKS - * - * TODO: add API for UKS/GKS * * @param[in] P The alpha density matrix * @returns EXC gradient */ - exc_grad_type eval_exc_grad( const MatrixType& P ) { - return eval_exc_grad_(P); + exc_grad_type eval_exc_grad( const MatrixType& P, const IntegratorSettingsXC& ks_settings ) { + return eval_exc_grad_(P, ks_settings); + } + + /** Integrate EXC gradient for UKS + * + * @param[in] Ps,Pz The scalar (Pa + Pb) and Z (Pa - Pb) density matrices + * @returns EXC gradient + */ + exc_grad_type eval_exc_grad( const MatrixType& Ps, const MatrixType& Pz, const IntegratorSettingsXC& ks_settings ) { + return eval_exc_grad_(Ps, Pz, ks_settings); } /** Integrate Exact Exchange for RHF @@ -127,6 +151,50 @@ class XCIntegratorImpl { return eval_exx_(P,settings); } + + /** Integrate FXC contraction for RKS + * + * @param[in] P the alpha density matrix + * @param[in] tP the alpha trial density matrix (constructed from perturbed MO coefficients) + * @returns FXC contraction + */ + fxc_contraction_type_rks eval_fxc_contraction( const MatrixType& P, const MatrixType& tP, const IntegratorSettingsXC& ks_settings ) { + return eval_fxc_contraction_(P, tP, ks_settings); + } + + /** Integrate FXC contraction for UKS + * + * @param[in] Ps the scalar density matrix (Pa + Pb) + * @param[in] Pz the Z density matrix (Pa - Pb) + * @param[in] tPs the trial scalar density matrices (constructed from perturbed MO coefficients) + * @param[in] tPz the trial Z density matrices (constructed from 
perturbed MO coefficients) + * @returns FXC contraction + */ + fxc_contraction_type_uks eval_fxc_contraction( const MatrixType& Ps, const MatrixType& Pz, + const MatrixType& tPs, const MatrixType& tPz, const IntegratorSettingsXC& ks_settings ) { + return eval_fxc_contraction_(Ps, Pz, tPs, tPz, ks_settings); + } + + /** Evaluate Psi vector for ddX + * + * @param[in] P The density matrix + * @param[in] max_Ylm The max "l" degree for Ylm + * @returns The atomic contributions to the spherical-harmonic (SH) projection of the density onto the DD domains + */ + dd_psi_type eval_dd_psi( const MatrixType& P, unsigned max_Ylm ) { + return eval_dd_psi_(P,max_Ylm); + } + + /** Evaluate Psi Potential for ddX + * + * @param[in] X The local ASC coefficients, (nharmonics, atom) array in column-major ordering. + * @param[in] max_Ylm The max "l" degree for Ylm + * @returns Fock contributions + */ + dd_psi_potential_type eval_dd_psi_potential( const MatrixType& X, unsigned max_Ylm ) { + return eval_dd_psi_potential_(X,max_Ylm); + } + /** Get internal timers * * @returns Timer instance for internal timings diff --git a/include/gauxc/xc_integrator_settings.hpp b/include/gauxc/xc_integrator_settings.hpp index dc90cc61..1ec26d0e 100644 --- a/include/gauxc/xc_integrator_settings.hpp +++ b/include/gauxc/xc_integrator_settings.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -21,4 +25,8 @@ struct IntegratorSettingsKS : public IntegratorSettingsXC { double gks_dtol = 1e-12; }; +struct IntegratorSettingsEXC_GRAD : public IntegratorSettingsKS { + bool include_weight_derivatives = true; // whether to include grid-weight derivative contributions and exploit translational invariance, or use only the Hellmann-Feynman gradient +}; + } diff --git a/include/gauxc/xc_task.hpp b/include/gauxc/xc_task.hpp index 1f70418f..630d6dd6 100644 --- a/include/gauxc/xc_task.hpp +++ b/include/gauxc/xc_task.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7e51e9fa..27909d7f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/atomic_radii.cxx b/src/atomic_radii.cxx index 6e3a829f..52275309 100644 --- a/src/atomic_radii.cxx +++ b/src/atomic_radii.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. 
of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -318,4 +322,31 @@ double clementi_radius_67(AtomicNumber _Z) { } +// UFF atomic radii +// Atomic radii derived from the universal force field +// A. K. Rappe et al. J. Am. Chem. Soc., 1992, 114 (25), pp 10024-10035 +// https://doi.org/10.1021/ja00051a040, data given in Ångström, +// will be converted to Bohr. Entries are indexed by atomic number (Z = 1 is the first entry). +const std::vector<double> radius_uff_list = {1.443, 1.81, 1.2255, 1.3725, 2.0415, 1.9255, 1.83, 1.75, + 1.682, 1.6215, 1.4915, 1.5105, + 2.2495, 2.1475, 2.0735, 2.0175, 1.9735, 1.934, 1.906, 1.6995, 1.6475, + 1.5875, 1.572, 1.5115, 1.4805, 1.456, 1.436, 1.417, 1.7475, + 1.3815, 2.1915, 2.14, 2.115, 2.1025, 2.0945, 2.0705, 2.057, + 1.8205, 1.6725, 1.562, 1.5825, 1.526, 1.499, 1.4815, 1.4645, + 1.4495, 1.574, 1.424, 2.2315, 2.196, 2.21, 2.235, 2.25, 2.202, + 2.2585, 1.8515, 1.761, 1.778, 1.803, 1.7875, 1.7735, 1.76, 1.7465, + 1.684, 1.7255, 1.714, 1.7045, 1.6955, 1.687, 1.6775, 1.82, 1.5705, + 1.585, 1.5345, 1.477, 1.56, 1.42, 1.377, 1.6465, 1.3525, 2.1735, 2.1485, + 2.185, 2.3545, 2.375, 2.3825, 2.45, 1.8385, 1.739, 1.698, 1.712, 1.6975, + 1.712, 1.712, 1.6905, 1.663, 1.6695, 1.6565, 1.6495, 1.643, 1.637, 1.624, 1.618}; + +double uff_radius_103(AtomicNumber _Z) { + const double RADIUS_UFF_SCALING = 1.1; + const double DDX_BOHR_TO_ANGSTROM = 0.52917721092; + auto Z = _Z.get(); + if (Z < 1 || Z > radius_uff_list.size()) { + return -1.; + } + return radius_uff_list[Z-1] * RADIUS_UFF_SCALING / DDX_BOHR_TO_ANGSTROM; +} } diff --git a/src/exceptions/cublas_exception.hpp b/src/exceptions/cublas_exception.hpp index 84fc3c31..503fc900 100644 --- a/src/exceptions/cublas_exception.hpp +++ b/src/exceptions/cublas_exception.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/exceptions/cuda_exception.hpp b/src/exceptions/cuda_exception.hpp index 15ae3d3d..6d4767d1 100644 --- a/src/exceptions/cuda_exception.hpp +++ b/src/exceptions/cuda_exception.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/exceptions/cutlass_exception.hpp b/src/exceptions/cutlass_exception.hpp index 49a3192d..4de854be 100644 --- a/src/exceptions/cutlass_exception.hpp +++ b/src/exceptions/cutlass_exception.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
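The uff_radius_103 helper above scales the tabulated UFF radius by 1.1 and converts from Ångström to Bohr. Below is a self-contained arithmetic check of that conversion; the main function and printed value are illustrative only, using the carbon entry from the table.

    // Standalone check of the scaling/conversion applied by uff_radius_103 above.
    #include <cstdio>

    int main() {
      const double r_angstrom         = 1.9255;         // UFF radius of carbon (Z = 6) from the table
      const double RADIUS_UFF_SCALING = 1.1;            // same scaling factor as the patch
      const double BOHR_IN_ANGSTROM   = 0.52917721092;  // 1 Bohr expressed in Angstrom

      // r[Bohr] = r[Angstrom] * 1.1 / 0.52917721092
      const double r_bohr = r_angstrom * RADIUS_UFF_SCALING / BOHR_IN_ANGSTROM;
      std::printf( "C: %.4f Bohr\n", r_bohr );          // prints approximately 4.0025
      return 0;
    }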
* * See LICENSE.txt for details */ diff --git a/src/exceptions/hip_exception.hpp b/src/exceptions/hip_exception.hpp index 1fd22dbe..08a40302 100644 --- a/src/exceptions/hip_exception.hpp +++ b/src/exceptions/hip_exception.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/exceptions/hipblas_exception.hpp b/src/exceptions/hipblas_exception.hpp index 9bb39011..bb89a331 100644 --- a/src/exceptions/hipblas_exception.hpp +++ b/src/exceptions/hipblas_exception.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/exceptions/magma_exception.hpp b/src/exceptions/magma_exception.hpp index bb0e40ec..30056573 100644 --- a/src/exceptions/magma_exception.hpp +++ b/src/exceptions/magma_exception.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/external/CMakeLists.txt b/src/external/CMakeLists.txt index 3df13b30..fa1f7f37 100644 --- a/src/external/CMakeLists.txt +++ b/src/external/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/external/hdf5_read.cxx b/src/external/hdf5_read.cxx index cae9c865..c01424c4 100644 --- a/src/external/hdf5_read.cxx +++ b/src/external/hdf5_read.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/external/hdf5_util.hpp b/src/external/hdf5_util.hpp index 5c5cb696..9569734f 100644 --- a/src/external/hdf5_util.hpp +++ b/src/external/hdf5_util.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). 
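The IntegratorSettingsEXC_GRAD struct introduced in the xc_integrator_settings.hpp hunk earlier exposes a single switch for the gradient driver. A short sketch of toggling it; the free-function wrapper is illustrative only.

    // Request a Hellmann-Feynman-only EXC gradient by disabling weight derivatives.
    #include <gauxc/xc_integrator_settings.hpp>

    GauXC::IntegratorSettingsEXC_GRAD make_hellmann_feynman_settings() {
      GauXC::IntegratorSettingsEXC_GRAD settings;   // inherits the KS defaults (gks_dtol, etc.)
      settings.include_weight_derivatives = false;  // skip grid-weight derivative terms
      return settings;
    }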
+ * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/external/hdf5_write.cxx b/src/external/hdf5_write.cxx index f782ef4a..cbf8bf04 100644 --- a/src/external/hdf5_write.cxx +++ b/src/external/hdf5_write.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/grid.cxx b/src/grid.cxx index 3f167270..fed7972f 100644 --- a/src/grid.cxx +++ b/src/grid.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/grid_factory.cxx b/src/grid_factory.cxx index 92cc5314..1836653e 100644 --- a/src/grid_factory.cxx +++ b/src/grid_factory.cxx @@ -1,16 +1,21 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ #include -#include -#include -#include -#include +#include +#include +#include +#include +#include #include #include @@ -30,6 +35,7 @@ Grid AtomicGridFactory::generate_grid( atomic_grid_variant gs, BatchSize bsz ) { Grid AtomicGridFactory::generate_unpruned_grid( RadialQuad rq, RadialSize nrad, AngularSize nang, RadialScale rscal, BatchSize bsz) { + using bk_type = IntegratorXX::Becke; using mk_type = IntegratorXX::MuraKnowles; using mhl_type = IntegratorXX::MurrayHandyLaming; using ta_type = IntegratorXX::TreutlerAhlrichs; @@ -38,6 +44,9 @@ Grid AtomicGridFactory::generate_unpruned_grid( RadialQuad rq, RadialSize nrad, ll_type ang_quad( nang.get() ); switch( rq ) { + case RadialQuad::Becke: + return generate_unpruned_grid( bk_type(nrad.get(), rscal.get()), + std::move(ang_quad), bsz ); case RadialQuad::MuraKnowles: return generate_unpruned_grid( mk_type(nrad.get(), rscal.get()), @@ -47,7 +56,7 @@ Grid AtomicGridFactory::generate_unpruned_grid( RadialQuad rq, RadialSize nrad, return generate_unpruned_grid( mhl_type(nrad.get(), rscal.get()), std::move(ang_quad), bsz ); - case RadialQuad::TreutlerAldrichs: + case RadialQuad::TreutlerAhlrichs: return generate_unpruned_grid( ta_type(nrad.get(), rscal.get()), std::move(ang_quad), bsz ); @@ -113,12 +122,18 @@ Grid AtomicGridFactory::generate_pruned_grid( RadialQuad rq, return generate_pruned_grid(std::move(rg), std::move(rgp), bsz); } - case RadialQuad::TreutlerAldrichs: + case RadialQuad::TreutlerAhlrichs: { auto [rg, rgp] = make_pruned_grid( nrad, pruning_regions, rscal ); return generate_pruned_grid(std::move(rg), std::move(rgp), bsz); } + case RadialQuad::Becke: + { + auto[rg, rgp] = + make_pruned_grid>( nrad, pruning_regions, rscal ); + return generate_pruned_grid(std::move(rg), 
std::move(rgp), bsz); + } default: GAUXC_GENERIC_EXCEPTION("Unsupported Radial Quadrature"); @@ -145,17 +160,18 @@ PrunedAtomicGridSpecification robust_psi4_pruning_scheme( // Look up order // XXX: THIS ONLY WORKS FOR LEBEDEV - using namespace IntegratorXX::detail::lebedev; + using angular_type = IntegratorXX::LebedevLaikov; + using traits = IntegratorXX::quadrature_traits; const auto asz = unp.angular_size.get(); - const auto base_order = algebraic_order_by_npts(asz); + const auto base_order = traits::algebraic_order_by_npts(asz); if( base_order < 0 ) GAUXC_GENERIC_EXCEPTION("Invalid Base Grid"); const auto med_order = - next_algebraic_order(base_order > 6 ? base_order-6 : base_order); + traits::next_algebraic_order(base_order > 6 ? base_order-6 : base_order); const auto low_order = 7; - AngularSize med_sz(npts_by_algebraic_order(med_order)); - AngularSize low_sz(npts_by_algebraic_order(low_order)); + AngularSize med_sz(traits::npts_by_algebraic_order(med_order)); + AngularSize low_sz(traits::npts_by_algebraic_order(low_order)); // Create Pruning Regions const size_t rsz = unp.radial_size.get(); @@ -183,9 +199,11 @@ PrunedAtomicGridSpecification treutler_pruning_scheme( // Look up order // XXX: THIS ONLY WORKS FOR LEBEDEV - using namespace IntegratorXX::detail::lebedev; - AngularSize med_sz(npts_by_algebraic_order(med_order)); - AngularSize low_sz(npts_by_algebraic_order(low_order)); + using angular_type = IntegratorXX::LebedevLaikov; + using traits = IntegratorXX::quadrature_traits; + + AngularSize med_sz(traits::npts_by_algebraic_order(med_order)); + AngularSize low_sz(traits::npts_by_algebraic_order(low_order)); // Create Pruning Regions const size_t rsz = unp.radial_size.get(); diff --git a/src/grid_impl.cxx b/src/grid_impl.cxx index 07bb4eaa..069dadba 100644 --- a/src/grid_impl.cxx +++ b/src/grid_impl.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/grid_impl.hpp b/src/grid_impl.hpp index 3566d9f7..29b88c9f 100644 --- a/src/grid_impl.hpp +++ b/src/grid_impl.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/load_balancer/CMakeLists.txt b/src/load_balancer/CMakeLists.txt index 457f2c6b..3dca6a6a 100644 --- a/src/load_balancer/CMakeLists.txt +++ b/src/load_balancer/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. 
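The grid_factory.cxx changes above add RadialQuad::Becke to both the unpruned and pruned generation paths and correct the TreutlerAldrichs spelling to TreutlerAhlrichs. A minimal sketch of requesting a Becke-mapped atomic grid through the factory, using the enum/strong-type API shown in the hunk; the header path is an assumption.

    // Sketch: build an unpruned atomic grid with the newly supported Becke radial mapping.
    #include <gauxc/grid_factory.hpp>   // assumed header exposing AtomicGridFactory

    GauXC::Grid make_becke_atom_grid() {
      using namespace GauXC;
      return AtomicGridFactory::generate_unpruned_grid(
        RadialQuad::Becke,   // new case in the radial-quadrature switch above
        RadialSize(75),      // 75 radial shells
        AngularSize(302),    // 302-point Lebedev-Laikov angular grid
        RadialScale(1.0),    // Becke mapping parameter (cf. default_bk_radial_scaling_factor later in this patch)
        BatchSize(512) );    // batching hint for task generation
    }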
# # See LICENSE.txt for details # diff --git a/src/load_balancer/device/CMakeLists.txt b/src/load_balancer/device/CMakeLists.txt index 6bd03e77..00b5dee2 100644 --- a/src/load_balancer/device/CMakeLists.txt +++ b/src/load_balancer/device/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/load_balancer/device/cuda/CMakeLists.txt b/src/load_balancer/device/cuda/CMakeLists.txt index 04632cf4..49e0d17c 100644 --- a/src/load_balancer/device/cuda/CMakeLists.txt +++ b/src/load_balancer/device/cuda/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/load_balancer/device/cuda/cuda_collision_detection.cu b/src/load_balancer/device/cuda/cuda_collision_detection.cu index 6292d67c..7da69c72 100644 --- a/src/load_balancer/device/cuda/cuda_collision_detection.cu +++ b/src/load_balancer/device/cuda/cuda_collision_detection.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/load_balancer/device/cuda/cuda_collision_detection.hpp b/src/load_balancer/device/cuda/cuda_collision_detection.hpp index 9b09712a..d7156496 100644 --- a/src/load_balancer/device/cuda/cuda_collision_detection.hpp +++ b/src/load_balancer/device/cuda/cuda_collision_detection.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/load_balancer/device/cuda/replicated_cuda_load_balancer.cxx b/src/load_balancer/device/cuda/replicated_cuda_load_balancer.cxx index 15d33900..af2199b7 100644 --- a/src/load_balancer/device/cuda/replicated_cuda_load_balancer.cxx +++ b/src/load_balancer/device/cuda/replicated_cuda_load_balancer.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/load_balancer/device/cuda/replicated_cuda_load_balancer.hpp b/src/load_balancer/device/cuda/replicated_cuda_load_balancer.hpp index 7e0a0d65..585edde3 100644 --- a/src/load_balancer/device/cuda/replicated_cuda_load_balancer.hpp +++ b/src/load_balancer/device/cuda/replicated_cuda_load_balancer.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/load_balancer/device/hip/CMakeLists.txt b/src/load_balancer/device/hip/CMakeLists.txt index e13db053..cd1e4aaf 100644 --- a/src/load_balancer/device/hip/CMakeLists.txt +++ b/src/load_balancer/device/hip/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/load_balancer/device/hip/hip_collision_detection.hip b/src/load_balancer/device/hip/hip_collision_detection.hip index 83ee78d6..89d0978f 100644 --- a/src/load_balancer/device/hip/hip_collision_detection.hip +++ b/src/load_balancer/device/hip/hip_collision_detection.hip @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/load_balancer/device/hip/hip_collision_detection.hpp b/src/load_balancer/device/hip/hip_collision_detection.hpp index db73bdd8..a191d18f 100644 --- a/src/load_balancer/device/hip/hip_collision_detection.hpp +++ b/src/load_balancer/device/hip/hip_collision_detection.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/load_balancer/device/hip/replicated_hip_load_balancer.cxx b/src/load_balancer/device/hip/replicated_hip_load_balancer.cxx index c4475157..ac693e1e 100644 --- a/src/load_balancer/device/hip/replicated_hip_load_balancer.cxx +++ b/src/load_balancer/device/hip/replicated_hip_load_balancer.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/load_balancer/device/hip/replicated_hip_load_balancer.hpp b/src/load_balancer/device/hip/replicated_hip_load_balancer.hpp index 7e0a0d65..585edde3 100644 --- a/src/load_balancer/device/hip/replicated_hip_load_balancer.hpp +++ b/src/load_balancer/device/hip/replicated_hip_load_balancer.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/load_balancer/device/load_balancer_device_factory.cxx b/src/load_balancer/device/load_balancer_device_factory.cxx index 606a6b57..e481f4f2 100644 --- a/src/load_balancer/device/load_balancer_device_factory.cxx +++ b/src/load_balancer/device/load_balancer_device_factory.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/load_balancer/device/load_balancer_device_factory.hpp b/src/load_balancer/device/load_balancer_device_factory.hpp index c28535c2..f61a3f44 100644 --- a/src/load_balancer/device/load_balancer_device_factory.hpp +++ b/src/load_balancer/device/load_balancer_device_factory.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/load_balancer/host/fillin_replicated_load_balancer.cxx b/src/load_balancer/host/fillin_replicated_load_balancer.cxx index 04e2f908..a84d40d3 100644 --- a/src/load_balancer/host/fillin_replicated_load_balancer.cxx +++ b/src/load_balancer/host/fillin_replicated_load_balancer.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/load_balancer/host/fillin_replicated_load_balancer.hpp b/src/load_balancer/host/fillin_replicated_load_balancer.hpp index c6df5f94..eb40cefb 100644 --- a/src/load_balancer/host/fillin_replicated_load_balancer.hpp +++ b/src/load_balancer/host/fillin_replicated_load_balancer.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). 
+ * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/load_balancer/host/load_balancer_host_factory.cxx b/src/load_balancer/host/load_balancer_host_factory.cxx index 94db35de..f69d7fd9 100644 --- a/src/load_balancer/host/load_balancer_host_factory.cxx +++ b/src/load_balancer/host/load_balancer_host_factory.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/load_balancer/host/load_balancer_host_factory.hpp b/src/load_balancer/host/load_balancer_host_factory.hpp index d02e5c03..ae878678 100644 --- a/src/load_balancer/host/load_balancer_host_factory.hpp +++ b/src/load_balancer/host/load_balancer_host_factory.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/load_balancer/host/petite_replicated_load_balancer.cxx b/src/load_balancer/host/petite_replicated_load_balancer.cxx index 0bef2b83..3ecc53c1 100644 --- a/src/load_balancer/host/petite_replicated_load_balancer.cxx +++ b/src/load_balancer/host/petite_replicated_load_balancer.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/load_balancer/host/petite_replicated_load_balancer.hpp b/src/load_balancer/host/petite_replicated_load_balancer.hpp index 02b8a8bf..8c339699 100644 --- a/src/load_balancer/host/petite_replicated_load_balancer.hpp +++ b/src/load_balancer/host/petite_replicated_load_balancer.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/load_balancer/host/replicated_host_load_balancer.cxx b/src/load_balancer/host/replicated_host_load_balancer.cxx index 5a3bb9c9..8f05f186 100644 --- a/src/load_balancer/host/replicated_host_load_balancer.cxx +++ b/src/load_balancer/host/replicated_host_load_balancer.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). 
+ * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -123,7 +127,7 @@ std::vector< XCTask > HostReplicatedLoadBalancer::create_local_tasks_() const { } // Loop over Atoms -//return local_work; +// return local_work; // Lexicographic ordering of tasks auto task_order = []( const auto& a, const auto& b ) { diff --git a/src/load_balancer/host/replicated_host_load_balancer.hpp b/src/load_balancer/host/replicated_host_load_balancer.hpp index dfd8f319..9b4d0a08 100644 --- a/src/load_balancer/host/replicated_host_load_balancer.hpp +++ b/src/load_balancer/host/replicated_host_load_balancer.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/load_balancer/load_balancer.cxx b/src/load_balancer/load_balancer.cxx index 9329fef8..637e1d8b 100644 --- a/src/load_balancer/load_balancer.cxx +++ b/src/load_balancer/load_balancer.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -51,6 +55,10 @@ const util::Timer& LoadBalancer::get_timings() const { return pimpl_->get_timings(); } +size_t LoadBalancer::total_npts() const { + if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); + return pimpl_->total_npts(); +} size_t LoadBalancer::max_npts() const { if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); return pimpl_->max_npts(); diff --git a/src/load_balancer/load_balancer_factory.cxx b/src/load_balancer/load_balancer_factory.cxx index b14ddbee..bdc2898e 100644 --- a/src/load_balancer/load_balancer_factory.cxx +++ b/src/load_balancer/load_balancer_factory.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/load_balancer/load_balancer_impl.cxx b/src/load_balancer/load_balancer_impl.cxx index 06dbbd19..f6b853da 100644 --- a/src/load_balancer/load_balancer_impl.cxx +++ b/src/load_balancer/load_balancer_impl.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -59,6 +63,14 @@ const util::Timer& LoadBalancerImpl::get_timings() const { } +size_t LoadBalancerImpl::total_npts() const { + + return std::accumulate( local_tasks_.cbegin(), local_tasks_.cend(), 0ul, + []( const auto& a, const auto& b ) { + return a + b.points.size(); + }); + +} size_t LoadBalancerImpl::max_npts() const { if( not local_tasks_.size() ) return 0ul; diff --git a/src/load_balancer/load_balancer_impl.hpp b/src/load_balancer/load_balancer_impl.hpp index 566279a1..53c75865 100644 --- a/src/load_balancer/load_balancer_impl.hpp +++ b/src/load_balancer/load_balancer_impl.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -63,6 +67,7 @@ class LoadBalancerImpl { const util::Timer& get_timings() const; + size_t total_npts() const; size_t max_npts() const; size_t max_nbe() const; size_t max_npts_x_nbe() const; diff --git a/src/load_balancer/rebalance.cxx b/src/load_balancer/rebalance.cxx index 91898b24..3879f199 100644 --- a/src/load_balancer/rebalance.cxx +++ b/src/load_balancer/rebalance.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/molecular_weights/CMakeLists.txt b/src/molecular_weights/CMakeLists.txt index be0102c9..e9ce4e19 100644 --- a/src/molecular_weights/CMakeLists.txt +++ b/src/molecular_weights/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/molecular_weights/device/CMakeLists.txt b/src/molecular_weights/device/CMakeLists.txt index 89f2e279..15f5fe67 100644 --- a/src/molecular_weights/device/CMakeLists.txt +++ b/src/molecular_weights/device/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. 
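LoadBalancer::total_npts and its LoadBalancerImpl counterpart above sum the point counts of all local tasks with std::accumulate. The same idiom is shown in isolation below, with a stand-in Task type in place of GauXC::XCTask.

    // Self-contained illustration of the accumulate pattern used by total_npts above.
    #include <cstddef>
    #include <numeric>
    #include <vector>

    struct Task { std::vector<double> points; };   // stand-in for GauXC::XCTask (assumption)

    std::size_t total_npts( const std::vector<Task>& tasks ) {
      return std::accumulate( tasks.cbegin(), tasks.cend(), std::size_t(0),
        []( std::size_t acc, const Task& t ) { return acc + t.points.size(); } );
    }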
# # See LICENSE.txt for details # diff --git a/src/molecular_weights/device/device_molecular_weights.cxx b/src/molecular_weights/device/device_molecular_weights.cxx index 42d81d7b..c5bcce5c 100644 --- a/src/molecular_weights/device/device_molecular_weights.cxx +++ b/src/molecular_weights/device/device_molecular_weights.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -37,7 +41,7 @@ void DeviceMolecularWeights::modify_weights( LoadBalancer& lb ) const { auto task_comparator = []( const XCTask& a, const XCTask& b ) { return (a.points.size() * a.bfn_screening.nbe) > (b.points.size() * b.bfn_screening.nbe); }; - std::sort(task_begin, task_end, task_comparator ); + std::stable_sort(task_begin, task_end, task_comparator ); const auto& mol = lb.molecule(); const auto natoms = mol.natoms(); @@ -79,6 +83,7 @@ void DeviceMolecularWeights::modify_weights( LoadBalancer& lb ) const { rt.device_backend()->master_queue_synchronize(); lb.state().modified_weights_are_stored = true; + lb.state().weight_alg = this->settings_.weight_alg; } diff --git a/src/molecular_weights/device/device_molecular_weights.hpp b/src/molecular_weights/device/device_molecular_weights.hpp index 69c5da11..d4cd202d 100644 --- a/src/molecular_weights/device/device_molecular_weights.hpp +++ b/src/molecular_weights/device/device_molecular_weights.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/molecular_weights/host/CMakeLists.txt b/src/molecular_weights/host/CMakeLists.txt index d4399b51..889f10e5 100644 --- a/src/molecular_weights/host/CMakeLists.txt +++ b/src/molecular_weights/host/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/molecular_weights/host/host_molecular_weights.cxx b/src/molecular_weights/host/host_molecular_weights.cxx index efe075fb..e722d22b 100644 --- a/src/molecular_weights/host/host_molecular_weights.cxx +++ b/src/molecular_weights/host/host_molecular_weights.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -25,7 +29,7 @@ void HostMolecularWeights::modify_weights( LoadBalancer& lb ) const { auto task_comparator = []( const XCTask& a, const XCTask& b ) { return (a.points.size() * a.bfn_screening.nbe) > (b.points.size() * b.bfn_screening.nbe); }; - std::sort( tasks.begin(), tasks.end(), task_comparator ); + std::stable_sort( tasks.begin(), tasks.end(), task_comparator ); // Modify the weights const auto& mol = lb.molecule(); @@ -34,6 +38,7 @@ void HostMolecularWeights::modify_weights( LoadBalancer& lb ) const { tasks.begin(), tasks.end() ); lb.state().modified_weights_are_stored = true; + lb.state().weight_alg = this->settings_.weight_alg; } } diff --git a/src/molecular_weights/host/host_molecular_weights.hpp b/src/molecular_weights/host/host_molecular_weights.hpp index 4ce87b0d..2a037951 100644 --- a/src/molecular_weights/host/host_molecular_weights.hpp +++ b/src/molecular_weights/host/host_molecular_weights.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/molecular_weights/molecular_weights.cxx b/src/molecular_weights/molecular_weights.cxx index e0c4a660..d65ccd90 100644 --- a/src/molecular_weights/molecular_weights.cxx +++ b/src/molecular_weights/molecular_weights.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/molecular_weights/molecular_weights_impl.hpp b/src/molecular_weights/molecular_weights_impl.hpp index ba838e34..7c4b5ed6 100644 --- a/src/molecular_weights/molecular_weights_impl.hpp +++ b/src/molecular_weights/molecular_weights_impl.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/molgrid.cxx b/src/molgrid.cxx index cc138b6b..f9ee6e51 100644 --- a/src/molgrid.cxx +++ b/src/molgrid.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
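Both weight drivers above switch from std::sort to std::stable_sort when ordering tasks by estimated cost, so tasks with identical cost keep their original relative order and the weight pass becomes reproducible across runs. A small demonstration using the same cost metric as the patch; the TaskCost type is illustrative.

    // Why stable_sort: ties in (npts * nbe) keep their input order, unlike with std::sort.
    #include <algorithm>
    #include <cstddef>
    #include <vector>

    struct TaskCost { std::size_t npts, nbe, id; };

    void order_by_cost( std::vector<TaskCost>& tasks ) {
      auto task_comparator = []( const TaskCost& a, const TaskCost& b ) {
        return (a.npts * a.nbe) > (b.npts * b.nbe);   // cost metric from the patch
      };
      // std::sort may permute equal-cost tasks arbitrarily between runs and implementations;
      // std::stable_sort preserves their original order, giving deterministic output.
      std::stable_sort( tasks.begin(), tasks.end(), task_comparator );
    }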
* * See LICENSE.txt for details */ diff --git a/src/molgrid_defaults.cxx b/src/molgrid_defaults.cxx index c1d0fc52..61b66830 100644 --- a/src/molgrid_defaults.cxx +++ b/src/molgrid_defaults.cxx @@ -1,13 +1,17 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ #include #include -#include +#include namespace GauXC { @@ -83,11 +87,19 @@ RadialScale default_mhl_radial_scaling_factor( AtomicNumber _Z ) { return RadialScale( default_atomic_radius(_Z) * fac ); } +RadialScale default_bk_radial_scaling_factor( AtomicNumber _Z ) { + auto Z = _Z.get(); + const double fac = (Z!=1) ? 0.5 : 1.0; + return RadialScale( default_atomic_radius(_Z) * fac ); +} + RadialScale default_radial_scaling_factor(RadialQuad rq, AtomicNumber Z) { if( rq == RadialQuad::MuraKnowles ) return default_mk_radial_scaling_factor(Z); - else if( rq == RadialQuad::TreutlerAldrichs ) + else if( rq == RadialQuad::TreutlerAhlrichs ) return default_ta_radial_scaling_factor(Z); + else if( rq == RadialQuad::Becke ) + return default_bk_radial_scaling_factor(Z); else // MHL return default_mhl_radial_scaling_factor(Z); } diff --git a/src/molgrid_impl.cxx b/src/molgrid_impl.cxx index 188f6473..c6939e3e 100644 --- a/src/molgrid_impl.cxx +++ b/src/molgrid_impl.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/molgrid_impl.hpp b/src/molgrid_impl.hpp index 5455f21a..e8c0590e 100644 --- a/src/molgrid_impl.hpp +++ b/src/molgrid_impl.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/molmeta.cxx b/src/molmeta.cxx index fc770a9d..3bad9987 100644 --- a/src/molmeta.cxx +++ b/src/molmeta.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/reduction_driver/CMakeLists.txt b/src/reduction_driver/CMakeLists.txt index c991ad18..07ca9494 100644 --- a/src/reduction_driver/CMakeLists.txt +++ b/src/reduction_driver/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. 
+# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/reduction_driver/device/CMakeLists.txt b/src/reduction_driver/device/CMakeLists.txt index 68f55c86..95d6575d 100644 --- a/src/reduction_driver/device/CMakeLists.txt +++ b/src/reduction_driver/device/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/reduction_driver/device/device_reduction_driver.cxx b/src/reduction_driver/device/device_reduction_driver.cxx index ea014857..2395e722 100644 --- a/src/reduction_driver/device/device_reduction_driver.cxx +++ b/src/reduction_driver/device/device_reduction_driver.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/reduction_driver/device/device_reduction_driver.hpp b/src/reduction_driver/device/device_reduction_driver.hpp index 5b63615e..b26dafe6 100644 --- a/src/reduction_driver/device/device_reduction_driver.hpp +++ b/src/reduction_driver/device/device_reduction_driver.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/reduction_driver/device/nccl_reduction_driver.cxx b/src/reduction_driver/device/nccl_reduction_driver.cxx index 20848f4a..7c314805 100644 --- a/src/reduction_driver/device/nccl_reduction_driver.cxx +++ b/src/reduction_driver/device/nccl_reduction_driver.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/reduction_driver/device/nccl_reduction_driver.hpp b/src/reduction_driver/device/nccl_reduction_driver.hpp index 9db4e40f..529c6c9a 100644 --- a/src/reduction_driver/device/nccl_reduction_driver.hpp +++ b/src/reduction_driver/device/nccl_reduction_driver.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/reduction_driver/host/CMakeLists.txt b/src/reduction_driver/host/CMakeLists.txt index 52729332..a3bc5fe8 100644 --- a/src/reduction_driver/host/CMakeLists.txt +++ b/src/reduction_driver/host/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/reduction_driver/host/basic_mpi_reduction_driver.cxx b/src/reduction_driver/host/basic_mpi_reduction_driver.cxx index a8de7975..904f7caf 100644 --- a/src/reduction_driver/host/basic_mpi_reduction_driver.cxx +++ b/src/reduction_driver/host/basic_mpi_reduction_driver.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/reduction_driver/host/basic_mpi_reduction_driver.hpp b/src/reduction_driver/host/basic_mpi_reduction_driver.hpp index 7c3b231f..8172edc8 100644 --- a/src/reduction_driver/host/basic_mpi_reduction_driver.hpp +++ b/src/reduction_driver/host/basic_mpi_reduction_driver.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/reduction_driver/host/host_reduction_driver.cxx b/src/reduction_driver/host/host_reduction_driver.cxx index f6accde8..fe288602 100644 --- a/src/reduction_driver/host/host_reduction_driver.cxx +++ b/src/reduction_driver/host/host_reduction_driver.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/reduction_driver/host/host_reduction_driver.hpp b/src/reduction_driver/host/host_reduction_driver.hpp index 382d62e2..fe661de6 100644 --- a/src/reduction_driver/host/host_reduction_driver.hpp +++ b/src/reduction_driver/host/host_reduction_driver.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/reduction_driver/reduction_driver.cxx b/src/reduction_driver/reduction_driver.cxx index bc09d6a2..26a57415 100644 --- a/src/reduction_driver/reduction_driver.cxx +++ b/src/reduction_driver/reduction_driver.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/reduction_driver/reduction_driver_factory.cxx b/src/reduction_driver/reduction_driver_factory.cxx index d11a492c..8b3d5f34 100644 --- a/src/reduction_driver/reduction_driver_factory.cxx +++ b/src/reduction_driver/reduction_driver_factory.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/reduction_driver/reduction_driver_impl.cxx b/src/reduction_driver/reduction_driver_impl.cxx index 4f4e24c6..96351265 100644 --- a/src/reduction_driver/reduction_driver_impl.cxx +++ b/src/reduction_driver/reduction_driver_impl.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/reduction_driver/reduction_driver_impl.hpp b/src/reduction_driver/reduction_driver_impl.hpp index d5f8bafb..e4980a9a 100644 --- a/src/reduction_driver/reduction_driver_impl.hpp +++ b/src/reduction_driver/reduction_driver_impl.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/runtime_environment/CMakeLists.txt b/src/runtime_environment/CMakeLists.txt index dd43e5c8..2ef10327 100644 --- a/src/runtime_environment/CMakeLists.txt +++ b/src/runtime_environment/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. 
# # See LICENSE.txt for details # diff --git a/src/runtime_environment/device/CMakeLists.txt b/src/runtime_environment/device/CMakeLists.txt index 360783db..151a5889 100644 --- a/src/runtime_environment/device/CMakeLists.txt +++ b/src/runtime_environment/device/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/runtime_environment/device/cuda/CMakeLists.txt b/src/runtime_environment/device/cuda/CMakeLists.txt index f397a2ca..50ea945c 100644 --- a/src/runtime_environment/device/cuda/CMakeLists.txt +++ b/src/runtime_environment/device/cuda/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/runtime_environment/device/cuda/cuda_backend.cxx b/src/runtime_environment/device/cuda/cuda_backend.cxx index 84bd457f..610f33f4 100644 --- a/src/runtime_environment/device/cuda/cuda_backend.cxx +++ b/src/runtime_environment/device/cuda/cuda_backend.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/runtime_environment/device/cuda/cuda_backend.hpp b/src/runtime_environment/device/cuda/cuda_backend.hpp index 6401803b..8e47a6a9 100644 --- a/src/runtime_environment/device/cuda/cuda_backend.hpp +++ b/src/runtime_environment/device/cuda/cuda_backend.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/runtime_environment/device/device_backend.hpp b/src/runtime_environment/device/device_backend.hpp index 5c10bf90..594b7988 100644 --- a/src/runtime_environment/device/device_backend.hpp +++ b/src/runtime_environment/device/device_backend.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/runtime_environment/device/device_blas_handle.hpp b/src/runtime_environment/device/device_blas_handle.hpp index e0faaea7..76368f34 100644 --- a/src/runtime_environment/device/device_blas_handle.hpp +++ b/src/runtime_environment/device/device_blas_handle.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/runtime_environment/device/device_queue.hpp b/src/runtime_environment/device/device_queue.hpp index 9fdbacff..51eba1c5 100644 --- a/src/runtime_environment/device/device_queue.hpp +++ b/src/runtime_environment/device/device_queue.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/runtime_environment/device/device_runtime_environment.cxx b/src/runtime_environment/device/device_runtime_environment.cxx index 41d841b8..88998bd9 100644 --- a/src/runtime_environment/device/device_runtime_environment.cxx +++ b/src/runtime_environment/device/device_runtime_environment.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -67,5 +71,11 @@ DeviceBackend* DeviceRuntimeEnvironment::device_backend() const { bool DeviceRuntimeEnvironment::owns_memory() const { return device_runtime_pimpl_cast(pimpl_.get())->owns_memory(); } +void DeviceRuntimeEnvironment::release_buffer() { + device_runtime_pimpl_cast(pimpl_.get())->release_buffer(); +} +void DeviceRuntimeEnvironment::set_buffer(void* p, size_t sz) { + device_runtime_pimpl_cast(pimpl_.get())->set_buffer(p, sz); +} } diff --git a/src/runtime_environment/device/device_runtime_environment_impl.hpp b/src/runtime_environment/device/device_runtime_environment_impl.hpp index 6489e11c..9831c5c2 100644 --- a/src/runtime_environment/device/device_runtime_environment_impl.hpp +++ b/src/runtime_environment/device/device_runtime_environment_impl.hpp @@ -1,13 +1,18 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ #pragma once #include "../runtime_environment_impl.hpp" #include "device_backend.hpp" +#include namespace GauXC::detail { @@ -63,6 +68,23 @@ class DeviceRuntimeEnvironmentImpl : public RuntimeEnvironmentImpl { inline size_t device_memory_size() const { return device_memory_size_; } inline bool owns_memory() const { return i_own_this_memory_; } + inline void release_buffer() { + if(i_own_this_memory_ and device_memory_ and device_memory_size_) { + device_backend_->free_device_buffer(device_memory_); + } else { + GAUXC_GENERIC_EXCEPTION("GauXC Cannot Release A Buffer It Does Not Own"); + } + } + + inline void set_buffer(void* p, size_t sz) { + if(owns_memory()) { + release_buffer(); + i_own_this_memory_ = false; + } + + device_memory_ = p; + device_memory_size_ = sz; + } }; diff --git a/src/runtime_environment/device/hip/CMakeLists.txt b/src/runtime_environment/device/hip/CMakeLists.txt index df97901b..5fd50fc9 100644 --- a/src/runtime_environment/device/hip/CMakeLists.txt +++ b/src/runtime_environment/device/hip/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/runtime_environment/device/hip/hip_backend.cxx b/src/runtime_environment/device/hip/hip_backend.cxx index 48c6732a..69c3fd28 100644 --- a/src/runtime_environment/device/hip/hip_backend.cxx +++ b/src/runtime_environment/device/hip/hip_backend.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/runtime_environment/device/hip/hip_backend.hpp b/src/runtime_environment/device/hip/hip_backend.hpp index 38919bec..6b90063f 100644 --- a/src/runtime_environment/device/hip/hip_backend.hpp +++ b/src/runtime_environment/device/hip/hip_backend.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/runtime_environment/device_specific/cublas_util.hpp b/src/runtime_environment/device_specific/cublas_util.hpp index 1955740d..10fa35be 100644 --- a/src/runtime_environment/device_specific/cublas_util.hpp +++ b/src/runtime_environment/device_specific/cublas_util.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/runtime_environment/device_specific/cuda_device_constants.hpp b/src/runtime_environment/device_specific/cuda_device_constants.hpp index 53c86ac5..3b4ac8e4 100644 --- a/src/runtime_environment/device_specific/cuda_device_constants.hpp +++ b/src/runtime_environment/device_specific/cuda_device_constants.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/runtime_environment/device_specific/cuda_util.hpp b/src/runtime_environment/device_specific/cuda_util.hpp index d5c632ef..7d133e5d 100644 --- a/src/runtime_environment/device_specific/cuda_util.hpp +++ b/src/runtime_environment/device_specific/cuda_util.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/runtime_environment/device_specific/hip_device_constants.hpp b/src/runtime_environment/device_specific/hip_device_constants.hpp index 3a79fdf3..38ff3878 100644 --- a/src/runtime_environment/device_specific/hip_device_constants.hpp +++ b/src/runtime_environment/device_specific/hip_device_constants.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/runtime_environment/device_specific/hip_util.hpp b/src/runtime_environment/device_specific/hip_util.hpp index 797c5b8f..61ea9e0c 100644 --- a/src/runtime_environment/device_specific/hip_util.hpp +++ b/src/runtime_environment/device_specific/hip_util.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/runtime_environment/device_specific/hipblas_util.hpp b/src/runtime_environment/device_specific/hipblas_util.hpp index 38351ee1..d9b324df 100644 --- a/src/runtime_environment/device_specific/hipblas_util.hpp +++ b/src/runtime_environment/device_specific/hipblas_util.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). 
+ * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/runtime_environment/device_specific/magma_util.hpp b/src/runtime_environment/device_specific/magma_util.hpp index 7cf79369..ca4f4f17 100644 --- a/src/runtime_environment/device_specific/magma_util.hpp +++ b/src/runtime_environment/device_specific/magma_util.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/runtime_environment/device_specific/nccl_util.hpp b/src/runtime_environment/device_specific/nccl_util.hpp index 78938a0b..f4b87839 100644 --- a/src/runtime_environment/device_specific/nccl_util.hpp +++ b/src/runtime_environment/device_specific/nccl_util.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/runtime_environment/runtime_environment.cxx b/src/runtime_environment/runtime_environment.cxx index 83d9e365..21d3910e 100644 --- a/src/runtime_environment/runtime_environment.cxx +++ b/src/runtime_environment/runtime_environment.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/runtime_environment/runtime_environment_impl.hpp b/src/runtime_environment/runtime_environment_impl.hpp index a68129f8..6afa888f 100644 --- a/src/runtime_environment/runtime_environment_impl.hpp +++ b/src/runtime_environment/runtime_environment_impl.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/CMakeLists.txt b/src/xc_integrator/CMakeLists.txt index c8adc6ad..9bc36d25 100644 --- a/src/xc_integrator/CMakeLists.txt +++ b/src/xc_integrator/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. 
# # See LICENSE.txt for details # diff --git a/src/xc_integrator/integrator_util/CMakeLists.txt b/src/xc_integrator/integrator_util/CMakeLists.txt index 92b91a52..0a0edbe8 100644 --- a/src/xc_integrator/integrator_util/CMakeLists.txt +++ b/src/xc_integrator/integrator_util/CMakeLists.txt @@ -1,8 +1,12 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # -target_sources( gauxc PRIVATE integrator_common.cxx integral_bounds.cxx exx_screening.cxx ) +target_sources( gauxc PRIVATE integrator_common.cxx integral_bounds.cxx exx_screening.cxx spherical_harmonics.cxx ) diff --git a/src/xc_integrator/integrator_util/exx_screening.cxx b/src/xc_integrator/integrator_util/exx_screening.cxx index ccd716eb..5c7efcd1 100644 --- a/src/xc_integrator/integrator_util/exx_screening.cxx +++ b/src/xc_integrator/integrator_util/exx_screening.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/integrator_util/exx_screening.hpp b/src/xc_integrator/integrator_util/exx_screening.hpp index 176301ca..5c55c3a4 100644 --- a/src/xc_integrator/integrator_util/exx_screening.hpp +++ b/src/xc_integrator/integrator_util/exx_screening.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/integrator_util/integral_bounds.cxx b/src/xc_integrator/integrator_util/integral_bounds.cxx index e78adb05..680c3538 100644 --- a/src/xc_integrator/integrator_util/integral_bounds.cxx +++ b/src/xc_integrator/integrator_util/integral_bounds.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/integrator_util/integral_bounds.hpp b/src/xc_integrator/integrator_util/integral_bounds.hpp index 1f1e055a..02c6cae5 100644 --- a/src/xc_integrator/integrator_util/integral_bounds.hpp +++ b/src/xc_integrator/integrator_util/integral_bounds.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). 
+ * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/integrator_util/integrator_common.cxx b/src/xc_integrator/integrator_util/integrator_common.cxx index b2f2a359..e919d917 100644 --- a/src/xc_integrator/integrator_util/integrator_common.cxx +++ b/src/xc_integrator/integrator_util/integrator_common.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/integrator_util/integrator_common.hpp b/src/xc_integrator/integrator_util/integrator_common.hpp index 67e4bab9..079e3d60 100644 --- a/src/xc_integrator/integrator_util/integrator_common.hpp +++ b/src/xc_integrator/integrator_util/integrator_common.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */
diff --git a/src/xc_integrator/integrator_util/spherical_harmonics.cxx b/src/xc_integrator/integrator_util/spherical_harmonics.cxx
new file mode 100644
index 00000000..be7cf84a
--- /dev/null
+++ b/src/xc_integrator/integrator_util/spherical_harmonics.cxx
@@ -0,0 +1,166 @@
+#include "spherical_harmonics.hpp"
+// Computes the normalization constants N(l,m) for spherical harmonics up to degree lmax
+// N(l,m) = sqrt((2l + 1) / (4π) * ( (l - m)! / (l + m)! ) )
+// for m = 0, N(l,0) = sqrt((2l + 1) / (4π))
+// for m > 0, N(l,m) = -N(l,m-1) / sqrt((l - m + 1) * (l + m))
+std::vector<double> sph_nlm(const int lmax) {
+  std::vector<double> nlm((lmax + 1) * (lmax + 1), 0.0);
+  for (int l = 0; l <= lmax; ++l) {
+    // For m = 0
+    int ind = l*l+l;
+    double tmp = std::sqrt( 4.0 * M_PI / (2 * l + 1) );
+    nlm[ind] = 1 / tmp;
+    // For m != 0
+    tmp = nlm[ind] * std::sqrt(2.0);
+    for (int m = 1; m <= l; ++m) {
+      tmp = -tmp / std::sqrt(static_cast<double>((l - m + 1) * (l + m)));
+      nlm[ind + m ] = tmp;
+    }
+  }
+  return nlm;
+}
+
+// Computes associated Legendre polynomials P_l^m(cos(theta)) up to degree lmax
+// Input:
+//  - cos_theta: cos(theta), where -1 <= cos_theta <= 1
+//  - sin_theta: sin(theta), where 0 <= sin_theta <= 1
+//  - lmax: maximum degree of the polynomials to compute, lmax >= 0
+// Output:
+//  - Returns a vector with values of associated Legendre polynomials, flattened to 1D with size (lmax+1)*(lmax+1)
+std::vector<double> sph_plm (const double cos_theta, const double sin_theta, const int lmax) {
+  std::vector<double> plms((lmax + 1) * (lmax + 1), 0.0);
+
+  // Base cases
+  plms[0] = 1.0;        // P_0^0 = 1
+  if (lmax == 0) return plms;
+
+  plms[2] = cos_theta;  // P_1^0 = cos(theta)
+  plms[3] = -sin_theta; // P_1^1 = -sin(theta)
+  if (lmax == 1) return plms;
+
+  double cos_theta2 = cos_theta * cos_theta;
+  plms[6] = 1.5 * cos_theta2 - 0.5;      // P_2^0 (cos(theta)) = 1.5 * cos^2(theta) - 0.5, idx = 2*2 + 2 + 0 = 6
+  plms[7] = -3 * sin_theta * cos_theta;  // P_2^1 (cos(theta)) = -3 * sin(theta) * cos(theta)
+  plms[8] = 3 * sin_theta * sin_theta;   // P_2^2 (cos(theta)) = 3 * sin^2(theta)
+  if (lmax == 2) return plms;
+
+  plms[12] = 2.5 * cos_theta2 * cos_theta - 1.5 * cos_theta;   // P_3^0 (cos(theta)) = 2.5 * cos^3(theta) - 1.5 * cos(theta)
+  plms[13] = -7.5 * cos_theta2 * sin_theta + 1.5 * sin_theta ; // P_3^1 (cos(theta)) = -7.5 * cos^2(theta) * sin(theta) + 1.5 * sin(theta)
+  plms[14] = -5.0 * sin_theta * plms[7];                       // P_3^2 (cos(theta)) = -5.0 * sin(theta) * P_2^1 (cos(theta))
+  plms[15] = -5.0 * sin_theta * plms[8];                       // P_3^3 (cos(theta)) = -5.0 * sin(theta) * P_2^2 (cos(theta))
+  if (lmax == 3) return plms;
+  // Recurrence calculation for larger l
+  for (int l = 4; l <= lmax; ++l) {
+    double work = (2.0 * l - 1) * cos_theta;
+    for (int m = 0; m < l; ++m) {
+      int ind = l * l + l + m;
+      int pl1m_ind = (l - 1) * (l - 1) + l - 1 + m;
+      int pl2m_ind = (l - 2) * (l - 2) + l - 2 + m;
+      plms[ind] = (work * plms[pl1m_ind] - (l + m - 1) * plms[pl2m_ind]) / (l - m);
+    }
+    // Special case for m = l, P_l^l = -sin_theta * (2*l - 1) * P_{l-1}^{l-1}
+    plms[(l+1)*(l+1) - 1] = -sin_theta * (2 * (l - 1) + 1) * plms[l*l-1];
+  }
+  return plms;
+}
+
+// Computes spherical harmonics Y_l^m(theta, phi) = N(l,m) P_l^m(cos(theta)) e^(i*m*phi)
+// up to degree lmax at point x, with scaling factors nlm
+//  - Writes (lmax+1)*(lmax+1) values into ylms
+void sph_legendre(const int lmax, const std::array<double, 3> x, const std::vector<double>& nlm, double* ylms) {
+  assert(x.size() == 3);
+  double rho = sqrt(x[0] * x[0] + x[1] * x[1] + x[2] * x[2]);
+  if (rho == 0.0) {
+    return;
+  }
+  double sin_theta = sqrt(x[0] * x[0] + x[1] * x[1]) / rho; // sin(theta) = r_xy / rho
+  if (sin_theta != 0.0) {
+    double cos_theta = x[2] / rho;
+    std::vector<double> plm = sph_plm(cos_theta, sin_theta, lmax);
+    for (int l = 0; l <= lmax; l++) {
+      int ind = l * l + l;
+      ylms[ind] = plm[ind] * nlm[ind]; // m = 0 (cos(0*phi) = 1)
+      for (int m = 1; m <= l; ++m) {
+        ylms[ind + m] = plm[ind + m] * nlm[ind + m];
+        ylms[ind - m] = ylms[ind + m];
+      }
+    }
+  } else {
+    // x = 0, y = 0, z != 0
+    double cos_theta = (x[2] > 0.0) ? 1.0 : -1.0;
+    for (int l = 0; l <= lmax; l ++) {
+      int ind = l * l + l;
+      ylms[ind] = nlm[ind];
+      if (l % 2 != 0) {
+        ylms[ind] *= cos_theta;
+      }
+    }
+  }
+}
+
+// compute scaled spherical harmonics, with precomputed normalization factors
+//    4π     |x - a|^l
+//  ------  ----------- Y_l^m(x - a)
+//  2l + 1      r^l
+void scaled_ylm_new(const int lmax, const std::array<double, 3> x, const std::array<double, 3> a, const double r, const std::vector<double>& nlm, double* ylm) {
+  std::array<double, 3> delta = {x[0] - a[0], x[1] - a[1], x[2] - a[2]};
+  double dnorm = sqrt(delta[0]*delta[0] + delta[1]*delta[1] + delta[2]*delta[2]);
+  assert(dnorm != 0.0);
+  std::array<double, 3> delta_norm = {delta[0] / dnorm, delta[1] / dnorm, delta[2] / dnorm};
+  double phi = atan2(delta_norm[1], delta_norm[0]);
+  sph_legendre(lmax, delta_norm, nlm, ylm);
+  for (int l = 0; l <= lmax; l++) {
+    double ratio = pow(dnorm / r, l) * 4.0 * M_PI / (2 * l + 1);
+    for (int m = -l; m <= l; m++) {
+      int ind = l * l + l + m;
+      if (m == 0) {
+        ylm[ind] *= ratio;
+      } else if (m < 0) {
+        ylm[ind] *= - ratio * sin(m * phi);
+      } else {
+        ylm[ind] *= ratio * cos(m * phi);
+      }
+    }
+  }
+}
+
+// compute scaled spherical harmonics, with standard library functions
+std::vector<double> scaled_ylm_std(int lmax, std::array<double, 3> x, std::array<double, 3> a, double r) {
+
+  std::vector<double> delta = {x[0] - a[0], x[1] - a[1], x[2] - a[2]};
+  double dnorm = sqrt(delta[0]*delta[0] + delta[1]*delta[1] + delta[2]*delta[2]);
+  assert(dnorm != 0.0);
+  std::vector<double> delta_norm = {delta[0] / dnorm, delta[1] / dnorm, delta[2] / dnorm};
+
+  double rho = sqrt(delta_norm[0] * delta_norm[0] + delta_norm[1] * delta_norm[1] + delta_norm[2] * delta_norm[2]);
+  double theta = acos(delta_norm[2] / rho);
+  double phi = atan2(delta_norm[1], delta_norm[0]);
+
+  std::vector<double> ylm((lmax + 1) * (lmax + 1), 0.0);
+  for (int l = 0; l <= lmax; l++) {
+    double ratio = pow(dnorm / r, l) * 4.0 * M_PI / (2 * l + 1);
+    for (int m = 0; m <= l; m++) {
+      double sph = std::sph_legendre(l, m, theta) * ratio;
+      if (m == 0) {
+        ylm[l * l + l] = sph;
+      } else {
+        if (m % 2 != 0) {
+          sph *= -1;
+        }
+        sph *= sqrt(2.0);
+        ylm[l * l + l - m ] = sph * sin(m * phi);
+        ylm[l * l + l + m ] = sph * cos(m * phi);
+      }
+    }
+  }
+  return ylm;
+}
+
+void scaled_ylm_matrix(const int lmax, const double* points, const int32_t npts, const std::array<double, 3> center, const double radius, double* ylm_matrix) {
+  int nharmonics = (lmax + 1) * (lmax + 1);
+  auto nlm = sph_nlm(lmax);
+  for (int i = 0; i < npts; ++i) {
+    const std::array<double, 3> x = {points[3 * i], points[3 * i + 1], points[3 * i + 2]};
+    scaled_ylm_new(lmax, x, center, radius, nlm, ylm_matrix + i * nharmonics);
+  }
+}
\ No newline at end of file
diff --git a/src/xc_integrator/integrator_util/spherical_harmonics.hpp b/src/xc_integrator/integrator_util/spherical_harmonics.hpp
new file mode 100644
index 00000000..7ce495d8
--- /dev/null
+++ b/src/xc_integrator/integrator_util/spherical_harmonics.hpp
@@ -0,0 +1,7 @@
+#include <vector>
+#include <array>
+#include <cmath>
+#include <cstdint>
+
+
+void scaled_ylm_matrix(const int lmax, const double* points, const int32_t npts, const std::array<double, 3> center, const double radius, double* ylm_matrix);
\ No newline at end of file
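(Editor's sketch, not part of the patch.) The following minimal C++ program shows one way the new scaled_ylm_matrix entry point could be exercised in isolation; the grid points, center, radius, and lmax are illustrative values, and the program is assumed to be compiled and linked against the spherical_harmonics.cxx translation unit added above.

#include <array>
#include <cstdint>
#include <cstdio>
#include <vector>

// Declaration taken from the new spherical_harmonics.hpp header in this patch.
void scaled_ylm_matrix(const int lmax, const double* points, const int32_t npts,
                       const std::array<double, 3> center, const double radius,
                       double* ylm_matrix);

int main() {
  const int lmax = 2;                                   // expansion degree (illustrative)
  const int nharmonics = (lmax + 1) * (lmax + 1);       // (lmax+1)^2 real harmonics per point
  const std::array<double, 3> center = {0.0, 0.0, 0.0}; // expansion center a
  const double radius = 1.0;                            // scaling radius r in (|x - a| / r)^l

  // Three sample grid points; none may coincide with the center (asserted in scaled_ylm_new).
  std::vector<double> points = {
    0.5,  0.0, 0.0,
    0.0,  0.5, 0.5,
    0.3, -0.2, 0.7
  };
  const int32_t npts = static_cast<int32_t>(points.size() / 3);

  // One row of (lmax+1)^2 scaled harmonics per grid point.
  std::vector<double> ylm(npts * nharmonics, 0.0);
  scaled_ylm_matrix(lmax, points.data(), npts, center, radius, ylm.data());

  // Print each row, ordered (l,m) = (0,0), (1,-1), (1,0), (1,1), (2,-2), ...
  for (int32_t i = 0; i < npts; ++i) {
    std::printf("point %d:", i);
    for (int lm = 0; lm < nharmonics; ++lm)
      std::printf(" % .6f", ylm[i * nharmonics + lm]);
    std::printf("\n");
  }
  return 0;
}

The same rows could be cross-checked against the scaled_ylm_std reference path in the new file, which evaluates the identical quantity through std::sph_legendre.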
} + } else { + // x = 0, y = 0, z != 0 + double cos_theta = (x[2] > 0.0) ? 1.0 : -1.0; + for (int l = 0; l <= lmax; l ++) { + int ind = l * l + l; + ylms[ind] = nlm[ind]; + if (l % 2 != 0) { + ylms[ind] *= cos_theta; + } + } + } +} + +// compute scaled spherical harmonics, with precomputed normalization factors +// 4π |x - a|^l +// ------ ----------- Y_l^m(|x - a|) +// 2l + 1 r^l +void scaled_ylm_new(const int lmax, const std::array x, const std::array a, const double r, const std::vector& nlm, double* ylm) { + std::array delta = {x[0] - a[0], x[1] - a[1], x[2] - a[2]}; + double dnorm = sqrt(delta[0]*delta[0] + delta[1]*delta[1] + delta[2]*delta[2]); + assert(dnorm != 0.0); + std::array delta_norm = {delta[0] / dnorm, delta[1] / dnorm, delta[2] / dnorm}; + double phi = atan2(delta_norm[1], delta_norm[0]); + sph_legendre(lmax, delta_norm, nlm, ylm); + for (int l = 0; l <= lmax; l++) { + double ratio = pow(dnorm / r, l) * 4.0 * M_PI / (2 * l + 1); + for (int m = -l; m <= l; m++) { + int ind = l * l + l + m; + if (m == 0) { + ylm[ind] *= ratio; + } else if (m < 0) { + ylm[ind] *= - ratio * sin(m * phi); + } else { + ylm[ind] *= ratio * cos(m * phi); + } + } + } +} + +// compute scaled spherical harmonics, with standard library functions +std::vector scaled_ylm_std(int lmax, std::array x, std::array a, double r) { + + std::vector delta = {x[0] - a[0], x[1] - a[1], x[2] - a[2]}; + double dnorm = sqrt(delta[0]*delta[0] + delta[1]*delta[1] + delta[2]*delta[2]); + assert(dnorm != 0.0); + std::vector delta_norm = {delta[0] / dnorm, delta[1] / dnorm, delta[2] / dnorm}; + + double rho = sqrt(delta_norm[0] * delta_norm[0] + delta_norm[1] * delta_norm[1] + delta_norm[2] * delta_norm[2]); + double theta = acos(delta_norm[2] / rho); + double phi = atan2(delta_norm[1], delta_norm[0]); + + std::vector ylm((lmax + 1) * (lmax + 1), 0.0); + for (int l = 0; l <= lmax; l++) { + double ratio = pow(dnorm / r, l) * 4.0 * M_PI / (2 * l + 1); + for (int m = 0; m <= l; m++) { + double sph = std::sph_legendre(l, m, theta) * ratio; + if (m == 0) { + ylm[l * l + l] = sph; + } else { + if (m % 2 != 0) { + sph *= -1; + } + sph *= sqrt(2.0); + ylm[l * l + l - m ] = sph * sin(m * phi); + ylm[l * l + l + m ] = sph * cos(m * phi); + } + } + } + return ylm; +} + +void scaled_ylm_matrix(const int lmax, const double* points, const int32_t npts, const std::array center, const double radius, double* ylm_matrix) { + int nharmonics = (lmax + 1) * (lmax + 1); + auto nlm = sph_nlm(lmax); + for (int i = 0; i < npts; ++i) { + const std::array x = {points[3 * i], points[3 * i + 1], points[3 * i + 2]}; + scaled_ylm_new(lmax, x, center, radius, nlm, ylm_matrix + i * nharmonics); + } +} \ No newline at end of file diff --git a/src/xc_integrator/integrator_util/spherical_harmonics.hpp b/src/xc_integrator/integrator_util/spherical_harmonics.hpp new file mode 100644 index 00000000..7ce495d8 --- /dev/null +++ b/src/xc_integrator/integrator_util/spherical_harmonics.hpp @@ -0,0 +1,7 @@ +#include +#include +#include +#include + + +void scaled_ylm_matrix(const int lmax, const double* points, const int32_t npts, const std::array center, const double radius, double* ylm_matrix); \ No newline at end of file diff --git a/src/xc_integrator/local_work_driver/CMakeLists.txt b/src/xc_integrator/local_work_driver/CMakeLists.txt index 4705cfab..e0e5385f 100644 --- a/src/xc_integrator/local_work_driver/CMakeLists.txt +++ b/src/xc_integrator/local_work_driver/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of 
California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/xc_integrator/local_work_driver/common/integrator_constants.hpp b/src/xc_integrator/local_work_driver/common/integrator_constants.hpp index c86e2438..6229db7d 100644 --- a/src/xc_integrator/local_work_driver/common/integrator_constants.hpp +++ b/src/xc_integrator/local_work_driver/common/integrator_constants.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -13,7 +17,7 @@ namespace integrator { template constexpr F magic_ssf_factor = 0.64; -constexpr double ssf_weight_tol = 1e-10; +constexpr double ssf_weight_tol = 1e-13; } } diff --git a/src/xc_integrator/local_work_driver/device/CMakeLists.txt b/src/xc_integrator/local_work_driver/device/CMakeLists.txt index 3c0fd8b6..0911dff2 100644 --- a/src/xc_integrator/local_work_driver/device/CMakeLists.txt +++ b/src/xc_integrator/local_work_driver/device/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/xc_integrator/local_work_driver/device/common/collocation_device.hpp b/src/xc_integrator/local_work_driver/device/common/collocation_device.hpp index 92144ad0..ef705d0c 100644 --- a/src/xc_integrator/local_work_driver/device/common/collocation_device.hpp +++ b/src/xc_integrator/local_work_driver/device/common/collocation_device.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -95,4 +99,10 @@ void eval_collocation_shell_to_task_laplacian( XCDeviceTask* device_tasks, device_queue queue ); +void eval_collocation_shell_to_task_lapgrad( + uint32_t max_l, + AngularMomentumShellToTaskBatch* l_batched_shell_to_task, + XCDeviceTask* device_tasks, + device_queue queue ); + } // namespace GauXC diff --git a/src/xc_integrator/local_work_driver/device/common/device_blas.hpp b/src/xc_integrator/local_work_driver/device/common/device_blas.hpp index e590bb90..dc1f0d8f 100644 --- a/src/xc_integrator/local_work_driver/device/common/device_blas.hpp +++ b/src/xc_integrator/local_work_driver/device/common/device_blas.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -20,6 +24,10 @@ enum class DeviceBlasUplo : unsigned char { Lower }; +template +void increment( device_blas_handle generic_handle, const T* X, T* Y, int N ); + + template void dot( device_blas_handle handle, int N, diff --git a/src/xc_integrator/local_work_driver/device/common/exx_ek_screening.hpp b/src/xc_integrator/local_work_driver/device/common/exx_ek_screening.hpp index 05e925b6..cb069a70 100644 --- a/src/xc_integrator/local_work_driver/device/common/exx_ek_screening.hpp +++ b/src/xc_integrator/local_work_driver/device/common/exx_ek_screening.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/common/inc_potential.hpp b/src/xc_integrator/local_work_driver/device/common/inc_potential.hpp index e278e23c..c7c84b3a 100644 --- a/src/xc_integrator/local_work_driver/device/common/inc_potential.hpp +++ b/src/xc_integrator/local_work_driver/device/common/inc_potential.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/common/increment_exc_grad.hpp b/src/xc_integrator/local_work_driver/device/common/increment_exc_grad.hpp index be924bb8..7f78ebc8 100644 --- a/src/xc_integrator/local_work_driver/device/common/increment_exc_grad.hpp +++ b/src/xc_integrator/local_work_driver/device/common/increment_exc_grad.hpp @@ -1,22 +1,29 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). 
+ * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ #pragma once #include #include "device/xc_device_task.hpp" +#include "device/xc_device_data.hpp" #include "device/device_queue.hpp" #include "shell_to_task.hpp" namespace GauXC { -void increment_exc_grad_lda( size_t nshell, ShellToTaskDevice* shell_to_task, - XCDeviceTask* device_tasks, double* EXC_GRAD, device_queue ); -void increment_exc_grad_gga( size_t nshell, ShellToTaskDevice* shell_to_task, - XCDeviceTask* device_tasks, double* EXC_GRAD, device_queue ); +void increment_exc_grad_lda( integrator_ks_scheme ks_scheme, size_t nshell, ShellToTaskDevice* shell_to_task, + XCDeviceTask* device_tasks, double* EXC_GRAD, bool with_weight_derivatives, device_queue ); +void increment_exc_grad_gga( integrator_ks_scheme ks_scheme, size_t nshell, ShellToTaskDevice* shell_to_task, + XCDeviceTask* device_tasks, double* EXC_GRAD, bool with_weight_derivatives, device_queue ); +void increment_exc_grad_mgga( integrator_ks_scheme ks_scheme, size_t nshell, bool need_lapl, ShellToTaskDevice* shell_to_task, + XCDeviceTask* device_tasks, double* EXC_GRAD, bool with_weight_derivatives, device_queue ); } diff --git a/src/xc_integrator/local_work_driver/device/common/pack_submat.hpp b/src/xc_integrator/local_work_driver/device/common/pack_submat.hpp index 9a978f55..8664c4b2 100644 --- a/src/xc_integrator/local_work_driver/device/common/pack_submat.hpp +++ b/src/xc_integrator/local_work_driver/device/common/pack_submat.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/common/shell_pair_to_task.hpp b/src/xc_integrator/local_work_driver/device/common/shell_pair_to_task.hpp index 8902da75..28517b0b 100644 --- a/src/xc_integrator/local_work_driver/device/common/shell_pair_to_task.hpp +++ b/src/xc_integrator/local_work_driver/device/common/shell_pair_to_task.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/common/shell_to_task.hpp b/src/xc_integrator/local_work_driver/device/common/shell_to_task.hpp index 63020fee..38a0fb03 100644 --- a/src/xc_integrator/local_work_driver/device/common/shell_to_task.hpp +++ b/src/xc_integrator/local_work_driver/device/common/shell_to_task.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/common/symmetrize_mat.hpp b/src/xc_integrator/local_work_driver/device/common/symmetrize_mat.hpp index 1e06e5fc..c26059f3 100644 --- a/src/xc_integrator/local_work_driver/device/common/symmetrize_mat.hpp +++ b/src/xc_integrator/local_work_driver/device/common/symmetrize_mat.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/common/uvvars.hpp b/src/xc_integrator/local_work_driver/device/common/uvvars.hpp index 78ce5324..25057228 100644 --- a/src/xc_integrator/local_work_driver/device/common/uvvars.hpp +++ b/src/xc_integrator/local_work_driver/device/common/uvvars.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -15,15 +19,34 @@ namespace GauXC { void eval_uvars_lda( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme, XCDeviceTask* device_tasks, device_queue queue ); - void eval_uvars_gga( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme, XCDeviceTask* device_tasks, device_queue queue ); +void eval_uvars_mgga( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme, + bool need_lapl, XCDeviceTask* device_tasks, device_queue queue ); + + +void eval_vvars_lda( size_t ntasks, int32_t nbf_max, int32_t npts_max, density_id den_select, + XCDeviceTask* device_tasks, device_queue queue ); +void eval_vvars_gga( size_t ntasks, int32_t nbf_max, int32_t npts_max, density_id den_select, + XCDeviceTask* device_tasks, device_queue queue ); +void eval_vvars_mgga( size_t ntasks, int32_t nbf_max, int32_t npts_max, density_id den_select, + bool need_lapl, XCDeviceTask* device_tasks, device_queue queue ); + + + +void eval_tmat_lda( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme, + XCDeviceTask* device_tasks, device_queue queue ); +void eval_tmat_gga( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme, + XCDeviceTask* device_tasks, device_queue queue ); +void eval_tmat_mgga( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme, + bool need_lapl, XCDeviceTask* device_tasks, device_queue queue ); -void eval_uvars_mgga( size_t ntasks, size_t npts_total, int32_t nbf_max, - int32_t npts_max, bool do_lapl, XCDeviceTask* device_tasks, - device_queue queue ); -void eval_vvar( size_t ntasks, int32_t nbf_max, int32_t npts_max, bool do_grad, density_id den_select, +void eval_vvars_lda_trial( size_t ntasks, int32_t nbf_max, int32_t npts_max, density_id den_select, + XCDeviceTask* device_tasks, device_queue queue ); +void eval_vvars_gga_trial( size_t ntasks, int32_t nbf_max, int32_t npts_max, density_id den_select, XCDeviceTask* device_tasks, device_queue queue ); +void eval_vvars_mgga_trial( size_t ntasks, int32_t nbf_max, int32_t npts_max, density_id den_select, + bool 
need_lapl, XCDeviceTask* device_tasks, device_queue queue ); } diff --git a/src/xc_integrator/local_work_driver/device/common/xc_functional_eval_wrapper.hpp b/src/xc_integrator/local_work_driver/device/common/xc_functional_eval_wrapper.hpp index 99f1c99a..18b189f3 100644 --- a/src/xc_integrator/local_work_driver/device/common/xc_functional_eval_wrapper.hpp +++ b/src/xc_integrator/local_work_driver/device/common/xc_functional_eval_wrapper.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -21,4 +25,16 @@ void eval_kern_exc_vxc_mgga( const functional_type& func, size_t npts, double* eps, double* vrho, double* vgamma, double* vtau, double* vlapl, device_queue queue ); +void eval_kern_vxc_fxc_lda( const functional_type& func, size_t npts, + const double* rho, double* vrho, double* v2rho2, device_queue queue ); +void eval_kern_vxc_fxc_gga( const functional_type& func, size_t npts, + const double* rho, const double* gamma, double* vrho, double* vgamma, + double* v2rho2, double* v2rhogamma, double* v2gamma2, device_queue queue ); +void eval_kern_vxc_fxc_mgga( const functional_type& func, size_t npts, + const double* rho, const double* gamma, const double* lapl, const double* tau, + double* vrho, double* vgamma, double* vlapl, double* vtau, + double* v2rho2, double* v2rhogamma, double* v2rholapl, double* v2rhotau, + double* v2gamma2, double* v2gammalapl, double* v2gammatau, double* v2lapl2, + double* v2lapltau, double* v2tau2, device_queue queue ); + } diff --git a/src/xc_integrator/local_work_driver/device/common/zmat_fxc.hpp b/src/xc_integrator/local_work_driver/device/common/zmat_fxc.hpp new file mode 100644 index 00000000..739afc81 --- /dev/null +++ b/src/xc_integrator/local_work_driver/device/common/zmat_fxc.hpp @@ -0,0 +1,47 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
+ * + * See LICENSE.txt for details + */ +#include "device/xc_device_task.hpp" +#include "device/xc_device_data.hpp" +#include "device/device_queue.hpp" + +namespace GauXC { + +void zmat_lda_fxc( size_t ntasks, + int32_t max_nbf, + int32_t max_npts, + XCDeviceTask* tasks_device, + density_id sel, + device_queue queue ); + +void zmat_gga_fxc( size_t ntasks, + int32_t max_nbf, + int32_t max_npts, + XCDeviceTask* tasks_device, + density_id sel, + device_queue queue ); + +void zmat_mgga_fxc( size_t ntasks, + int32_t max_nbf, + int32_t max_npts, + XCDeviceTask* tasks_device, + bool do_lapl, + density_id sel, + device_queue queue ); + +void mmat_mgga_fxc( size_t ntasks, + int32_t max_nbf, + int32_t max_npts, + XCDeviceTask* tasks_device, + bool do_lapl, + density_id sel, + device_queue queue ); +} diff --git a/src/xc_integrator/local_work_driver/device/common/zmat_vxc.hpp b/src/xc_integrator/local_work_driver/device/common/zmat_vxc.hpp index 514b6d38..3c48967f 100644 --- a/src/xc_integrator/local_work_driver/device/common/zmat_vxc.hpp +++ b/src/xc_integrator/local_work_driver/device/common/zmat_vxc.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -32,6 +36,8 @@ void zmat_mgga_vxc( size_t ntasks, int32_t max_npts, XCDeviceTask* tasks_device, bool do_lapl, + integrator_ks_scheme s, + density_id sel, device_queue queue ); void mmat_mgga_vxc( size_t ntasks, @@ -39,5 +45,7 @@ void mmat_mgga_vxc( size_t ntasks, int32_t max_npts, XCDeviceTask* tasks_device, bool do_lapl, + integrator_ks_scheme s, + density_id sel, device_queue queue ); } diff --git a/src/xc_integrator/local_work_driver/device/cuda/CMakeLists.txt b/src/xc_integrator/local_work_driver/device/cuda/CMakeLists.txt index 7320d3c1..8c0608f7 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/CMakeLists.txt +++ b/src/xc_integrator/local_work_driver/device/cuda/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # @@ -20,6 +24,7 @@ target_sources(gauxc PRIVATE kernels/cublas_extensions.cu kernels/uvvars.cu kernels/zmat_vxc.cu + kernels/zmat_fxc.cu kernels/cuda_inc_potential.cu kernels/symmetrize_mat.cu kernels/increment_exc_grad.cu diff --git a/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1.cxx b/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1.cxx index fc081817..4e01b373 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1.cxx +++ b/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1.cxx @@ -1,13 +1,18 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). 
+ * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ #include "cuda_aos_scheme1.hpp" #include "device/cuda/cuda_backend.hpp" #include "cuda_aos_scheme1_weights.hpp" +#include "device/common/device_blas.hpp" namespace GauXC { @@ -37,6 +42,41 @@ void CudaAoSScheme1::partition_weights( XCDeviceData* _data ) { scheme1_stack.dist_nearest_device, base_stack.weights_device, *device_backend->master_stream ); } +template +void CudaAoSScheme1::eval_weight_1st_deriv_contracted( XCDeviceData* _data, XCWeightAlg alg ) { + if( alg != XCWeightAlg::SSF ) { + GAUXC_GENERIC_EXCEPTION("Weight Algorithm NYI for CUDA AoS Scheme1"); + } + auto* data = dynamic_cast(_data); + if( !data ) GAUXC_BAD_LWD_DATA_CAST(); + + auto device_backend = dynamic_cast(data->device_backend_); + if( !device_backend ) GAUXC_BAD_BACKEND_CAST(); + + // make w times f vector + const bool is_UKS = data->allocated_terms.ks_scheme == UKS; + const bool is_GKS = data->allocated_terms.ks_scheme == GKS; + const bool is_pol = is_UKS or is_GKS; + auto base_stack = data->base_stack; + if( is_pol ) + increment( data->device_backend_->master_blas_handle(), base_stack.den_z_eval_device, + base_stack.den_s_eval_device, data->total_npts_task_batch ); + + hadamard_product(data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, base_stack.den_s_eval_device, 1, + base_stack.eps_eval_device, 1); + + + // Compute distances from grid to atomic centers + const auto ldatoms = data->get_ldatoms(); + auto static_stack = data->static_stack; + auto scheme1_stack = data->scheme1_stack; + cuda_aos_scheme1_weight_1st_deriv_wrapper( data->total_npts_task_batch, data->global_dims.natoms, + base_stack.points_x_device, base_stack.points_y_device, base_stack.points_z_device, + static_stack.rab_device, ldatoms, static_stack.coords_device, + scheme1_stack.dist_scratch_device, ldatoms, scheme1_stack.iparent_device, + scheme1_stack.dist_nearest_device, base_stack.eps_eval_device, static_stack.exc_grad_device, *device_backend->master_stream ); +} + template struct CudaAoSScheme1; #ifdef GAUXC_HAS_MAGMA diff --git a/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1.hpp b/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1.hpp index aa431f72..cba52e14 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -33,6 +37,7 @@ struct CudaAoSScheme1 : public Base { // API Overrides void partition_weights( XCDeviceData* ) override final; + void eval_weight_1st_deriv_contracted( XCDeviceData*, XCWeightAlg ) override final; std::unique_ptr create_device_data(const DeviceRuntimeEnvironment&) override final; diff --git a/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1_data.cxx b/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1_data.cxx index 8640fffe..9da703a9 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1_data.cxx +++ b/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1_data.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1_weights.cu b/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1_weights.cu index 7e74225c..deeba830 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1_weights.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1_weights.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -33,7 +37,7 @@ void cuda_aos_scheme1_weights_wrapper( int32_t npts, int32_t natoms, compute_grid_to_center_dist( npts, natoms, coords, points_x, points_y, points_z, dist, lddist, stream ); -#if 1 +#if 0 // Get the number of SM's on the device int num_sm; int dev_id = 0; @@ -49,11 +53,32 @@ void cuda_aos_scheme1_weights_wrapper( int32_t npts, int32_t natoms, weights ); #else - partition_weights_ssf_1d( npts, natoms, RAB, natoms, coords, dist, lddist, + partition_weights_ssf_1d( npts, natoms, RAB, ldRAB, coords, dist, lddist, iparent, dist_nearest, weights, stream ); #endif +} + + +void cuda_aos_scheme1_weight_1st_deriv_wrapper( + int32_t npts, int32_t natoms, + const double* points_x, const double* points_y, const double* points_z, + const double* RAB, int32_t ldRAB, const double* coords, + double* dist, int32_t lddist, const int32_t* iparent, + const double* dist_nearest, const double* w_times_f, + double* exc_grad_w, cudaStream_t stream ){ + + // Compute distances from grid to atomic centers + compute_grid_to_center_dist( npts, natoms, coords, points_x, points_y, points_z, + dist, lddist, stream ); + + eval_weight_1st_deriv_contracted_ssf_1d( npts, natoms, RAB, ldRAB, coords, points_x, points_y, points_z, dist, lddist, + iparent, dist_nearest, w_times_f, exc_grad_w, stream ); } + + + + } diff --git a/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1_weights.hpp b/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1_weights.hpp index 74674587..affd940f 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1_weights.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1_weights.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -13,4 +17,12 @@ void cuda_aos_scheme1_weights_wrapper( int32_t npts, int32_t natoms, double* dist, int32_t lddist, const int32_t* iparent, const double* dist_nearest, double* weights, cudaStream_t stream ); +void cuda_aos_scheme1_weight_1st_deriv_wrapper( + int32_t npts, int32_t natoms, + const double* points_x, const double* points_y, const double* points_z, + const double* RAB, int32_t ldRAB, const double* coords, + double* dist, int32_t lddist, const int32_t* iparent, + const double* dist_nearest, const double* w_times_f, + double* exc_grad_w, cudaStream_t stream ); + } diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_angular_cartesian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_angular_cartesian.hpp index 08015c07..2ef18899 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_angular_cartesian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_angular_cartesian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). 
+ * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_angular_spherical_unnorm.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_angular_spherical_unnorm.hpp index f948c4fb..71b17b60 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_angular_spherical_unnorm.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_angular_spherical_unnorm.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_device_constants.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_device_constants.hpp index b6f4ad63..216a6326 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_device_constants.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_device_constants.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_radial.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_radial.hpp index 39062a03..81968490 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_radial.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_radial.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l0.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l0.hpp index f8a97e8c..17b201a4 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l0.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l0.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. 
+ * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -64,7 +68,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_eval = task->bf + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -93,7 +96,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel } - + // Common Subexpressions + // Evaluate basis function basis_eval[ipt + 0*npts] = radial_eval; @@ -105,6 +109,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel + + #if 0 // Evaluate the angular part of bfn diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l0_gradient.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l0_gradient.hpp index 9c6dcbfe..627a7936 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l0_gradient.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l0_gradient.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -67,7 +71,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -99,7 +102,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; - + // Common Subexpressions + // Evaluate basis function basis_eval[ipt + 0*npts] = radial_eval; @@ -119,6 +123,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel + + #if 0 // Evaluate the angular part of bfn diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l0_hessian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l0_hessian.hpp index f6b3e63d..7543270b 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l0_hessian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l0_hessian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_cartesian_hessian_0( +__global__ __launch_bounds__(256,2) void collocation_device_shell_to_task_kernel_cartesian_hessian_0( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[8][detail::shell_nprim_max + 1]; + __shared__ double coeff[8][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; @@ -108,7 +111,12 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = radial_eval_alpha_squared*(x*x); + const auto x1 = radial_eval_alpha_squared*x; + const auto x2 = radial_eval_alpha_squared*(y*y); + const auto x3 = radial_eval_alpha_squared*(z*z); + // Evaluate basis function basis_eval[ipt + 0*npts] = radial_eval; @@ -125,22 +133,24 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_z_eval[ipt + 0*npts] = radial_eval_alpha*z; // Evaluate second derivative of bfn wrt xx - basis_xx_eval[ipt + 0*npts] = radial_eval_alpha + radial_eval_alpha_squared*x*x; + basis_xx_eval[ipt + 0*npts] = radial_eval_alpha + x0; // Evaluate second derivative of bfn wrt xy - basis_xy_eval[ipt + 0*npts] = radial_eval_alpha_squared*x*y; + basis_xy_eval[ipt + 0*npts] = x1*y; // Evaluate second derivative of bfn wrt xz - basis_xz_eval[ipt + 0*npts] = radial_eval_alpha_squared*x*z; + basis_xz_eval[ipt + 0*npts] = x1*z; // Evaluate second derivative of bfn wrt yy - basis_yy_eval[ipt + 0*npts] = radial_eval_alpha + radial_eval_alpha_squared*y*y; + basis_yy_eval[ipt + 0*npts] = radial_eval_alpha + x2; // Evaluate second derivative of bfn wrt yz basis_yz_eval[ipt + 0*npts] = radial_eval_alpha_squared*y*z; // Evaluate second derivative of bfn wrt zz - basis_zz_eval[ipt + 0*npts] = radial_eval_alpha + radial_eval_alpha_squared*z*z; + basis_zz_eval[ipt + 0*npts] = radial_eval_alpha + x3; + + diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l0_lapgrad.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l0_lapgrad.hpp new file mode 100644 index 00000000..d12623d1 --- /dev/null +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l0_lapgrad.hpp @@ -0,0 +1,208 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
+ * + * See LICENSE.txt for details + */ +#pragma once +#include "collocation_device_constants.hpp" +#include "device/xc_device_task.hpp" +#include "device_specific/cuda_device_constants.hpp" +#include "device/common/shell_to_task.hpp" +#include + +namespace GauXC { + + +__global__ __launch_bounds__(256,2) void collocation_device_shell_to_task_kernel_cartesian_lapgrad_0( + uint32_t nshell, + ShellToTaskDevice* __restrict__ shell_to_task, + XCDeviceTask* __restrict__ device_tasks +) { + + + __shared__ double alpha[8][detail::shell_nprim_max + 1]; + __shared__ double coeff[8][detail::shell_nprim_max + 1]; + double* my_alpha = alpha[threadIdx.x/32]; + double* my_coeff = coeff[threadIdx.x/32]; + + for( auto ish = blockIdx.z; ish < nshell; ish += gridDim.z ) { + const uint32_t ntasks = shell_to_task[ish].ntask; + const auto shell = shell_to_task[ish].shell_device; + const auto task_idx = shell_to_task[ish].task_idx_device; + const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device; + + + // Load Shell Data into registers / SM + const uint32_t nprim = shell->nprim(); + const double3 O = *reinterpret_cast<const double3*>(shell->O_data()); + + const int global_warp_id = (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size; + const int nwarp_global = max((blockDim.x*gridDim.x) / cuda::warp_size,1); + + // Read in coeffs/exps into SM on first warp + { + auto* coeff_gm = shell->coeff_data(); + auto* alpha_gm = shell->alpha_data(); + static_assert( detail::shell_nprim_max == cuda::warp_size ); + const int warp_rank = threadIdx.x % cuda::warp_size; + my_alpha[warp_rank] = alpha_gm[warp_rank]; + my_coeff[warp_rank] = coeff_gm[warp_rank]; + } + + // Loop over tasks assigned to shells + // Place each task on a different warp + schedule across blocks + for( int itask = global_warp_id; itask < ntasks; itask += nwarp_global ) { + + const auto* task = device_tasks + task_idx[itask]; + const auto* __restrict__ points_x = task->points_x; + const auto* __restrict__ points_y = task->points_y; + const auto* __restrict__ points_z = task->points_z; + const uint32_t npts = task->npts; + const size_t shoff = task_shell_offs[itask] * npts; + + auto* __restrict__ basis_eval = task->bf + shoff; + auto* __restrict__ basis_x_eval = task->dbfx + shoff; + auto* __restrict__ basis_y_eval = task->dbfy + shoff; + auto* __restrict__ basis_z_eval = task->dbfz + shoff; + auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; + auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; + auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; + auto* __restrict__ basis_yy_eval = task->d2bfyy + shoff; + auto* __restrict__ basis_yz_eval = task->d2bfyz + shoff; + auto* __restrict__ basis_zz_eval = task->d2bfzz + shoff; + auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; + auto* __restrict__ basis_lapl_x_eval = task->d3bflapl_x + shoff; + auto* __restrict__ basis_lapl_y_eval = task->d3bflapl_y + shoff; + auto* __restrict__ basis_lapl_z_eval = task->d3bflapl_z + shoff; + + // Loop over points in task + // Assign each point to separate thread within the warp + #pragma unroll 1 + for( int ipt = threadIdx.x % cuda::warp_size; ipt < npts; ipt += cuda::warp_size ) { + //const double3 point = points[ipt]; + double3 point; + point.x = points_x[ipt]; + point.y = points_y[ipt]; + point.z = points_z[ipt]; + + + const auto x = point.x - O.x; + const auto y = point.y - O.y; + const auto z = point.z - O.z; + const auto rsq = x*x + y*y + z*z; + + // Evaluate radial part of bfn + double radial_eval = 0.; + double radial_eval_alpha = 0.; + 
double radial_eval_alpha_squared = 0.; + double radial_eval_alpha_cubed = 0.; + + #pragma unroll 1 + for( uint32_t i = 0; i < nprim; ++i ) { + const auto a = my_alpha[i]; + const auto e = my_coeff[i] * std::exp( - a * rsq ); + + radial_eval += e; + radial_eval_alpha += a * e; + radial_eval_alpha_squared += a * a * e; + radial_eval_alpha_cubed += a * a * a * e; + } + + radial_eval_alpha *= -2; + radial_eval_alpha_squared *= 4; + radial_eval_alpha_cubed *= -8; + + // Common Subexpressions + const auto x0 = x*x; + const auto x1 = radial_eval_alpha_squared*x0; + const auto x2 = radial_eval_alpha_squared*x; + const auto x3 = y*y; + const auto x4 = radial_eval_alpha_squared*x3; + const auto x5 = radial_eval_alpha_squared*y; + const auto x6 = z*z; + const auto x7 = radial_eval_alpha_squared*x6; + const auto x8 = radial_eval_alpha_cubed*x; + const auto x9 = radial_eval_alpha_cubed*y; + const auto x10 = radial_eval_alpha_cubed*z; + + + // Evaluate basis function + basis_eval[ipt + 0*npts] = radial_eval; + + + + // Evaluate first derivative of bfn wrt x + basis_x_eval[ipt + 0*npts] = radial_eval_alpha*x; + + // Evaluate first derivative of bfn wrt y + basis_y_eval[ipt + 0*npts] = radial_eval_alpha*y; + + // Evaluate first derivative of bfn wrt z + basis_z_eval[ipt + 0*npts] = radial_eval_alpha*z; + + // Evaluate second derivative of bfn wrt xx + basis_xx_eval[ipt + 0*npts] = radial_eval_alpha + x1; + + // Evaluate second derivative of bfn wrt xy + basis_xy_eval[ipt + 0*npts] = x2*y; + + // Evaluate second derivative of bfn wrt xz + basis_xz_eval[ipt + 0*npts] = x2*z; + + // Evaluate second derivative of bfn wrt yy + basis_yy_eval[ipt + 0*npts] = radial_eval_alpha + x4; + + // Evaluate second derivative of bfn wrt yz + basis_yz_eval[ipt + 0*npts] = x5*z; + + // Evaluate second derivative of bfn wrt zz + basis_zz_eval[ipt + 0*npts] = radial_eval_alpha + x7; + + // Evaluate Laplacian of bfn + basis_lapl_eval[ipt + 0*npts] = 3.0*radial_eval_alpha + x1 + x4 + x7; + + // Evaluate Laplacian gradient of bfn (dx) + basis_lapl_x_eval[ipt + 0*npts] = radial_eval_alpha_cubed*(x*x*x) + 5.0*x2 + x3*x8 + x6*x8; + // Evaluate Laplacian gradient of bfn (dy) + basis_lapl_y_eval[ipt + 0*npts] = radial_eval_alpha_cubed*(y*y*y) + x0*x9 + 5.0*x5 + x6*x9; + // Evaluate Laplacian gradient of bfn (dz) + basis_lapl_z_eval[ipt + 0*npts] = radial_eval_alpha_cubed*(z*z*z) + 5.0*radial_eval_alpha_squared*z + x0*x10 + x10*x3; + + + + +#if 0 + // Evaluate the angular part of bfn + + + + double ang_eval_0; + + + ang_eval_0 = radial_eval; + basis_eval[ipt + 0*npts] = ang_eval_0; + + + double dang_eval_x_0, dang_eval_y_0, dang_eval_z_0; + + dang_eval_x_0 = radial_eval_alpha*x; + dang_eval_y_0 = radial_eval_alpha*y; + dang_eval_z_0 = radial_eval_alpha*z; + basis_x_eval[ipt + 0*npts] = dang_eval_x_0; + basis_y_eval[ipt + 0*npts] = dang_eval_y_0; + basis_z_eval[ipt + 0*npts] = dang_eval_z_0; + +#endif + } // Loop over points within task + } // Loop over tasks + + } // Loop over shells +} // end kernel + +} // namespace GauXC diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l0_laplacian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l0_laplacian.hpp index 06bfc86a..6fe1ec2d 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l0_laplacian.hpp +++ 
b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l0_laplacian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_cartesian_laplacian_0( +__global__ __launch_bounds__(256,2) void collocation_device_shell_to_task_kernel_cartesian_laplacian_0( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[8][detail::shell_nprim_max + 1]; + __shared__ double coeff[8][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; // Loop over points in task @@ -103,7 +106,12 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = radial_eval_alpha_squared*(x*x); + const auto x1 = radial_eval_alpha_squared*x; + const auto x2 = radial_eval_alpha_squared*(y*y); + const auto x3 = radial_eval_alpha_squared*(z*z); + // Evaluate basis function basis_eval[ipt + 0*npts] = radial_eval; @@ -119,8 +127,10 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel // Evaluate first derivative of bfn wrt z basis_z_eval[ipt + 0*npts] = radial_eval_alpha*z; + // Evaluate Laplacian of bfn - basis_lapl_eval[ipt + 0*npts] = 3*radial_eval_alpha + radial_eval_alpha_squared*x*x + radial_eval_alpha_squared*y*y + radial_eval_alpha_squared*z*z; + basis_lapl_eval[ipt + 0*npts] = 3.0*radial_eval_alpha + x0 + x2 + x3; + diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l1.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l1.hpp index db904d1c..b2b4672d 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l1.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l1.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -64,7 +68,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_eval = task->bf + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -93,7 +96,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel } - + // Common Subexpressions + // Evaluate basis function basis_eval[ipt + 0*npts] = radial_eval*x; @@ -107,6 +111,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel + + #if 0 // Evaluate the angular part of bfn diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l1_gradient.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l1_gradient.hpp index 6838f2fa..2aaabc26 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l1_gradient.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l1_gradient.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_cartesian_gradient_1( +__global__ __launch_bounds__(256,2) void collocation_device_shell_to_task_kernel_cartesian_gradient_1( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[8][detail::shell_nprim_max + 1]; + __shared__ double coeff[8][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -67,7 +71,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -99,7 +102,12 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; - + // Common Subexpressions + const auto x0 = radial_eval_alpha*x; + const auto x1 = x0*y; + const auto x2 = x0*z; + const auto x3 = radial_eval_alpha*y*z; + // Evaluate basis function basis_eval[ipt + 0*npts] = radial_eval*x; @@ -109,19 +117,21 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = radial_eval + radial_eval_alpha*x*x; - basis_x_eval[ipt + 1*npts] = radial_eval_alpha*x*y; - basis_x_eval[ipt + 2*npts] = radial_eval_alpha*x*z; + basis_x_eval[ipt + 0*npts] = radial_eval + radial_eval_alpha*(x*x); + basis_x_eval[ipt + 1*npts] = x1; + basis_x_eval[ipt + 2*npts] = x2; // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = radial_eval_alpha*x*y; - basis_y_eval[ipt + 1*npts] = 
radial_eval + radial_eval_alpha*y*y; - basis_y_eval[ipt + 2*npts] = radial_eval_alpha*y*z; + basis_y_eval[ipt + 0*npts] = x1; + basis_y_eval[ipt + 1*npts] = radial_eval + radial_eval_alpha*(y*y); + basis_y_eval[ipt + 2*npts] = x3; // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = radial_eval_alpha*x*z; - basis_z_eval[ipt + 1*npts] = radial_eval_alpha*y*z; - basis_z_eval[ipt + 2*npts] = radial_eval + radial_eval_alpha*z*z; + basis_z_eval[ipt + 0*npts] = x2; + basis_z_eval[ipt + 1*npts] = x3; + basis_z_eval[ipt + 2*npts] = radial_eval + radial_eval_alpha*(z*z); + + @@ -149,15 +159,15 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_1, dang_eval_y_1, dang_eval_z_1; double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; - dang_eval_x_0 = radial_eval + radial_eval_alpha*x*x; - dang_eval_y_0 = radial_eval_alpha*x*y; - dang_eval_z_0 = radial_eval_alpha*x*z; - dang_eval_x_1 = radial_eval_alpha*x*y; - dang_eval_y_1 = radial_eval + radial_eval_alpha*y*y; - dang_eval_z_1 = radial_eval_alpha*y*z; - dang_eval_x_2 = radial_eval_alpha*x*z; - dang_eval_y_2 = radial_eval_alpha*y*z; - dang_eval_z_2 = radial_eval + radial_eval_alpha*z*z; + dang_eval_x_0 = radial_eval + radial_eval_alpha*(x*x); + dang_eval_y_0 = x1; + dang_eval_z_0 = x2; + dang_eval_x_1 = x1; + dang_eval_y_1 = radial_eval + radial_eval_alpha*(y*y); + dang_eval_z_1 = x3; + dang_eval_x_2 = x2; + dang_eval_y_2 = x3; + dang_eval_z_2 = radial_eval + radial_eval_alpha*(z*z); basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l1_hessian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l1_hessian.hpp index 26f5bc7c..2047493d 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l1_hessian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l1_hessian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_cartesian_hessian_1( +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_cartesian_hessian_1( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; @@ -108,7 +111,31 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = x*x; + const auto x1 = radial_eval_alpha*x; + const auto x2 = x1*y; + const auto x3 = x1*z; + const auto x4 = y*y; + const auto x5 = y*z; + const auto x6 = radial_eval_alpha*x5; + const auto x7 = z*z; + const auto x8 = 3.0*radial_eval_alpha; + const auto x9 = radial_eval_alpha_squared*x0; + const auto x10 = radial_eval_alpha + x9; + const auto x11 = x10*y; + const auto x12 = x10*z; + const auto x13 = radial_eval_alpha_squared*x4; + const auto x14 = radial_eval_alpha + x13; + const auto x15 = x*x14; + const auto x16 = radial_eval_alpha_squared*x*x5; + const auto x17 = radial_eval_alpha_squared*x7; + const auto x18 = radial_eval_alpha + x17; + const auto x19 = x*x18; + const auto x20 = x14*z; + const auto x21 = x18*y; + const auto x22 = 5.0*radial_eval_alpha + x13 + x17 + x9; + // Evaluate basis function basis_eval[ipt + 0*npts] = radial_eval*x; @@ -118,49 +145,51 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = radial_eval + radial_eval_alpha*x*x; - basis_x_eval[ipt + 1*npts] = radial_eval_alpha*x*y; - basis_x_eval[ipt + 2*npts] = radial_eval_alpha*x*z; + basis_x_eval[ipt + 0*npts] = radial_eval + radial_eval_alpha*x0; + basis_x_eval[ipt + 1*npts] = x2; + basis_x_eval[ipt + 2*npts] = x3; // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = radial_eval_alpha*x*y; - basis_y_eval[ipt + 1*npts] = radial_eval + radial_eval_alpha*y*y; - basis_y_eval[ipt + 2*npts] = radial_eval_alpha*y*z; + basis_y_eval[ipt + 0*npts] = x2; + basis_y_eval[ipt + 1*npts] = radial_eval + radial_eval_alpha*x4; + basis_y_eval[ipt + 2*npts] = x6; // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = radial_eval_alpha*x*z; - basis_z_eval[ipt + 1*npts] = radial_eval_alpha*y*z; - basis_z_eval[ipt + 2*npts] = radial_eval + radial_eval_alpha*z*z; + basis_z_eval[ipt + 0*npts] = x3; + basis_z_eval[ipt + 1*npts] = x6; + basis_z_eval[ipt + 2*npts] = radial_eval + radial_eval_alpha*x7; // Evaluate second derivative of bfn wrt xx - basis_xx_eval[ipt + 0*npts] = x*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 1*npts] = 
y*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 2*npts] = z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xx_eval[ipt + 0*npts] = x*(x8 + x9); + basis_xx_eval[ipt + 1*npts] = x11; + basis_xx_eval[ipt + 2*npts] = x12; // Evaluate second derivative of bfn wrt xy - basis_xy_eval[ipt + 0*npts] = y*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xy_eval[ipt + 1*npts] = x*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_xy_eval[ipt + 2*npts] = radial_eval_alpha_squared*x*y*z; + basis_xy_eval[ipt + 0*npts] = x11; + basis_xy_eval[ipt + 1*npts] = x15; + basis_xy_eval[ipt + 2*npts] = x16; // Evaluate second derivative of bfn wrt xz - basis_xz_eval[ipt + 0*npts] = z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xz_eval[ipt + 1*npts] = radial_eval_alpha_squared*x*y*z; - basis_xz_eval[ipt + 2*npts] = x*(radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_xz_eval[ipt + 0*npts] = x12; + basis_xz_eval[ipt + 1*npts] = x16; + basis_xz_eval[ipt + 2*npts] = x19; // Evaluate second derivative of bfn wrt yy - basis_yy_eval[ipt + 0*npts] = x*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 1*npts] = y*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 2*npts] = z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_yy_eval[ipt + 0*npts] = x15; + basis_yy_eval[ipt + 1*npts] = y*(x13 + x8); + basis_yy_eval[ipt + 2*npts] = x20; // Evaluate second derivative of bfn wrt yz - basis_yz_eval[ipt + 0*npts] = radial_eval_alpha_squared*x*y*z; - basis_yz_eval[ipt + 1*npts] = z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yz_eval[ipt + 2*npts] = y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_yz_eval[ipt + 0*npts] = x16; + basis_yz_eval[ipt + 1*npts] = x20; + basis_yz_eval[ipt + 2*npts] = x21; // Evaluate second derivative of bfn wrt zz - basis_zz_eval[ipt + 0*npts] = x*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 1*npts] = y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 2*npts] = z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_zz_eval[ipt + 0*npts] = x19; + basis_zz_eval[ipt + 1*npts] = x21; + basis_zz_eval[ipt + 2*npts] = z*(x17 + x8); + + @@ -187,15 +216,15 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_1, dang_eval_y_1, dang_eval_z_1; double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; - dang_eval_x_0 = radial_eval + radial_eval_alpha*x*x; - dang_eval_y_0 = radial_eval_alpha*x*y; - dang_eval_z_0 = radial_eval_alpha*x*z; - dang_eval_x_1 = radial_eval_alpha*x*y; - dang_eval_y_1 = radial_eval + radial_eval_alpha*y*y; - dang_eval_z_1 = radial_eval_alpha*y*z; - dang_eval_x_2 = radial_eval_alpha*x*z; - dang_eval_y_2 = radial_eval_alpha*y*z; - dang_eval_z_2 = radial_eval + radial_eval_alpha*z*z; + dang_eval_x_0 = radial_eval + radial_eval_alpha*x0; + dang_eval_y_0 = x2; + dang_eval_z_0 = x3; + dang_eval_x_1 = x2; + dang_eval_y_1 = radial_eval + radial_eval_alpha*x4; + dang_eval_z_1 = x6; + dang_eval_x_2 = x3; + dang_eval_y_2 = x6; + dang_eval_z_2 = radial_eval + radial_eval_alpha*x7; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l1_lapgrad.hpp 
b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l1_lapgrad.hpp new file mode 100644 index 00000000..9e6ea4c5 --- /dev/null +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l1_lapgrad.hpp @@ -0,0 +1,285 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#pragma once +#include "collocation_device_constants.hpp" +#include "device/xc_device_task.hpp" +#include "device_specific/cuda_device_constants.hpp" +#include "device/common/shell_to_task.hpp" +#include + +namespace GauXC { + + +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_cartesian_lapgrad_1( + uint32_t nshell, + ShellToTaskDevice* __restrict__ shell_to_task, + XCDeviceTask* __restrict__ device_tasks +) { + + + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; + double* my_alpha = alpha[threadIdx.x/32]; + double* my_coeff = coeff[threadIdx.x/32]; + + for( auto ish = blockIdx.z; ish < nshell; ish += gridDim.z ) { + const uint32_t ntasks = shell_to_task[ish].ntask; + const auto shell = shell_to_task[ish].shell_device; + const auto task_idx = shell_to_task[ish].task_idx_device; + const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device; + + + // Load Shell Data into registers / SM + const uint32_t nprim = shell->nprim(); + const double3 O = *reinterpret_cast<const double3*>(shell->O_data()); + + const int global_warp_id = (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size; + const int nwarp_global = max((blockDim.x*gridDim.x) / cuda::warp_size,1); + + // Read in coeffs/exps into SM on first warp + { + auto* coeff_gm = shell->coeff_data(); + auto* alpha_gm = shell->alpha_data(); + static_assert( detail::shell_nprim_max == cuda::warp_size ); + const int warp_rank = threadIdx.x % cuda::warp_size; + my_alpha[warp_rank] = alpha_gm[warp_rank]; + my_coeff[warp_rank] = coeff_gm[warp_rank]; + } + + // Loop over tasks assigned to shells + // Place each task on a different warp + schedule across blocks + for( int itask = global_warp_id; itask < ntasks; itask += nwarp_global ) { + + const auto* task = device_tasks + task_idx[itask]; + const auto* __restrict__ points_x = task->points_x; + const auto* __restrict__ points_y = task->points_y; + const auto* __restrict__ points_z = task->points_z; + const uint32_t npts = task->npts; + const size_t shoff = task_shell_offs[itask] * npts; + + auto* __restrict__ basis_eval = task->bf + shoff; + auto* __restrict__ basis_x_eval = task->dbfx + shoff; + auto* __restrict__ basis_y_eval = task->dbfy + shoff; + auto* __restrict__ basis_z_eval = task->dbfz + shoff; + auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; + auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; + auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; + auto* __restrict__ basis_yy_eval = task->d2bfyy + shoff; + auto* __restrict__ basis_yz_eval = task->d2bfyz + shoff; + auto* __restrict__ basis_zz_eval = task->d2bfzz + shoff; + auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; + auto* __restrict__ basis_lapl_x_eval = task->d3bflapl_x + shoff; + auto* __restrict__ basis_lapl_y_eval = task->d3bflapl_y + shoff; + 
auto* __restrict__ basis_lapl_z_eval = task->d3bflapl_z + shoff; + + // Loop over points in task + // Assign each point to separate thread within the warp + #pragma unroll 1 + for( int ipt = threadIdx.x % cuda::warp_size; ipt < npts; ipt += cuda::warp_size ) { + //const double3 point = points[ipt]; + double3 point; + point.x = points_x[ipt]; + point.y = points_y[ipt]; + point.z = points_z[ipt]; + + + const auto x = point.x - O.x; + const auto y = point.y - O.y; + const auto z = point.z - O.z; + const auto rsq = x*x + y*y + z*z; + + // Evaluate radial part of bfn + double radial_eval = 0.; + double radial_eval_alpha = 0.; + double radial_eval_alpha_squared = 0.; + double radial_eval_alpha_cubed = 0.; + + #pragma unroll 1 + for( uint32_t i = 0; i < nprim; ++i ) { + const auto a = my_alpha[i]; + const auto e = my_coeff[i] * std::exp( - a * rsq ); + + radial_eval += e; + radial_eval_alpha += a * e; + radial_eval_alpha_squared += a * a * e; + radial_eval_alpha_cubed += a * a * a * e; + } + + radial_eval_alpha *= -2; + radial_eval_alpha_squared *= 4; + radial_eval_alpha_cubed *= -8; + + // Common Subexpressions + const auto x0 = x*x; + const auto x1 = x0; + const auto x2 = radial_eval_alpha*x; + const auto x3 = x2*y; + const auto x4 = x2*z; + const auto x5 = y*y; + const auto x6 = x5; + const auto x7 = y*z; + const auto x8 = radial_eval_alpha*x7; + const auto x9 = z*z; + const auto x10 = x9; + const auto x11 = 3.0*radial_eval_alpha; + const auto x12 = radial_eval_alpha_squared*x1; + const auto x13 = radial_eval_alpha + x12; + const auto x14 = x13*y; + const auto x15 = x13*z; + const auto x16 = radial_eval_alpha_squared*x6; + const auto x17 = radial_eval_alpha + x16; + const auto x18 = x*x17; + const auto x19 = radial_eval_alpha_squared*x*x7; + const auto x20 = radial_eval_alpha_squared*x10; + const auto x21 = radial_eval_alpha + x20; + const auto x22 = x*x21; + const auto x23 = x17*z; + const auto x24 = x21*y; + const auto x25 = 5.0*radial_eval_alpha; + const auto x26 = x16 + x20 + x25; + const auto x27 = x12 + x26; + const auto x28 = 3.0*radial_eval_alpha_squared; + const auto x29 = radial_eval_alpha_cubed*(x*x*x); + const auto x30 = radial_eval_alpha_cubed*x6 + radial_eval_alpha_squared; + const auto x31 = radial_eval_alpha_cubed*x10 + radial_eval_alpha_squared; + const auto x32 = 5.0*radial_eval_alpha_squared; + const auto x33 = x*x30 + x*x31 + x*x32 + x29; + const auto x34 = radial_eval_alpha_cubed*(y*y*y); + const auto x35 = radial_eval_alpha_cubed*x1 + radial_eval_alpha_squared; + const auto x36 = x31*y + x32*y + x34 + x35*y; + const auto x37 = x12 + x25; + const auto x38 = radial_eval_alpha_cubed*(z*z*z); + const auto x39 = x30*z + x32*z + x35*z + x38; + + + // Evaluate basis function + basis_eval[ipt + 0*npts] = radial_eval*x; + basis_eval[ipt + 1*npts] = radial_eval*y; + basis_eval[ipt + 2*npts] = radial_eval*z; + + + + // Evaluate first derivative of bfn wrt x + basis_x_eval[ipt + 0*npts] = radial_eval + radial_eval_alpha*x1; + basis_x_eval[ipt + 1*npts] = x3; + basis_x_eval[ipt + 2*npts] = x4; + + // Evaluate first derivative of bfn wrt y + basis_y_eval[ipt + 0*npts] = x3; + basis_y_eval[ipt + 1*npts] = radial_eval + radial_eval_alpha*x6; + basis_y_eval[ipt + 2*npts] = x8; + + // Evaluate first derivative of bfn wrt z + basis_z_eval[ipt + 0*npts] = x4; + basis_z_eval[ipt + 1*npts] = x8; + basis_z_eval[ipt + 2*npts] = radial_eval + radial_eval_alpha*x10; + + // Evaluate second derivative of bfn wrt xx + basis_xx_eval[ipt + 0*npts] = x*(x11 + x12); + basis_xx_eval[ipt + 1*npts] = x14; + 
basis_xx_eval[ipt + 2*npts] = x15; + + // Evaluate second derivative of bfn wrt xy + basis_xy_eval[ipt + 0*npts] = x14; + basis_xy_eval[ipt + 1*npts] = x18; + basis_xy_eval[ipt + 2*npts] = x19; + + // Evaluate second derivative of bfn wrt xz + basis_xz_eval[ipt + 0*npts] = x15; + basis_xz_eval[ipt + 1*npts] = x19; + basis_xz_eval[ipt + 2*npts] = x22; + + // Evaluate second derivative of bfn wrt yy + basis_yy_eval[ipt + 0*npts] = x18; + basis_yy_eval[ipt + 1*npts] = y*(x11 + x16); + basis_yy_eval[ipt + 2*npts] = x23; + + // Evaluate second derivative of bfn wrt yz + basis_yz_eval[ipt + 0*npts] = x19; + basis_yz_eval[ipt + 1*npts] = x23; + basis_yz_eval[ipt + 2*npts] = x24; + + // Evaluate second derivative of bfn wrt zz + basis_zz_eval[ipt + 0*npts] = x22; + basis_zz_eval[ipt + 1*npts] = x24; + basis_zz_eval[ipt + 2*npts] = z*(x11 + x20); + + // Evaluate Laplacian of bfn + basis_lapl_eval[ipt + 0*npts] = x*x27; + basis_lapl_eval[ipt + 1*npts] = x27*y; + basis_lapl_eval[ipt + 2*npts] = x27*z; + + // Evaluate Laplacian gradient of bfn (dx) + basis_lapl_x_eval[ipt + 0*npts] = x*(x*x28 + x29) + x0*x30 + x0*x31 + x1*x28 + x26; + basis_lapl_x_eval[ipt + 1*npts] = x33*y; + basis_lapl_x_eval[ipt + 2*npts] = x33*z; + // Evaluate Laplacian gradient of bfn (dy) + basis_lapl_y_eval[ipt + 0*npts] = x*x36; + basis_lapl_y_eval[ipt + 1*npts] = x20 + x28*x6 + x31*x5 + x35*x5 + x37 + y*(x28*y + x34); + basis_lapl_y_eval[ipt + 2*npts] = x36*z; + // Evaluate Laplacian gradient of bfn (dz) + basis_lapl_z_eval[ipt + 0*npts] = x*x39; + basis_lapl_z_eval[ipt + 1*npts] = x39*y; + basis_lapl_z_eval[ipt + 2*npts] = x10*x28 + x16 + x30*x9 + x35*x9 + x37 + z*(x28*z + x38); + + + + +#if 0 + // Evaluate the angular part of bfn + + + + double ang_eval_0; + double ang_eval_1; + double ang_eval_2; + + + ang_eval_0 = radial_eval*x; + ang_eval_1 = radial_eval*y; + ang_eval_2 = radial_eval*z; + basis_eval[ipt + 0*npts] = ang_eval_0; + basis_eval[ipt + 1*npts] = ang_eval_1; + basis_eval[ipt + 2*npts] = ang_eval_2; + + + double dang_eval_x_0, dang_eval_y_0, dang_eval_z_0; + double dang_eval_x_1, dang_eval_y_1, dang_eval_z_1; + double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; + + dang_eval_x_0 = radial_eval + radial_eval_alpha*x1; + dang_eval_y_0 = x3; + dang_eval_z_0 = x4; + dang_eval_x_1 = x3; + dang_eval_y_1 = radial_eval + radial_eval_alpha*x6; + dang_eval_z_1 = x8; + dang_eval_x_2 = x4; + dang_eval_y_2 = x8; + dang_eval_z_2 = radial_eval + radial_eval_alpha*x10; + basis_x_eval[ipt + 0*npts] = dang_eval_x_0; + basis_y_eval[ipt + 0*npts] = dang_eval_y_0; + basis_z_eval[ipt + 0*npts] = dang_eval_z_0; + basis_x_eval[ipt + 1*npts] = dang_eval_x_1; + basis_y_eval[ipt + 1*npts] = dang_eval_y_1; + basis_z_eval[ipt + 1*npts] = dang_eval_z_1; + basis_x_eval[ipt + 2*npts] = dang_eval_x_2; + basis_y_eval[ipt + 2*npts] = dang_eval_y_2; + basis_z_eval[ipt + 2*npts] = dang_eval_z_2; + +#endif + } // Loop over points within task + } // Loop over tasks + + } // Loop over shells +} // end kernel + +} // namespace GauXC diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l1_laplacian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l1_laplacian.hpp index aa5cb4de..ae4d6cc5 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l1_laplacian.hpp +++ 
b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l1_laplacian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_cartesian_laplacian_1( +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_cartesian_laplacian_1( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; // Loop over points in task @@ -103,7 +106,31 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = x*x; + const auto x1 = radial_eval_alpha*x; + const auto x2 = x1*y; + const auto x3 = x1*z; + const auto x4 = y*y; + const auto x5 = y*z; + const auto x6 = radial_eval_alpha*x5; + const auto x7 = z*z; + const auto x8 = 3.0*radial_eval_alpha; + const auto x9 = radial_eval_alpha_squared*x0; + const auto x10 = radial_eval_alpha + x9; + const auto x11 = x10*y; + const auto x12 = x10*z; + const auto x13 = radial_eval_alpha_squared*x4; + const auto x14 = radial_eval_alpha + x13; + const auto x15 = x*x14; + const auto x16 = radial_eval_alpha_squared*x*x5; + const auto x17 = radial_eval_alpha_squared*x7; + const auto x18 = radial_eval_alpha + x17; + const auto x19 = x*x18; + const auto x20 = x14*z; + const auto x21 = x18*y; + const auto x22 = 5.0*radial_eval_alpha + x13 + x17 + x9; + // Evaluate basis function basis_eval[ipt + 0*npts] = radial_eval*x; @@ -113,24 +140,26 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = radial_eval + radial_eval_alpha*x*x; - basis_x_eval[ipt + 1*npts] = radial_eval_alpha*x*y; - basis_x_eval[ipt + 2*npts] = radial_eval_alpha*x*z; + basis_x_eval[ipt + 0*npts] = radial_eval + radial_eval_alpha*x0; + basis_x_eval[ipt + 1*npts] = x2; + basis_x_eval[ipt + 2*npts] = x3; // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = radial_eval_alpha*x*y; - basis_y_eval[ipt + 1*npts] = radial_eval + radial_eval_alpha*y*y; - basis_y_eval[ipt + 2*npts] = radial_eval_alpha*y*z; + basis_y_eval[ipt + 0*npts] = x2; + basis_y_eval[ipt + 1*npts] = radial_eval + radial_eval_alpha*x4; + basis_y_eval[ipt + 2*npts] = x6; // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = radial_eval_alpha*x*z; - 
basis_z_eval[ipt + 1*npts] = radial_eval_alpha*y*z; - basis_z_eval[ipt + 2*npts] = radial_eval + radial_eval_alpha*z*z; + basis_z_eval[ipt + 0*npts] = x3; + basis_z_eval[ipt + 1*npts] = x6; + basis_z_eval[ipt + 2*npts] = radial_eval + radial_eval_alpha*x7; + // Evaluate Laplacian of bfn - basis_lapl_eval[ipt + 0*npts] = x*(5*radial_eval_alpha + radial_eval_alpha_squared*x*x + radial_eval_alpha_squared*y*y + radial_eval_alpha_squared*z*z); - basis_lapl_eval[ipt + 1*npts] = y*(5*radial_eval_alpha + radial_eval_alpha_squared*x*x + radial_eval_alpha_squared*y*y + radial_eval_alpha_squared*z*z); - basis_lapl_eval[ipt + 2*npts] = z*(5*radial_eval_alpha + radial_eval_alpha_squared*x*x + radial_eval_alpha_squared*y*y + radial_eval_alpha_squared*z*z); + basis_lapl_eval[ipt + 0*npts] = x*x22; + basis_lapl_eval[ipt + 1*npts] = x22*y; + basis_lapl_eval[ipt + 2*npts] = x22*z; + @@ -157,15 +186,15 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_1, dang_eval_y_1, dang_eval_z_1; double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; - dang_eval_x_0 = radial_eval + radial_eval_alpha*x*x; - dang_eval_y_0 = radial_eval_alpha*x*y; - dang_eval_z_0 = radial_eval_alpha*x*z; - dang_eval_x_1 = radial_eval_alpha*x*y; - dang_eval_y_1 = radial_eval + radial_eval_alpha*y*y; - dang_eval_z_1 = radial_eval_alpha*y*z; - dang_eval_x_2 = radial_eval_alpha*x*z; - dang_eval_y_2 = radial_eval_alpha*y*z; - dang_eval_z_2 = radial_eval + radial_eval_alpha*z*z; + dang_eval_x_0 = radial_eval + radial_eval_alpha*x0; + dang_eval_y_0 = x2; + dang_eval_z_0 = x3; + dang_eval_x_1 = x2; + dang_eval_y_1 = radial_eval + radial_eval_alpha*x4; + dang_eval_z_1 = x6; + dang_eval_x_2 = x3; + dang_eval_y_2 = x6; + dang_eval_z_2 = radial_eval + radial_eval_alpha*x7; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l2.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l2.hpp index b50b7c21..504a0c4a 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l2.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l2.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -64,7 +68,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_eval = task->bf + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -93,15 +96,17 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel } - + // Common Subexpressions + const auto x0 = radial_eval*x; + // Evaluate basis function - basis_eval[ipt + 0*npts] = radial_eval*x*x; - basis_eval[ipt + 1*npts] = radial_eval*x*y; - basis_eval[ipt + 2*npts] = radial_eval*x*z; - basis_eval[ipt + 3*npts] = radial_eval*y*y; + basis_eval[ipt + 0*npts] = radial_eval*(x*x); + basis_eval[ipt + 1*npts] = x0*y; + basis_eval[ipt + 2*npts] = x0*z; + basis_eval[ipt + 3*npts] = radial_eval*(y*y); basis_eval[ipt + 4*npts] = radial_eval*y*z; - basis_eval[ipt + 5*npts] = radial_eval*z*z; + basis_eval[ipt + 5*npts] = radial_eval*(z*z); @@ -110,6 +115,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel + + #if 0 // Evaluate the angular part of bfn @@ -121,17 +128,17 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = radial_eval*x*x; - ang_eval_1 = radial_eval*x*y; - ang_eval_2 = radial_eval*x*z; - ang_eval_3 = radial_eval*y*y; + ang_eval_0 = radial_eval*(x*x); + ang_eval_1 = x0*y; + ang_eval_2 = x0*z; + ang_eval_3 = radial_eval*(y*y); basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; ang_eval_0 = radial_eval*y*z; - ang_eval_1 = radial_eval*z*z; + ang_eval_1 = radial_eval*(z*z); basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l2_gradient.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l2_gradient.hpp index 4c640ddd..8eb1cdc6 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l2_gradient.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l2_gradient.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_cartesian_gradient_2( +__global__ __launch_bounds__(256,2) void collocation_device_shell_to_task_kernel_cartesian_gradient_2( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[8][detail::shell_nprim_max + 1]; + __shared__ double coeff[8][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -67,7 +71,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -99,41 +102,56 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; - + // Common Subexpressions + const auto x0 = x*x; + const auto x1 = radial_eval*x; + const auto x2 = y*y; + const auto x3 = radial_eval*y; + const auto x4 = z*z; + const auto x5 = radial_eval + radial_eval_alpha*x0; + const auto x6 = radial_eval_alpha*x; + const auto x7 = x6*y*z; + const auto x8 = radial_eval_alpha*y; + const auto x9 = radial_eval + radial_eval_alpha*x2; + const auto x10 = radial_eval_alpha*z; + const auto x11 = radial_eval + radial_eval_alpha*x4; + // Evaluate basis function - basis_eval[ipt + 0*npts] = radial_eval*x*x; - basis_eval[ipt + 1*npts] = radial_eval*x*y; - basis_eval[ipt + 2*npts] = radial_eval*x*z; - basis_eval[ipt + 3*npts] = radial_eval*y*y; - basis_eval[ipt + 4*npts] = radial_eval*y*z; - basis_eval[ipt + 5*npts] = radial_eval*z*z; + basis_eval[ipt + 0*npts] = radial_eval*x0; + basis_eval[ipt + 1*npts] = x1*y; + basis_eval[ipt + 2*npts] = x1*z; + basis_eval[ipt + 3*npts] = radial_eval*x2; + basis_eval[ipt + 4*npts] = x3*z; + basis_eval[ipt + 5*npts] = radial_eval*x4; // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = x*(2*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 1*npts] = y*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 2*npts] = z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 3*npts] = radial_eval_alpha*x*y*y; - basis_x_eval[ipt + 4*npts] = radial_eval_alpha*x*y*z; - basis_x_eval[ipt + 5*npts] = radial_eval_alpha*x*z*z; + basis_x_eval[ipt + 0*npts] = radial_eval_alpha*(x*x*x) + 2.0*x1; + basis_x_eval[ipt + 1*npts] = x5*y; + basis_x_eval[ipt + 2*npts] = x5*z; + basis_x_eval[ipt + 3*npts] = x2*x6; + basis_x_eval[ipt + 4*npts] = x7; + basis_x_eval[ipt + 5*npts] = x4*x6; // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = radial_eval_alpha*x*x*y; - basis_y_eval[ipt + 1*npts] = x*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 2*npts] = radial_eval_alpha*x*y*z; - basis_y_eval[ipt + 3*npts] = y*(2*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 4*npts] = z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 5*npts] = radial_eval_alpha*y*z*z; + basis_y_eval[ipt + 0*npts] = x0*x8; + basis_y_eval[ipt + 1*npts] = x*x9; + basis_y_eval[ipt + 2*npts] = x7; + basis_y_eval[ipt + 3*npts] = radial_eval_alpha*(y*y*y) + 2.0*x3; + basis_y_eval[ipt + 4*npts] = x9*z; + basis_y_eval[ipt + 5*npts] = x4*x8; // Evaluate first derivative 
of bfn wrt z - basis_z_eval[ipt + 0*npts] = radial_eval_alpha*x*x*z; - basis_z_eval[ipt + 1*npts] = radial_eval_alpha*x*y*z; - basis_z_eval[ipt + 2*npts] = x*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 3*npts] = radial_eval_alpha*y*y*z; - basis_z_eval[ipt + 4*npts] = y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 5*npts] = z*(2*radial_eval + radial_eval_alpha*z*z); + basis_z_eval[ipt + 0*npts] = x0*x10; + basis_z_eval[ipt + 1*npts] = x7; + basis_z_eval[ipt + 2*npts] = x*x11; + basis_z_eval[ipt + 3*npts] = x10*x2; + basis_z_eval[ipt + 4*npts] = x11*y; + basis_z_eval[ipt + 5*npts] = z*(2.0*radial_eval + radial_eval_alpha*(z*z)); + + @@ -150,17 +168,17 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = radial_eval*x*x; - ang_eval_1 = radial_eval*x*y; - ang_eval_2 = radial_eval*x*z; - ang_eval_3 = radial_eval*y*y; + ang_eval_0 = radial_eval*x0; + ang_eval_1 = x1*y; + ang_eval_2 = x1*z; + ang_eval_3 = radial_eval*x2; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = radial_eval*y*z; - ang_eval_1 = radial_eval*z*z; + ang_eval_0 = x3*z; + ang_eval_1 = radial_eval*x4; basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; @@ -170,18 +188,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; - dang_eval_x_0 = x*(2*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = radial_eval_alpha*x*x*y; - dang_eval_z_0 = radial_eval_alpha*x*x*z; - dang_eval_x_1 = y*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = x*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_1 = radial_eval_alpha*x*y*z; - dang_eval_x_2 = z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_2 = radial_eval_alpha*x*y*z; - dang_eval_z_2 = x*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_3 = radial_eval_alpha*x*y*y; - dang_eval_y_3 = y*(2*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_3 = radial_eval_alpha*y*y*z; + dang_eval_x_0 = radial_eval_alpha*(x*x*x) + 2.0*x1; + dang_eval_y_0 = x0*x8; + dang_eval_z_0 = x0*x10; + dang_eval_x_1 = x5*y; + dang_eval_y_1 = x*x9; + dang_eval_z_1 = x7; + dang_eval_x_2 = x5*z; + dang_eval_y_2 = x7; + dang_eval_z_2 = x*x11; + dang_eval_x_3 = x2*x6; + dang_eval_y_3 = radial_eval_alpha*(y*y*y) + 2.0*x3; + dang_eval_z_3 = x10*x2; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; @@ -195,12 +213,12 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = radial_eval_alpha*x*y*z; - dang_eval_y_0 = z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = y*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_1 = radial_eval_alpha*x*z*z; - dang_eval_y_1 = radial_eval_alpha*y*z*z; - dang_eval_z_1 = z*(2*radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x7; + dang_eval_y_0 = x9*z; + dang_eval_z_0 = x11*y; + dang_eval_x_1 = x4*x6; + dang_eval_y_1 = x4*x8; + dang_eval_z_1 = z*(2.0*radial_eval + radial_eval_alpha*(z*z)); basis_x_eval[ipt + 4*npts] = dang_eval_x_0; basis_y_eval[ipt + 4*npts] = dang_eval_y_0; basis_z_eval[ipt + 4*npts] = dang_eval_z_0; diff --git 
a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l2_hessian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l2_hessian.hpp index d4b05d5e..8c76ac54 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l2_hessian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l2_hessian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_cartesian_hessian_2( +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_cartesian_hessian_2( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; @@ -108,89 +111,138 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = x*x; + const auto x1 = x*y; + const auto x2 = x*z; + const auto x3 = y*y; + const auto x4 = y*z; + const auto x5 = z*z; + const auto x6 = 2.0*radial_eval; + const auto x7 = x*x*x; + const auto x8 = radial_eval + radial_eval_alpha*x0; + const auto x9 = radial_eval_alpha*x; + const auto x10 = x4*x9; + const auto x11 = radial_eval_alpha*y; + const auto x12 = radial_eval_alpha*x3; + const auto x13 = radial_eval + x12; + const auto x14 = y*y*y; + const auto x15 = radial_eval_alpha*z; + const auto x16 = radial_eval_alpha*x5; + const auto x17 = radial_eval + x16; + const auto x18 = z*z*z; + const auto x19 = 4.0*radial_eval_alpha; + const auto x20 = radial_eval_alpha_squared*x0; + const auto x21 = radial_eval_alpha + x20; + const auto x22 = x0*x19 + x0*x21 + x6; + const auto x23 = 3.0*radial_eval_alpha; + const auto x24 = x20 + x23; + const auto x25 = x21*x3; + const auto x26 = x21*x4; + const auto x27 = x21*x5; + const auto x28 = radial_eval_alpha_squared*x7 + 2.0*x9; + const auto x29 = radial_eval_alpha_squared*x14 + 2.0*x11; + const auto x30 = radial_eval_alpha_squared*x3; + const auto x31 = radial_eval_alpha + x30; + const auto x32 = x2*x31; + const auto x33 = radial_eval_alpha_squared*x5; + const auto x34 = radial_eval_alpha + x33; + const auto 
x35 = x1*x34; + const auto x36 = radial_eval_alpha_squared*x18 + 2.0*x15; + const auto x37 = x0*x31; + const auto x38 = x23 + x30; + const auto x39 = x19*x3 + x3*x31 + x6; + const auto x40 = x31*x5; + const auto x41 = x0*x34; + const auto x42 = x23 + x33; + const auto x43 = x3*x34; + const auto x44 = x19*x5 + x34*x5 + x6; + const auto x45 = 7.0*radial_eval_alpha + x20 + x30 + x33; + // Evaluate basis function - basis_eval[ipt + 0*npts] = radial_eval*x*x; - basis_eval[ipt + 1*npts] = radial_eval*x*y; - basis_eval[ipt + 2*npts] = radial_eval*x*z; - basis_eval[ipt + 3*npts] = radial_eval*y*y; - basis_eval[ipt + 4*npts] = radial_eval*y*z; - basis_eval[ipt + 5*npts] = radial_eval*z*z; + basis_eval[ipt + 0*npts] = radial_eval*x0; + basis_eval[ipt + 1*npts] = radial_eval*x1; + basis_eval[ipt + 2*npts] = radial_eval*x2; + basis_eval[ipt + 3*npts] = radial_eval*x3; + basis_eval[ipt + 4*npts] = radial_eval*x4; + basis_eval[ipt + 5*npts] = radial_eval*x5; // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = x*(2*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 1*npts] = y*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 2*npts] = z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 3*npts] = radial_eval_alpha*x*y*y; - basis_x_eval[ipt + 4*npts] = radial_eval_alpha*x*y*z; - basis_x_eval[ipt + 5*npts] = radial_eval_alpha*x*z*z; + basis_x_eval[ipt + 0*npts] = radial_eval_alpha*x7 + x*x6; + basis_x_eval[ipt + 1*npts] = x8*y; + basis_x_eval[ipt + 2*npts] = x8*z; + basis_x_eval[ipt + 3*npts] = x3*x9; + basis_x_eval[ipt + 4*npts] = x10; + basis_x_eval[ipt + 5*npts] = x5*x9; // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = radial_eval_alpha*x*x*y; - basis_y_eval[ipt + 1*npts] = x*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 2*npts] = radial_eval_alpha*x*y*z; - basis_y_eval[ipt + 3*npts] = y*(2*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 4*npts] = z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 5*npts] = radial_eval_alpha*y*z*z; + basis_y_eval[ipt + 0*npts] = x0*x11; + basis_y_eval[ipt + 1*npts] = x*x13; + basis_y_eval[ipt + 2*npts] = x10; + basis_y_eval[ipt + 3*npts] = radial_eval_alpha*x14 + x6*y; + basis_y_eval[ipt + 4*npts] = x13*z; + basis_y_eval[ipt + 5*npts] = x11*x5; // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = radial_eval_alpha*x*x*z; - basis_z_eval[ipt + 1*npts] = radial_eval_alpha*x*y*z; - basis_z_eval[ipt + 2*npts] = x*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 3*npts] = radial_eval_alpha*y*y*z; - basis_z_eval[ipt + 4*npts] = y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 5*npts] = z*(2*radial_eval + radial_eval_alpha*z*z); + basis_z_eval[ipt + 0*npts] = x0*x15; + basis_z_eval[ipt + 1*npts] = x10; + basis_z_eval[ipt + 2*npts] = x*x17; + basis_z_eval[ipt + 3*npts] = x15*x3; + basis_z_eval[ipt + 4*npts] = x17*y; + basis_z_eval[ipt + 5*npts] = radial_eval_alpha*x18 + x6*z; // Evaluate second derivative of bfn wrt xx - basis_xx_eval[ipt + 0*npts] = 2*radial_eval + 5*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x; - basis_xx_eval[ipt + 1*npts] = x*y*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 2*npts] = x*z*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 3*npts] = y*y*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 4*npts] = y*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 5*npts] = 
z*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xx_eval[ipt + 0*npts] = x22; + basis_xx_eval[ipt + 1*npts] = x1*x24; + basis_xx_eval[ipt + 2*npts] = x2*x24; + basis_xx_eval[ipt + 3*npts] = x25; + basis_xx_eval[ipt + 4*npts] = x26; + basis_xx_eval[ipt + 5*npts] = x27; // Evaluate second derivative of bfn wrt xy - basis_xy_eval[ipt + 0*npts] = x*y*(2*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xy_eval[ipt + 1*npts] = radial_eval + radial_eval_alpha*x*x + radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y; - basis_xy_eval[ipt + 2*npts] = y*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xy_eval[ipt + 3*npts] = x*y*(2*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_xy_eval[ipt + 4*npts] = x*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_xy_eval[ipt + 5*npts] = radial_eval_alpha_squared*x*y*z*z; + basis_xy_eval[ipt + 0*npts] = x28*y; + basis_xy_eval[ipt + 1*npts] = radial_eval_alpha_squared*x0*x3 + x12 + x8; + basis_xy_eval[ipt + 2*npts] = x26; + basis_xy_eval[ipt + 3*npts] = x*x29; + basis_xy_eval[ipt + 4*npts] = x32; + basis_xy_eval[ipt + 5*npts] = radial_eval_alpha_squared*x1*x5; // Evaluate second derivative of bfn wrt xz - basis_xz_eval[ipt + 0*npts] = x*z*(2*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xz_eval[ipt + 1*npts] = y*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xz_eval[ipt + 2*npts] = radial_eval + radial_eval_alpha*x*x + radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z; - basis_xz_eval[ipt + 3*npts] = radial_eval_alpha_squared*x*y*y*z; - basis_xz_eval[ipt + 4*npts] = x*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_xz_eval[ipt + 5*npts] = x*z*(2*radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_xz_eval[ipt + 0*npts] = x28*z; + basis_xz_eval[ipt + 1*npts] = x26; + basis_xz_eval[ipt + 2*npts] = radial_eval_alpha_squared*x0*x5 + x16 + x8; + basis_xz_eval[ipt + 3*npts] = radial_eval_alpha_squared*x2*x3; + basis_xz_eval[ipt + 4*npts] = x35; + basis_xz_eval[ipt + 5*npts] = x*x36; // Evaluate second derivative of bfn wrt yy - basis_yy_eval[ipt + 0*npts] = x*x*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 1*npts] = x*y*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 2*npts] = x*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 3*npts] = 2*radial_eval + 5*radial_eval_alpha*y*y + radial_eval_alpha_squared*y*y*y*y; - basis_yy_eval[ipt + 4*npts] = y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 5*npts] = z*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_yy_eval[ipt + 0*npts] = x37; + basis_yy_eval[ipt + 1*npts] = x1*x38; + basis_yy_eval[ipt + 2*npts] = x32; + basis_yy_eval[ipt + 3*npts] = x39; + basis_yy_eval[ipt + 4*npts] = x38*x4; + basis_yy_eval[ipt + 5*npts] = x40; // Evaluate second derivative of bfn wrt yz - basis_yz_eval[ipt + 0*npts] = radial_eval_alpha_squared*x*x*y*z; - basis_yz_eval[ipt + 1*npts] = x*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yz_eval[ipt + 2*npts] = x*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_yz_eval[ipt + 3*npts] = y*z*(2*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yz_eval[ipt + 4*npts] = radial_eval + radial_eval_alpha*y*y + radial_eval_alpha*z*z + radial_eval_alpha_squared*y*y*z*z; - basis_yz_eval[ipt + 5*npts] = y*z*(2*radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_yz_eval[ipt + 0*npts] = 
radial_eval_alpha_squared*x0*x4; + basis_yz_eval[ipt + 1*npts] = x32; + basis_yz_eval[ipt + 2*npts] = x35; + basis_yz_eval[ipt + 3*npts] = x29*z; + basis_yz_eval[ipt + 4*npts] = radial_eval_alpha_squared*x3*x5 + x13 + x16; + basis_yz_eval[ipt + 5*npts] = x36*y; // Evaluate second derivative of bfn wrt zz - basis_zz_eval[ipt + 0*npts] = x*x*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 1*npts] = x*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 2*npts] = x*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 3*npts] = y*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 4*npts] = y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 5*npts] = 2*radial_eval + 5*radial_eval_alpha*z*z + radial_eval_alpha_squared*z*z*z*z; + basis_zz_eval[ipt + 0*npts] = x41; + basis_zz_eval[ipt + 1*npts] = x35; + basis_zz_eval[ipt + 2*npts] = x2*x42; + basis_zz_eval[ipt + 3*npts] = x43; + basis_zz_eval[ipt + 4*npts] = x4*x42; + basis_zz_eval[ipt + 5*npts] = x44; + + @@ -206,17 +258,17 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = radial_eval*x*x; - ang_eval_1 = radial_eval*x*y; - ang_eval_2 = radial_eval*x*z; - ang_eval_3 = radial_eval*y*y; + ang_eval_0 = radial_eval*x0; + ang_eval_1 = radial_eval*x1; + ang_eval_2 = radial_eval*x2; + ang_eval_3 = radial_eval*x3; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = radial_eval*y*z; - ang_eval_1 = radial_eval*z*z; + ang_eval_0 = radial_eval*x4; + ang_eval_1 = radial_eval*x5; basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; @@ -226,18 +278,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; - dang_eval_x_0 = x*(2*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = radial_eval_alpha*x*x*y; - dang_eval_z_0 = radial_eval_alpha*x*x*z; - dang_eval_x_1 = y*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = x*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_1 = radial_eval_alpha*x*y*z; - dang_eval_x_2 = z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_2 = radial_eval_alpha*x*y*z; - dang_eval_z_2 = x*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_3 = radial_eval_alpha*x*y*y; - dang_eval_y_3 = y*(2*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_3 = radial_eval_alpha*y*y*z; + dang_eval_x_0 = radial_eval_alpha*x7 + x*x6; + dang_eval_y_0 = x0*x11; + dang_eval_z_0 = x0*x15; + dang_eval_x_1 = x8*y; + dang_eval_y_1 = x*x13; + dang_eval_z_1 = x10; + dang_eval_x_2 = x8*z; + dang_eval_y_2 = x10; + dang_eval_z_2 = x*x17; + dang_eval_x_3 = x3*x9; + dang_eval_y_3 = radial_eval_alpha*x14 + x6*y; + dang_eval_z_3 = x15*x3; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; @@ -251,12 +303,12 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = radial_eval_alpha*x*y*z; - dang_eval_y_0 = z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = y*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_1 = radial_eval_alpha*x*z*z; - dang_eval_y_1 = 
radial_eval_alpha*y*z*z; - dang_eval_z_1 = z*(2*radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x10; + dang_eval_y_0 = x13*z; + dang_eval_z_0 = x17*y; + dang_eval_x_1 = x5*x9; + dang_eval_y_1 = x11*x5; + dang_eval_z_1 = radial_eval_alpha*x18 + x6*z; basis_x_eval[ipt + 4*npts] = dang_eval_x_0; basis_y_eval[ipt + 4*npts] = dang_eval_y_0; basis_z_eval[ipt + 4*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l2_lapgrad.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l2_lapgrad.hpp new file mode 100644 index 00000000..faa65ea8 --- /dev/null +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l2_lapgrad.hpp @@ -0,0 +1,400 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#pragma once +#include "collocation_device_constants.hpp" +#include "device/xc_device_task.hpp" +#include "device_specific/cuda_device_constants.hpp" +#include "device/common/shell_to_task.hpp" +#include + +namespace GauXC { + + +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_cartesian_lapgrad_2( + uint32_t nshell, + ShellToTaskDevice* __restrict__ shell_to_task, + XCDeviceTask* __restrict__ device_tasks +) { + + + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; + double* my_alpha = alpha[threadIdx.x/32]; + double* my_coeff = coeff[threadIdx.x/32]; + + for( auto ish = blockIdx.z; ish < nshell; ish += gridDim.z ) { + const uint32_t ntasks = shell_to_task[ish].ntask; + const auto shell = shell_to_task[ish].shell_device; + const auto task_idx = shell_to_task[ish].task_idx_device; + const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device; + + + // Load Shell Data into registers / SM + const uint32_t nprim = shell->nprim(); + const double3 O = *reinterpret_cast(shell->O_data()); + + const int global_warp_id = (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size; + const int nwarp_global = max((blockDim.x*gridDim.x) / cuda::warp_size,1); + + // Read in coeffs/exps into SM on first warp + { + auto* coeff_gm = shell->coeff_data(); + auto* alpha_gm = shell->alpha_data(); + static_assert( detail::shell_nprim_max == cuda::warp_size ); + const int warp_rank = threadIdx.x % cuda::warp_size; + my_alpha[warp_rank] = alpha_gm[warp_rank]; + my_coeff[warp_rank] = coeff_gm[warp_rank]; + } + + // Loop over tasks assigned to shells + // Place each task on a different warp + schedule across blocks + for( int itask = global_warp_id; itask < ntasks; itask += nwarp_global ) { + + const auto* task = device_tasks + task_idx[itask]; + const auto* __restrict__ points_x = task->points_x; + const auto* __restrict__ points_y = task->points_y; + const auto* __restrict__ points_z = task->points_z; + const uint32_t npts = task->npts; + const size_t shoff = task_shell_offs[itask] * npts; + + auto* __restrict__ basis_eval = task->bf + shoff; + auto* __restrict__ basis_x_eval = task->dbfx + shoff; + auto* __restrict__ basis_y_eval = task->dbfy + shoff; + auto* __restrict__ basis_z_eval = task->dbfz + shoff; + auto* __restrict__ 
basis_xx_eval = task->d2bfxx + shoff; + auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; + auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; + auto* __restrict__ basis_yy_eval = task->d2bfyy + shoff; + auto* __restrict__ basis_yz_eval = task->d2bfyz + shoff; + auto* __restrict__ basis_zz_eval = task->d2bfzz + shoff; + auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; + auto* __restrict__ basis_lapl_x_eval = task->d3bflapl_x + shoff; + auto* __restrict__ basis_lapl_y_eval = task->d3bflapl_y + shoff; + auto* __restrict__ basis_lapl_z_eval = task->d3bflapl_z + shoff; + + // Loop over points in task + // Assign each point to separate thread within the warp + #pragma unroll 1 + for( int ipt = threadIdx.x % cuda::warp_size; ipt < npts; ipt += cuda::warp_size ) { + //const double3 point = points[ipt]; + double3 point; + point.x = points_x[ipt]; + point.y = points_y[ipt]; + point.z = points_z[ipt]; + + + const auto x = point.x - O.x; + const auto y = point.y - O.y; + const auto z = point.z - O.z; + const auto rsq = x*x + y*y + z*z; + + // Evaluate radial part of bfn + double radial_eval = 0.; + double radial_eval_alpha = 0.; + double radial_eval_alpha_squared = 0.; + double radial_eval_alpha_cubed = 0.; + + #pragma unroll 1 + for( uint32_t i = 0; i < nprim; ++i ) { + const auto a = my_alpha[i]; + const auto e = my_coeff[i] * std::exp( - a * rsq ); + + radial_eval += e; + radial_eval_alpha += a * e; + radial_eval_alpha_squared += a * a * e; + radial_eval_alpha_cubed += a * a * a * e; + } + + radial_eval_alpha *= -2; + radial_eval_alpha_squared *= 4; + radial_eval_alpha_cubed *= -8; + + // Common Subexpressions + const auto x0 = x*x; + const auto x1 = x0; + const auto x2 = x*y; + const auto x3 = x*z; + const auto x4 = y*y; + const auto x5 = x4; + const auto x6 = y*z; + const auto x7 = z*z; + const auto x8 = x7; + const auto x9 = 2.0*radial_eval; + const auto x10 = x*x*x; + const auto x11 = radial_eval + radial_eval_alpha*x1; + const auto x12 = radial_eval_alpha*x; + const auto x13 = x12*x6; + const auto x14 = radial_eval_alpha*y; + const auto x15 = radial_eval_alpha*x5; + const auto x16 = radial_eval + x15; + const auto x17 = y*y*y; + const auto x18 = radial_eval_alpha*z; + const auto x19 = radial_eval_alpha*x8; + const auto x20 = radial_eval + x19; + const auto x21 = z*z*z; + const auto x22 = 4.0*radial_eval_alpha; + const auto x23 = radial_eval_alpha_squared*x1; + const auto x24 = radial_eval_alpha + x23; + const auto x25 = x1*x22 + x1*x24 + x9; + const auto x26 = 3.0*radial_eval_alpha; + const auto x27 = x23 + x26; + const auto x28 = x24*x5; + const auto x29 = x24*x6; + const auto x30 = x24*x8; + const auto x31 = 2.0*x12; + const auto x32 = radial_eval_alpha_squared*x10 + x31; + const auto x33 = 2.0*x14; + const auto x34 = radial_eval_alpha_squared*x17 + x33; + const auto x35 = radial_eval_alpha_squared*x5; + const auto x36 = radial_eval_alpha + x35; + const auto x37 = x3*x36; + const auto x38 = radial_eval_alpha_squared*x8; + const auto x39 = radial_eval_alpha + x38; + const auto x40 = x2*x39; + const auto x41 = 2.0*x18; + const auto x42 = radial_eval_alpha_squared*x21 + x41; + const auto x43 = x1*x36; + const auto x44 = x26 + x35; + const auto x45 = x22*x5 + x36*x5 + x9; + const auto x46 = x36*x8; + const auto x47 = x1*x39; + const auto x48 = x26 + x38; + const auto x49 = x39*x5; + const auto x50 = x22*x8 + x39*x8 + x9; + const auto x51 = x35 + x38; + const auto x52 = 7.0*radial_eval_alpha + x23 + x51; + const auto x53 = 2.0*x; + const auto x54 = 
radial_eval_alpha_cubed*x5 + radial_eval_alpha_squared; + const auto x55 = x1*x54; + const auto x56 = radial_eval_alpha_cubed*x8 + radial_eval_alpha_squared; + const auto x57 = x1*x56; + const auto x58 = radial_eval_alpha_squared*x; + const auto x59 = radial_eval_alpha_cubed*x10; + const auto x60 = 3.0*x58 + x59; + const auto x61 = 2.0*radial_eval_alpha_squared; + const auto x62 = x*x60 + x0*x54 + x0*x56 + x1*x61 + x22 + 3.0*x24 + x51; + const auto x63 = 4.0*x58; + const auto x64 = x5*x54; + const auto x65 = x5*x56; + const auto x66 = x54*x8; + const auto x67 = x56*x8; + const auto x68 = radial_eval_alpha_squared*y; + const auto x69 = 4.0*x68; + const auto x70 = radial_eval_alpha_cubed*x1 + radial_eval_alpha_squared; + const auto x71 = x1*x70; + const auto x72 = radial_eval_alpha_cubed*x17; + const auto x73 = 3.0*x68 + x72; + const auto x74 = x22 + x23; + const auto x75 = 3.0*x36 + x38 + x4*x56 + x4*x70 + x5*x61 + x73*y + x74; + const auto x76 = 2.0*y; + const auto x77 = x5*x70; + const auto x78 = x70*x8; + const auto x79 = radial_eval_alpha_squared*z; + const auto x80 = 4.0*x79; + const auto x81 = radial_eval_alpha_cubed*x21; + const auto x82 = 3.0*x79 + x81; + const auto x83 = x35 + 3.0*x39 + x54*x7 + x61*x8 + x7*x70 + x74 + x82*z; + const auto x84 = 2.0*z; + + + // Evaluate basis function + basis_eval[ipt + 0*npts] = radial_eval*x1; + basis_eval[ipt + 1*npts] = radial_eval*x2; + basis_eval[ipt + 2*npts] = radial_eval*x3; + basis_eval[ipt + 3*npts] = radial_eval*x5; + basis_eval[ipt + 4*npts] = radial_eval*x6; + basis_eval[ipt + 5*npts] = radial_eval*x8; + + + + // Evaluate first derivative of bfn wrt x + basis_x_eval[ipt + 0*npts] = radial_eval_alpha*x10 + x*x9; + basis_x_eval[ipt + 1*npts] = x11*y; + basis_x_eval[ipt + 2*npts] = x11*z; + basis_x_eval[ipt + 3*npts] = x12*x5; + basis_x_eval[ipt + 4*npts] = x13; + basis_x_eval[ipt + 5*npts] = x12*x8; + + // Evaluate first derivative of bfn wrt y + basis_y_eval[ipt + 0*npts] = x1*x14; + basis_y_eval[ipt + 1*npts] = x*x16; + basis_y_eval[ipt + 2*npts] = x13; + basis_y_eval[ipt + 3*npts] = radial_eval_alpha*x17 + x9*y; + basis_y_eval[ipt + 4*npts] = x16*z; + basis_y_eval[ipt + 5*npts] = x14*x8; + + // Evaluate first derivative of bfn wrt z + basis_z_eval[ipt + 0*npts] = x1*x18; + basis_z_eval[ipt + 1*npts] = x13; + basis_z_eval[ipt + 2*npts] = x*x20; + basis_z_eval[ipt + 3*npts] = x18*x5; + basis_z_eval[ipt + 4*npts] = x20*y; + basis_z_eval[ipt + 5*npts] = radial_eval_alpha*x21 + x9*z; + + // Evaluate second derivative of bfn wrt xx + basis_xx_eval[ipt + 0*npts] = x25; + basis_xx_eval[ipt + 1*npts] = x2*x27; + basis_xx_eval[ipt + 2*npts] = x27*x3; + basis_xx_eval[ipt + 3*npts] = x28; + basis_xx_eval[ipt + 4*npts] = x29; + basis_xx_eval[ipt + 5*npts] = x30; + + // Evaluate second derivative of bfn wrt xy + basis_xy_eval[ipt + 0*npts] = x32*y; + basis_xy_eval[ipt + 1*npts] = radial_eval_alpha_squared*x1*x5 + x11 + x15; + basis_xy_eval[ipt + 2*npts] = x29; + basis_xy_eval[ipt + 3*npts] = x*x34; + basis_xy_eval[ipt + 4*npts] = x37; + basis_xy_eval[ipt + 5*npts] = radial_eval_alpha_squared*x2*x8; + + // Evaluate second derivative of bfn wrt xz + basis_xz_eval[ipt + 0*npts] = x32*z; + basis_xz_eval[ipt + 1*npts] = x29; + basis_xz_eval[ipt + 2*npts] = radial_eval_alpha_squared*x1*x8 + x11 + x19; + basis_xz_eval[ipt + 3*npts] = radial_eval_alpha_squared*x3*x5; + basis_xz_eval[ipt + 4*npts] = x40; + basis_xz_eval[ipt + 5*npts] = x*x42; + + // Evaluate second derivative of bfn wrt yy + basis_yy_eval[ipt + 0*npts] = x43; + basis_yy_eval[ipt + 1*npts] = 
x2*x44; + basis_yy_eval[ipt + 2*npts] = x37; + basis_yy_eval[ipt + 3*npts] = x45; + basis_yy_eval[ipt + 4*npts] = x44*x6; + basis_yy_eval[ipt + 5*npts] = x46; + + // Evaluate second derivative of bfn wrt yz + basis_yz_eval[ipt + 0*npts] = radial_eval_alpha_squared*x1*x6; + basis_yz_eval[ipt + 1*npts] = x37; + basis_yz_eval[ipt + 2*npts] = x40; + basis_yz_eval[ipt + 3*npts] = x34*z; + basis_yz_eval[ipt + 4*npts] = radial_eval_alpha_squared*x5*x8 + x16 + x19; + basis_yz_eval[ipt + 5*npts] = x42*y; + + // Evaluate second derivative of bfn wrt zz + basis_zz_eval[ipt + 0*npts] = x47; + basis_zz_eval[ipt + 1*npts] = x40; + basis_zz_eval[ipt + 2*npts] = x3*x48; + basis_zz_eval[ipt + 3*npts] = x49; + basis_zz_eval[ipt + 4*npts] = x48*x6; + basis_zz_eval[ipt + 5*npts] = x50; + + // Evaluate Laplacian of bfn + basis_lapl_eval[ipt + 0*npts] = x25 + x43 + x47; + basis_lapl_eval[ipt + 1*npts] = x2*x52; + basis_lapl_eval[ipt + 2*npts] = x3*x52; + basis_lapl_eval[ipt + 3*npts] = x28 + x45 + x49; + basis_lapl_eval[ipt + 4*npts] = x52*x6; + basis_lapl_eval[ipt + 5*npts] = x30 + x46 + x50; + + // Evaluate Laplacian gradient of bfn (dx) + basis_lapl_x_eval[ipt + 0*npts] = 6.0*x*x24 + x*x55 + x*x57 + x1*x60 + 6.0*x12 + x36*x53 + x39*x53; + basis_lapl_x_eval[ipt + 1*npts] = x62*y; + basis_lapl_x_eval[ipt + 2*npts] = x62*z; + basis_lapl_x_eval[ipt + 3*npts] = x*x64 + x*x65 + x31 + x5*x60 + x5*x63; + basis_lapl_x_eval[ipt + 4*npts] = x6*(x*x54 + x*x56 + 7.0*x58 + x59); + basis_lapl_x_eval[ipt + 5*npts] = x*x66 + x*x67 + x31 + x60*x8 + x63*x8; + // Evaluate Laplacian gradient of bfn (dy) + basis_lapl_y_eval[ipt + 0*npts] = x1*x69 + x1*x73 + x33 + x57*y + x71*y; + basis_lapl_y_eval[ipt + 1*npts] = x*x75; + basis_lapl_y_eval[ipt + 2*npts] = x3*(x56*y + 7.0*x68 + x70*y + x72); + basis_lapl_y_eval[ipt + 3*npts] = 6.0*x14 + x24*x76 + 6.0*x36*y + x39*x76 + x5*x73 + x65*y + x77*y; + basis_lapl_y_eval[ipt + 4*npts] = x75*z; + basis_lapl_y_eval[ipt + 5*npts] = x33 + x67*y + x69*x8 + x73*x8 + x78*y; + // Evaluate Laplacian gradient of bfn (dz) + basis_lapl_z_eval[ipt + 0*npts] = x1*x80 + x1*x82 + x41 + x55*z + x71*z; + basis_lapl_z_eval[ipt + 1*npts] = x2*(x54*z + x70*z + 7.0*x79 + x81); + basis_lapl_z_eval[ipt + 2*npts] = x*x83; + basis_lapl_z_eval[ipt + 3*npts] = x41 + x5*x80 + x5*x82 + x64*z + x77*z; + basis_lapl_z_eval[ipt + 4*npts] = x83*y; + basis_lapl_z_eval[ipt + 5*npts] = 6.0*x18 + x24*x84 + x36*x84 + 6.0*x39*z + x66*z + x78*z + x8*x82; + + + + +#if 0 + // Evaluate the angular part of bfn + + + + double ang_eval_0; + double ang_eval_1; + double ang_eval_2; + double ang_eval_3; + + + ang_eval_0 = radial_eval*x1; + ang_eval_1 = radial_eval*x2; + ang_eval_2 = radial_eval*x3; + ang_eval_3 = radial_eval*x5; + basis_eval[ipt + 0*npts] = ang_eval_0; + basis_eval[ipt + 1*npts] = ang_eval_1; + basis_eval[ipt + 2*npts] = ang_eval_2; + basis_eval[ipt + 3*npts] = ang_eval_3; + + ang_eval_0 = radial_eval*x6; + ang_eval_1 = radial_eval*x8; + basis_eval[ipt + 4*npts] = ang_eval_0; + basis_eval[ipt + 5*npts] = ang_eval_1; + + + double dang_eval_x_0, dang_eval_y_0, dang_eval_z_0; + double dang_eval_x_1, dang_eval_y_1, dang_eval_z_1; + double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; + double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; + + dang_eval_x_0 = radial_eval_alpha*x10 + x*x9; + dang_eval_y_0 = x1*x14; + dang_eval_z_0 = x1*x18; + dang_eval_x_1 = x11*y; + dang_eval_y_1 = x*x16; + dang_eval_z_1 = x13; + dang_eval_x_2 = x11*z; + dang_eval_y_2 = x13; + dang_eval_z_2 = x*x20; + dang_eval_x_3 = x12*x5; + dang_eval_y_3 = 
radial_eval_alpha*x17 + x9*y; + dang_eval_z_3 = x18*x5; + basis_x_eval[ipt + 0*npts] = dang_eval_x_0; + basis_y_eval[ipt + 0*npts] = dang_eval_y_0; + basis_z_eval[ipt + 0*npts] = dang_eval_z_0; + basis_x_eval[ipt + 1*npts] = dang_eval_x_1; + basis_y_eval[ipt + 1*npts] = dang_eval_y_1; + basis_z_eval[ipt + 1*npts] = dang_eval_z_1; + basis_x_eval[ipt + 2*npts] = dang_eval_x_2; + basis_y_eval[ipt + 2*npts] = dang_eval_y_2; + basis_z_eval[ipt + 2*npts] = dang_eval_z_2; + basis_x_eval[ipt + 3*npts] = dang_eval_x_3; + basis_y_eval[ipt + 3*npts] = dang_eval_y_3; + basis_z_eval[ipt + 3*npts] = dang_eval_z_3; + + dang_eval_x_0 = x13; + dang_eval_y_0 = x16*z; + dang_eval_z_0 = x20*y; + dang_eval_x_1 = x12*x8; + dang_eval_y_1 = x14*x8; + dang_eval_z_1 = radial_eval_alpha*x21 + x9*z; + basis_x_eval[ipt + 4*npts] = dang_eval_x_0; + basis_y_eval[ipt + 4*npts] = dang_eval_y_0; + basis_z_eval[ipt + 4*npts] = dang_eval_z_0; + basis_x_eval[ipt + 5*npts] = dang_eval_x_1; + basis_y_eval[ipt + 5*npts] = dang_eval_y_1; + basis_z_eval[ipt + 5*npts] = dang_eval_z_1; + +#endif + } // Loop over points within task + } // Loop over tasks + + } // Loop over shells +} // end kernel + +} // namespace GauXC diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l2_laplacian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l2_laplacian.hpp index 7e3b759f..e789b5ff 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l2_laplacian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l2_laplacian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_cartesian_laplacian_2( +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_cartesian_laplacian_2( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; // Loop over points in task @@ -103,49 +106,98 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = x*x; + const auto x1 = x*y; + const auto x2 = x*z; + const auto x3 = y*y; + const auto x4 = y*z; + const auto x5 = z*z; + const auto x6 = 2.0*radial_eval; + const auto x7 = x*x*x; + const auto x8 = radial_eval + radial_eval_alpha*x0; + const auto x9 = radial_eval_alpha*x; + const auto x10 = x4*x9; + const auto x11 = radial_eval_alpha*y; + const auto x12 = radial_eval_alpha*x3; + const auto x13 = radial_eval + x12; + const auto x14 = y*y*y; + const auto x15 = radial_eval_alpha*z; + const auto x16 = radial_eval_alpha*x5; + const auto x17 = radial_eval + x16; + const auto x18 = z*z*z; + const auto x19 = 4.0*radial_eval_alpha; + const auto x20 = radial_eval_alpha_squared*x0; + const auto x21 = radial_eval_alpha + x20; + const auto x22 = x0*x19 + x0*x21 + x6; + const auto x23 = 3.0*radial_eval_alpha; + const auto x24 = x20 + x23; + const auto x25 = x21*x3; + const auto x26 = x21*x4; + const auto x27 = x21*x5; + const auto x28 = radial_eval_alpha_squared*x7 + 2.0*x9; + const auto x29 = radial_eval_alpha_squared*x14 + 2.0*x11; + const auto x30 = radial_eval_alpha_squared*x3; + const auto x31 = radial_eval_alpha + x30; + const auto x32 = x2*x31; + const auto x33 = radial_eval_alpha_squared*x5; + const auto x34 = radial_eval_alpha + x33; + const auto x35 = x1*x34; + const auto x36 = radial_eval_alpha_squared*x18 + 2.0*x15; + const auto x37 = x0*x31; + const auto x38 = x23 + x30; + const auto x39 = x19*x3 + x3*x31 + x6; + const auto x40 = x31*x5; + const auto x41 = x0*x34; + const auto x42 = x23 + x33; + const auto x43 = x3*x34; + const auto x44 = x19*x5 + x34*x5 + x6; + const auto x45 = 7.0*radial_eval_alpha + x20 + x30 + x33; + // Evaluate basis function - basis_eval[ipt + 0*npts] = radial_eval*x*x; - basis_eval[ipt + 1*npts] = radial_eval*x*y; - basis_eval[ipt + 2*npts] = radial_eval*x*z; - basis_eval[ipt + 3*npts] = radial_eval*y*y; - basis_eval[ipt + 4*npts] = radial_eval*y*z; - basis_eval[ipt + 5*npts] = radial_eval*z*z; + basis_eval[ipt + 0*npts] = radial_eval*x0; + basis_eval[ipt + 1*npts] = radial_eval*x1; + basis_eval[ipt + 2*npts] = radial_eval*x2; + basis_eval[ipt + 3*npts] = radial_eval*x3; + basis_eval[ipt + 4*npts] = radial_eval*x4; + basis_eval[ipt + 5*npts] = radial_eval*x5; // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt 
+ 0*npts] = x*(2*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 1*npts] = y*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 2*npts] = z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 3*npts] = radial_eval_alpha*x*y*y; - basis_x_eval[ipt + 4*npts] = radial_eval_alpha*x*y*z; - basis_x_eval[ipt + 5*npts] = radial_eval_alpha*x*z*z; + basis_x_eval[ipt + 0*npts] = radial_eval_alpha*x7 + x*x6; + basis_x_eval[ipt + 1*npts] = x8*y; + basis_x_eval[ipt + 2*npts] = x8*z; + basis_x_eval[ipt + 3*npts] = x3*x9; + basis_x_eval[ipt + 4*npts] = x10; + basis_x_eval[ipt + 5*npts] = x5*x9; // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = radial_eval_alpha*x*x*y; - basis_y_eval[ipt + 1*npts] = x*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 2*npts] = radial_eval_alpha*x*y*z; - basis_y_eval[ipt + 3*npts] = y*(2*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 4*npts] = z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 5*npts] = radial_eval_alpha*y*z*z; + basis_y_eval[ipt + 0*npts] = x0*x11; + basis_y_eval[ipt + 1*npts] = x*x13; + basis_y_eval[ipt + 2*npts] = x10; + basis_y_eval[ipt + 3*npts] = radial_eval_alpha*x14 + x6*y; + basis_y_eval[ipt + 4*npts] = x13*z; + basis_y_eval[ipt + 5*npts] = x11*x5; // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = radial_eval_alpha*x*x*z; - basis_z_eval[ipt + 1*npts] = radial_eval_alpha*x*y*z; - basis_z_eval[ipt + 2*npts] = x*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 3*npts] = radial_eval_alpha*y*y*z; - basis_z_eval[ipt + 4*npts] = y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 5*npts] = z*(2*radial_eval + radial_eval_alpha*z*z); + basis_z_eval[ipt + 0*npts] = x0*x15; + basis_z_eval[ipt + 1*npts] = x10; + basis_z_eval[ipt + 2*npts] = x*x17; + basis_z_eval[ipt + 3*npts] = x15*x3; + basis_z_eval[ipt + 4*npts] = x17*y; + basis_z_eval[ipt + 5*npts] = radial_eval_alpha*x18 + x6*z; + // Evaluate Laplacian of bfn - basis_lapl_eval[ipt + 0*npts] = 2*radial_eval + 7*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x + radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*x*x*z*z; - basis_lapl_eval[ipt + 1*npts] = x*y*(7*radial_eval_alpha + radial_eval_alpha_squared*x*x + radial_eval_alpha_squared*y*y + radial_eval_alpha_squared*z*z); - basis_lapl_eval[ipt + 2*npts] = x*z*(7*radial_eval_alpha + radial_eval_alpha_squared*x*x + radial_eval_alpha_squared*y*y + radial_eval_alpha_squared*z*z); - basis_lapl_eval[ipt + 3*npts] = 2*radial_eval + 7*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*y*y*y*y + radial_eval_alpha_squared*y*y*z*z; - basis_lapl_eval[ipt + 4*npts] = y*z*(7*radial_eval_alpha + radial_eval_alpha_squared*x*x + radial_eval_alpha_squared*y*y + radial_eval_alpha_squared*z*z); - basis_lapl_eval[ipt + 5*npts] = 2*radial_eval + 7*radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z + radial_eval_alpha_squared*y*y*z*z + radial_eval_alpha_squared*z*z*z*z; + basis_lapl_eval[ipt + 0*npts] = x22 + x37 + x41; + basis_lapl_eval[ipt + 1*npts] = x1*x45; + basis_lapl_eval[ipt + 2*npts] = x2*x45; + basis_lapl_eval[ipt + 3*npts] = x25 + x39 + x43; + basis_lapl_eval[ipt + 4*npts] = x4*x45; + basis_lapl_eval[ipt + 5*npts] = x27 + x40 + x44; + @@ -161,17 +213,17 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = radial_eval*x*x; - ang_eval_1 = radial_eval*x*y; - ang_eval_2 = radial_eval*x*z; - ang_eval_3 = radial_eval*y*y; + 
ang_eval_0 = radial_eval*x0; + ang_eval_1 = radial_eval*x1; + ang_eval_2 = radial_eval*x2; + ang_eval_3 = radial_eval*x3; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = radial_eval*y*z; - ang_eval_1 = radial_eval*z*z; + ang_eval_0 = radial_eval*x4; + ang_eval_1 = radial_eval*x5; basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; @@ -181,18 +233,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; - dang_eval_x_0 = x*(2*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = radial_eval_alpha*x*x*y; - dang_eval_z_0 = radial_eval_alpha*x*x*z; - dang_eval_x_1 = y*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = x*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_1 = radial_eval_alpha*x*y*z; - dang_eval_x_2 = z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_2 = radial_eval_alpha*x*y*z; - dang_eval_z_2 = x*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_3 = radial_eval_alpha*x*y*y; - dang_eval_y_3 = y*(2*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_3 = radial_eval_alpha*y*y*z; + dang_eval_x_0 = radial_eval_alpha*x7 + x*x6; + dang_eval_y_0 = x0*x11; + dang_eval_z_0 = x0*x15; + dang_eval_x_1 = x8*y; + dang_eval_y_1 = x*x13; + dang_eval_z_1 = x10; + dang_eval_x_2 = x8*z; + dang_eval_y_2 = x10; + dang_eval_z_2 = x*x17; + dang_eval_x_3 = x3*x9; + dang_eval_y_3 = radial_eval_alpha*x14 + x6*y; + dang_eval_z_3 = x15*x3; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; @@ -206,12 +258,12 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = radial_eval_alpha*x*y*z; - dang_eval_y_0 = z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = y*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_1 = radial_eval_alpha*x*z*z; - dang_eval_y_1 = radial_eval_alpha*y*z*z; - dang_eval_z_1 = z*(2*radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x10; + dang_eval_y_0 = x13*z; + dang_eval_z_0 = x17*y; + dang_eval_x_1 = x5*x9; + dang_eval_y_1 = x11*x5; + dang_eval_z_1 = radial_eval_alpha*x18 + x6*z; basis_x_eval[ipt + 4*npts] = dang_eval_x_0; basis_y_eval[ipt + 4*npts] = dang_eval_y_0; basis_z_eval[ipt + 4*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l3.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l3.hpp index 9b180257..38339b69 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l3.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l3.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -64,7 +68,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_eval = task->bf + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -93,19 +96,26 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel } - + // Common Subexpressions + const auto x0 = radial_eval*y; + const auto x1 = x*x; + const auto x2 = radial_eval*z; + const auto x3 = radial_eval*x; + const auto x4 = y*y; + const auto x5 = z*z; + // Evaluate basis function - basis_eval[ipt + 0*npts] = radial_eval*x*x*x; - basis_eval[ipt + 1*npts] = radial_eval*x*x*y; - basis_eval[ipt + 2*npts] = radial_eval*x*x*z; - basis_eval[ipt + 3*npts] = radial_eval*x*y*y; - basis_eval[ipt + 4*npts] = radial_eval*x*y*z; - basis_eval[ipt + 5*npts] = radial_eval*x*z*z; - basis_eval[ipt + 6*npts] = radial_eval*y*y*y; - basis_eval[ipt + 7*npts] = radial_eval*y*y*z; - basis_eval[ipt + 8*npts] = radial_eval*y*z*z; - basis_eval[ipt + 9*npts] = radial_eval*z*z*z; + basis_eval[ipt + 0*npts] = radial_eval*(x*x*x); + basis_eval[ipt + 1*npts] = x0*x1; + basis_eval[ipt + 2*npts] = x1*x2; + basis_eval[ipt + 3*npts] = x3*x4; + basis_eval[ipt + 4*npts] = x*x0*z; + basis_eval[ipt + 5*npts] = x3*x5; + basis_eval[ipt + 6*npts] = radial_eval*(y*y*y); + basis_eval[ipt + 7*npts] = x2*x4; + basis_eval[ipt + 8*npts] = x0*x5; + basis_eval[ipt + 9*npts] = radial_eval*(z*z*z); @@ -114,6 +124,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel + + #if 0 // Evaluate the angular part of bfn @@ -125,26 +137,26 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = radial_eval*x*x*x; - ang_eval_1 = radial_eval*x*x*y; - ang_eval_2 = radial_eval*x*x*z; - ang_eval_3 = radial_eval*x*y*y; + ang_eval_0 = radial_eval*(x*x*x); + ang_eval_1 = x0*x1; + ang_eval_2 = x1*x2; + ang_eval_3 = x3*x4; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = radial_eval*x*y*z; - ang_eval_1 = radial_eval*x*z*z; - ang_eval_2 = radial_eval*y*y*y; - ang_eval_3 = radial_eval*y*y*z; + ang_eval_0 = x*x0*z; + ang_eval_1 = x3*x5; + ang_eval_2 = radial_eval*(y*y*y); + ang_eval_3 = x2*x4; basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; basis_eval[ipt + 6*npts] = ang_eval_2; basis_eval[ipt + 7*npts] = ang_eval_3; - ang_eval_0 = radial_eval*y*z*z; - ang_eval_1 = radial_eval*z*z*z; + ang_eval_0 = x0*x5; + ang_eval_1 = radial_eval*(z*z*z); basis_eval[ipt + 8*npts] = ang_eval_0; basis_eval[ipt + 9*npts] = ang_eval_1; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l3_gradient.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l3_gradient.hpp index 459f84e9..633e1bb8 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l3_gradient.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l3_gradient.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. 
of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_cartesian_gradient_3( +__global__ __launch_bounds__(256,2) void collocation_device_shell_to_task_kernel_cartesian_gradient_3( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[8][detail::shell_nprim_max + 1]; + __shared__ double coeff[8][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -67,7 +71,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -99,57 +102,87 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; - + // Common Subexpressions + const auto x0 = x*x*x; + const auto x1 = radial_eval*y; + const auto x2 = x*x; + const auto x3 = radial_eval*z; + const auto x4 = radial_eval*x; + const auto x5 = y*y; + const auto x6 = x*z; + const auto x7 = z*z; + const auto x8 = y*y*y; + const auto x9 = z*z*z; + const auto x10 = 3.0*radial_eval; + const auto x11 = radial_eval_alpha*x0 + 2.0*x4; + const auto x12 = radial_eval*x5; + const auto x13 = radial_eval_alpha*x2*x5; + const auto x14 = y*z; + const auto x15 = radial_eval*x7; + const auto x16 = radial_eval_alpha*x2*x7; + const auto x17 = radial_eval_alpha*x; + const auto x18 = x17*x5*z; + const auto x19 = x17*x7*y; + const auto x20 = radial_eval_alpha*y; + const auto x21 = radial_eval*x2; + const auto x22 = radial_eval_alpha*x14*x2; + const auto x23 = radial_eval_alpha*x8 + 2.0*x1; + const auto x24 = radial_eval_alpha*x5*x7; + const auto x25 = radial_eval_alpha*z; + const auto x26 = radial_eval_alpha*x9 + 2.0*x3; + // Evaluate basis function - basis_eval[ipt + 0*npts] = radial_eval*x*x*x; - basis_eval[ipt + 1*npts] = radial_eval*x*x*y; - basis_eval[ipt + 2*npts] = radial_eval*x*x*z; - basis_eval[ipt + 3*npts] = radial_eval*x*y*y; - basis_eval[ipt + 4*npts] = radial_eval*x*y*z; - basis_eval[ipt + 5*npts] = radial_eval*x*z*z; - basis_eval[ipt + 6*npts] = radial_eval*y*y*y; - basis_eval[ipt + 7*npts] = radial_eval*y*y*z; - basis_eval[ipt + 8*npts] = radial_eval*y*z*z; - basis_eval[ipt + 9*npts] = radial_eval*z*z*z; + basis_eval[ipt + 0*npts] = radial_eval*x0; + basis_eval[ipt + 1*npts] = x1*x2; + basis_eval[ipt + 2*npts] = x2*x3; + basis_eval[ipt + 3*npts] = x4*x5; + basis_eval[ipt + 4*npts] = x1*x6; + basis_eval[ipt + 5*npts] = x4*x7; + basis_eval[ipt + 6*npts] = radial_eval*x8; + basis_eval[ipt + 7*npts] = x3*x5; + basis_eval[ipt + 8*npts] = x1*x7; + basis_eval[ipt + 9*npts] = radial_eval*x9; // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = x*x*(3*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 1*npts] = x*y*(2*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 2*npts] = x*z*(2*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 3*npts] = y*y*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 4*npts] = 
y*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 5*npts] = z*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 6*npts] = radial_eval_alpha*x*y*y*y; - basis_x_eval[ipt + 7*npts] = radial_eval_alpha*x*y*y*z; - basis_x_eval[ipt + 8*npts] = radial_eval_alpha*x*y*z*z; - basis_x_eval[ipt + 9*npts] = radial_eval_alpha*x*z*z*z; + basis_x_eval[ipt + 0*npts] = radial_eval_alpha*(x*x*x*x) + x10*x2; + basis_x_eval[ipt + 1*npts] = x11*y; + basis_x_eval[ipt + 2*npts] = x11*z; + basis_x_eval[ipt + 3*npts] = x12 + x13; + basis_x_eval[ipt + 4*npts] = x14*(radial_eval + radial_eval_alpha*x2); + basis_x_eval[ipt + 5*npts] = x15 + x16; + basis_x_eval[ipt + 6*npts] = x17*x8; + basis_x_eval[ipt + 7*npts] = x18; + basis_x_eval[ipt + 8*npts] = x19; + basis_x_eval[ipt + 9*npts] = x17*x9; // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = radial_eval_alpha*x*x*x*y; - basis_y_eval[ipt + 1*npts] = x*x*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 2*npts] = radial_eval_alpha*x*x*y*z; - basis_y_eval[ipt + 3*npts] = x*y*(2*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 4*npts] = x*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 5*npts] = radial_eval_alpha*x*y*z*z; - basis_y_eval[ipt + 6*npts] = y*y*(3*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 7*npts] = y*z*(2*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 8*npts] = z*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 9*npts] = radial_eval_alpha*y*z*z*z; + basis_y_eval[ipt + 0*npts] = x0*x20; + basis_y_eval[ipt + 1*npts] = x13 + x21; + basis_y_eval[ipt + 2*npts] = x22; + basis_y_eval[ipt + 3*npts] = x*x23; + basis_y_eval[ipt + 4*npts] = x6*(radial_eval + radial_eval_alpha*x5); + basis_y_eval[ipt + 5*npts] = x19; + basis_y_eval[ipt + 6*npts] = radial_eval_alpha*(y*y*y*y) + x10*x5; + basis_y_eval[ipt + 7*npts] = x23*z; + basis_y_eval[ipt + 8*npts] = x15 + x24; + basis_y_eval[ipt + 9*npts] = x20*x9; // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = radial_eval_alpha*x*x*x*z; - basis_z_eval[ipt + 1*npts] = radial_eval_alpha*x*x*y*z; - basis_z_eval[ipt + 2*npts] = x*x*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 3*npts] = radial_eval_alpha*x*y*y*z; - basis_z_eval[ipt + 4*npts] = x*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 5*npts] = x*z*(2*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 6*npts] = radial_eval_alpha*y*y*y*z; - basis_z_eval[ipt + 7*npts] = y*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 8*npts] = y*z*(2*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 9*npts] = z*z*(3*radial_eval + radial_eval_alpha*z*z); + basis_z_eval[ipt + 0*npts] = x0*x25; + basis_z_eval[ipt + 1*npts] = x22; + basis_z_eval[ipt + 2*npts] = x16 + x21; + basis_z_eval[ipt + 3*npts] = x18; + basis_z_eval[ipt + 4*npts] = x*y*(radial_eval + radial_eval_alpha*x7); + basis_z_eval[ipt + 5*npts] = x*x26; + basis_z_eval[ipt + 6*npts] = x25*x8; + basis_z_eval[ipt + 7*npts] = x12 + x24; + basis_z_eval[ipt + 8*npts] = x26*y; + basis_z_eval[ipt + 9*npts] = radial_eval_alpha*(z*z*z*z) + x10*x7; + + @@ -166,26 +199,26 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = radial_eval*x*x*x; - ang_eval_1 = radial_eval*x*x*y; - ang_eval_2 = radial_eval*x*x*z; - ang_eval_3 = radial_eval*x*y*y; + ang_eval_0 = radial_eval*x0; + ang_eval_1 = x1*x2; + ang_eval_2 = x2*x3; + ang_eval_3 = x4*x5; basis_eval[ipt + 0*npts] = ang_eval_0; 
basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = radial_eval*x*y*z; - ang_eval_1 = radial_eval*x*z*z; - ang_eval_2 = radial_eval*y*y*y; - ang_eval_3 = radial_eval*y*y*z; + ang_eval_0 = x1*x6; + ang_eval_1 = x4*x7; + ang_eval_2 = radial_eval*x8; + ang_eval_3 = x3*x5; basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; basis_eval[ipt + 6*npts] = ang_eval_2; basis_eval[ipt + 7*npts] = ang_eval_3; - ang_eval_0 = radial_eval*y*z*z; - ang_eval_1 = radial_eval*z*z*z; + ang_eval_0 = x1*x7; + ang_eval_1 = radial_eval*x9; basis_eval[ipt + 8*npts] = ang_eval_0; basis_eval[ipt + 9*npts] = ang_eval_1; @@ -195,18 +228,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; - dang_eval_x_0 = x*x*(3*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = radial_eval_alpha*x*x*x*y; - dang_eval_z_0 = radial_eval_alpha*x*x*x*z; - dang_eval_x_1 = x*y*(2*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = x*x*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_1 = radial_eval_alpha*x*x*y*z; - dang_eval_x_2 = x*z*(2*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_2 = radial_eval_alpha*x*x*y*z; - dang_eval_z_2 = x*x*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_3 = y*y*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_3 = x*y*(2*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_3 = radial_eval_alpha*x*y*y*z; + dang_eval_x_0 = radial_eval_alpha*(x*x*x*x) + x10*x2; + dang_eval_y_0 = x0*x20; + dang_eval_z_0 = x0*x25; + dang_eval_x_1 = x11*y; + dang_eval_y_1 = x13 + x21; + dang_eval_z_1 = x22; + dang_eval_x_2 = x11*z; + dang_eval_y_2 = x22; + dang_eval_z_2 = x16 + x21; + dang_eval_x_3 = x12 + x13; + dang_eval_y_3 = x*x23; + dang_eval_z_3 = x18; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; @@ -220,18 +253,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = y*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = x*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = x*y*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_1 = z*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = radial_eval_alpha*x*y*z*z; - dang_eval_z_1 = x*z*(2*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = radial_eval_alpha*x*y*y*y; - dang_eval_y_2 = y*y*(3*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_2 = radial_eval_alpha*y*y*y*z; - dang_eval_x_3 = radial_eval_alpha*x*y*y*z; - dang_eval_y_3 = y*z*(2*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_3 = y*y*(radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x14*(radial_eval + radial_eval_alpha*x2); + dang_eval_y_0 = x6*(radial_eval + radial_eval_alpha*x5); + dang_eval_z_0 = x*y*(radial_eval + radial_eval_alpha*x7); + dang_eval_x_1 = x15 + x16; + dang_eval_y_1 = x19; + dang_eval_z_1 = x*x26; + dang_eval_x_2 = x17*x8; + dang_eval_y_2 = radial_eval_alpha*(y*y*y*y) + x10*x5; + dang_eval_z_2 = x25*x8; + dang_eval_x_3 = x18; + dang_eval_y_3 = x23*z; + dang_eval_z_3 = x12 + x24; basis_x_eval[ipt + 4*npts] = dang_eval_x_0; basis_y_eval[ipt + 4*npts] = dang_eval_y_0; basis_z_eval[ipt + 4*npts] = dang_eval_z_0; @@ -245,12 +278,12 @@ __global__ __launch_bounds__(512,2) void 
collocation_device_shell_to_task_kernel basis_y_eval[ipt + 7*npts] = dang_eval_y_3; basis_z_eval[ipt + 7*npts] = dang_eval_z_3; - dang_eval_x_0 = radial_eval_alpha*x*y*z*z; - dang_eval_y_0 = z*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = y*z*(2*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_1 = radial_eval_alpha*x*z*z*z; - dang_eval_y_1 = radial_eval_alpha*y*z*z*z; - dang_eval_z_1 = z*z*(3*radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x19; + dang_eval_y_0 = x15 + x24; + dang_eval_z_0 = x26*y; + dang_eval_x_1 = x17*x9; + dang_eval_y_1 = x20*x9; + dang_eval_z_1 = radial_eval_alpha*(z*z*z*z) + x10*x7; basis_x_eval[ipt + 8*npts] = dang_eval_x_0; basis_y_eval[ipt + 8*npts] = dang_eval_y_0; basis_z_eval[ipt + 8*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l3_hessian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l3_hessian.hpp index 31178f04..6ce4a6c3 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l3_hessian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l3_hessian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_cartesian_hessian_3( +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_cartesian_hessian_3( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; @@ -108,129 +111,225 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = x*x*x; + const auto x1 = radial_eval*y; + const auto x2 = x*x; + const auto x3 = radial_eval*z; + const auto x4 = radial_eval*x; + const auto x5 = y*y; + const auto x6 = x*z; + const auto x7 = z*z; + const auto x8 = y*y*y; + const auto x9 = z*z*z; + const auto x10 = x*x*x*x; + const auto x11 = 3.0*radial_eval; + const auto x12 = radial_eval_alpha*x0 + 2.0*x4; + const auto x13 = radial_eval*x5; + const auto x14 = x2*x5; + const auto x15 = radial_eval_alpha*x14; + const auto x16 = 
y*z; + const auto x17 = radial_eval_alpha*x2; + const auto x18 = radial_eval + x17; + const auto x19 = radial_eval*x7; + const auto x20 = x2*x7; + const auto x21 = radial_eval_alpha*x20; + const auto x22 = radial_eval_alpha*x; + const auto x23 = x22*x5*z; + const auto x24 = x22*x7*y; + const auto x25 = radial_eval_alpha*y; + const auto x26 = radial_eval*x2; + const auto x27 = radial_eval_alpha*x16*x2; + const auto x28 = radial_eval_alpha*x8 + 2.0*x1; + const auto x29 = radial_eval_alpha*x5; + const auto x30 = radial_eval + x29; + const auto x31 = y*y*y*y; + const auto x32 = x5*x7; + const auto x33 = radial_eval_alpha*x32; + const auto x34 = radial_eval_alpha*z; + const auto x35 = x*y; + const auto x36 = radial_eval_alpha*x7; + const auto x37 = radial_eval_alpha*x9 + 2.0*x3; + const auto x38 = z*z*z*z; + const auto x39 = 6.0*radial_eval_alpha; + const auto x40 = radial_eval_alpha_squared*x2; + const auto x41 = radial_eval_alpha + x40; + const auto x42 = x0*x39 + x0*x41 + 6.0*x4; + const auto x43 = 4.0*radial_eval_alpha; + const auto x44 = 2.0*radial_eval; + const auto x45 = x2*x41 + x44; + const auto x46 = x2*x43 + x45; + const auto x47 = 2.0*radial_eval_alpha; + const auto x48 = x47*x5; + const auto x49 = x41*x5; + const auto x50 = x*x16; + const auto x51 = 3.0*radial_eval_alpha; + const auto x52 = x47*x7; + const auto x53 = x41*x7; + const auto x54 = x41*x8; + const auto x55 = x41*x9; + const auto x56 = radial_eval_alpha_squared*x10 + x2*x51; + const auto x57 = 2.0*x22; + const auto x58 = x16*(radial_eval_alpha_squared*x0 + x57); + const auto x59 = 2.0*x25; + const auto x60 = radial_eval_alpha_squared*x14; + const auto x61 = x29 + x60; + const auto x62 = radial_eval_alpha_squared*x20; + const auto x63 = x36 + x62; + const auto x64 = radial_eval_alpha_squared*x31 + x5*x51; + const auto x65 = x6*(radial_eval_alpha_squared*x8 + x59); + const auto x66 = radial_eval_alpha_squared*x32; + const auto x67 = x36 + x66; + const auto x68 = 2.0*x34; + const auto x69 = x35*(radial_eval_alpha_squared*x9 + x68); + const auto x70 = radial_eval_alpha_squared*x38 + x51*x7; + const auto x71 = radial_eval_alpha_squared*x5; + const auto x72 = radial_eval_alpha + x71; + const auto x73 = x0*x72; + const auto x74 = x2*x47; + const auto x75 = x2*x72; + const auto x76 = x44 + x5*x72; + const auto x77 = x43*x5 + x76; + const auto x78 = x7*x72; + const auto x79 = 6.0*x1 + x39*x8 + x72*x8; + const auto x80 = x72*x9; + const auto x81 = radial_eval_alpha_squared*x7; + const auto x82 = radial_eval_alpha + x81; + const auto x83 = x0*x82; + const auto x84 = x2*x82; + const auto x85 = x5*x82; + const auto x86 = x44 + x7*x82; + const auto x87 = x43*x7 + x86; + const auto x88 = x8*x82; + const auto x89 = 6.0*x3 + x39*x9 + x82*x9; + const auto x90 = x2*x39 + x45 + x75 + x84; + const auto x91 = x39*x5 + x49 + x76 + x85; + const auto x92 = x39*x7 + x53 + x78 + x86; + // Evaluate basis function - basis_eval[ipt + 0*npts] = radial_eval*x*x*x; - basis_eval[ipt + 1*npts] = radial_eval*x*x*y; - basis_eval[ipt + 2*npts] = radial_eval*x*x*z; - basis_eval[ipt + 3*npts] = radial_eval*x*y*y; - basis_eval[ipt + 4*npts] = radial_eval*x*y*z; - basis_eval[ipt + 5*npts] = radial_eval*x*z*z; - basis_eval[ipt + 6*npts] = radial_eval*y*y*y; - basis_eval[ipt + 7*npts] = radial_eval*y*y*z; - basis_eval[ipt + 8*npts] = radial_eval*y*z*z; - basis_eval[ipt + 9*npts] = radial_eval*z*z*z; + basis_eval[ipt + 0*npts] = radial_eval*x0; + basis_eval[ipt + 1*npts] = x1*x2; + basis_eval[ipt + 2*npts] = x2*x3; + basis_eval[ipt + 3*npts] = x4*x5; + 
basis_eval[ipt + 4*npts] = x1*x6; + basis_eval[ipt + 5*npts] = x4*x7; + basis_eval[ipt + 6*npts] = radial_eval*x8; + basis_eval[ipt + 7*npts] = x3*x5; + basis_eval[ipt + 8*npts] = x1*x7; + basis_eval[ipt + 9*npts] = radial_eval*x9; // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = x*x*(3*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 1*npts] = x*y*(2*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 2*npts] = x*z*(2*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 3*npts] = y*y*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 4*npts] = y*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 5*npts] = z*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 6*npts] = radial_eval_alpha*x*y*y*y; - basis_x_eval[ipt + 7*npts] = radial_eval_alpha*x*y*y*z; - basis_x_eval[ipt + 8*npts] = radial_eval_alpha*x*y*z*z; - basis_x_eval[ipt + 9*npts] = radial_eval_alpha*x*z*z*z; + basis_x_eval[ipt + 0*npts] = radial_eval_alpha*x10 + x11*x2; + basis_x_eval[ipt + 1*npts] = x12*y; + basis_x_eval[ipt + 2*npts] = x12*z; + basis_x_eval[ipt + 3*npts] = x13 + x15; + basis_x_eval[ipt + 4*npts] = x16*x18; + basis_x_eval[ipt + 5*npts] = x19 + x21; + basis_x_eval[ipt + 6*npts] = x22*x8; + basis_x_eval[ipt + 7*npts] = x23; + basis_x_eval[ipt + 8*npts] = x24; + basis_x_eval[ipt + 9*npts] = x22*x9; // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = radial_eval_alpha*x*x*x*y; - basis_y_eval[ipt + 1*npts] = x*x*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 2*npts] = radial_eval_alpha*x*x*y*z; - basis_y_eval[ipt + 3*npts] = x*y*(2*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 4*npts] = x*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 5*npts] = radial_eval_alpha*x*y*z*z; - basis_y_eval[ipt + 6*npts] = y*y*(3*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 7*npts] = y*z*(2*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 8*npts] = z*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 9*npts] = radial_eval_alpha*y*z*z*z; + basis_y_eval[ipt + 0*npts] = x0*x25; + basis_y_eval[ipt + 1*npts] = x15 + x26; + basis_y_eval[ipt + 2*npts] = x27; + basis_y_eval[ipt + 3*npts] = x*x28; + basis_y_eval[ipt + 4*npts] = x30*x6; + basis_y_eval[ipt + 5*npts] = x24; + basis_y_eval[ipt + 6*npts] = radial_eval_alpha*x31 + x11*x5; + basis_y_eval[ipt + 7*npts] = x28*z; + basis_y_eval[ipt + 8*npts] = x19 + x33; + basis_y_eval[ipt + 9*npts] = x25*x9; // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = radial_eval_alpha*x*x*x*z; - basis_z_eval[ipt + 1*npts] = radial_eval_alpha*x*x*y*z; - basis_z_eval[ipt + 2*npts] = x*x*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 3*npts] = radial_eval_alpha*x*y*y*z; - basis_z_eval[ipt + 4*npts] = x*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 5*npts] = x*z*(2*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 6*npts] = radial_eval_alpha*y*y*y*z; - basis_z_eval[ipt + 7*npts] = y*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 8*npts] = y*z*(2*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 9*npts] = z*z*(3*radial_eval + radial_eval_alpha*z*z); + basis_z_eval[ipt + 0*npts] = x0*x34; + basis_z_eval[ipt + 1*npts] = x27; + basis_z_eval[ipt + 2*npts] = x21 + x26; + basis_z_eval[ipt + 3*npts] = x23; + basis_z_eval[ipt + 4*npts] = x35*(radial_eval + x36); + basis_z_eval[ipt + 5*npts] = x*x37; + basis_z_eval[ipt + 6*npts] = x34*x8; + basis_z_eval[ipt + 7*npts] = x13 + 
x33; + basis_z_eval[ipt + 8*npts] = x37*y; + basis_z_eval[ipt + 9*npts] = radial_eval_alpha*x38 + x11*x7; // Evaluate second derivative of bfn wrt xx - basis_xx_eval[ipt + 0*npts] = x*(6*radial_eval + 7*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x); - basis_xx_eval[ipt + 1*npts] = y*(2*radial_eval + 5*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x); - basis_xx_eval[ipt + 2*npts] = z*(2*radial_eval + 5*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x); - basis_xx_eval[ipt + 3*npts] = x*y*y*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 4*npts] = x*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 5*npts] = x*z*z*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 6*npts] = y*y*y*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 7*npts] = y*y*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 8*npts] = y*z*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 9*npts] = z*z*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xx_eval[ipt + 0*npts] = x42; + basis_xx_eval[ipt + 1*npts] = x46*y; + basis_xx_eval[ipt + 2*npts] = x46*z; + basis_xx_eval[ipt + 3*npts] = x*(x48 + x49); + basis_xx_eval[ipt + 4*npts] = x50*(x40 + x51); + basis_xx_eval[ipt + 5*npts] = x*(x52 + x53); + basis_xx_eval[ipt + 6*npts] = x54; + basis_xx_eval[ipt + 7*npts] = x49*z; + basis_xx_eval[ipt + 8*npts] = x53*y; + basis_xx_eval[ipt + 9*npts] = x55; // Evaluate second derivative of bfn wrt xy - basis_xy_eval[ipt + 0*npts] = x*x*y*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xy_eval[ipt + 1*npts] = x*(2*radial_eval + radial_eval_alpha*x*x + 2*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y); - basis_xy_eval[ipt + 2*npts] = x*y*z*(2*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xy_eval[ipt + 3*npts] = y*(2*radial_eval + 2*radial_eval_alpha*x*x + radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y); - basis_xy_eval[ipt + 4*npts] = z*(radial_eval + radial_eval_alpha*x*x + radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y); - basis_xy_eval[ipt + 5*npts] = y*z*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xy_eval[ipt + 6*npts] = x*y*y*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_xy_eval[ipt + 7*npts] = x*y*z*(2*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_xy_eval[ipt + 8*npts] = x*z*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_xy_eval[ipt + 9*npts] = radial_eval_alpha_squared*x*y*z*z*z; + basis_xy_eval[ipt + 0*npts] = x56*y; + basis_xy_eval[ipt + 1*npts] = radial_eval_alpha_squared*x0*x5 + x12 + x5*x57; + basis_xy_eval[ipt + 2*npts] = x58; + basis_xy_eval[ipt + 3*npts] = radial_eval_alpha_squared*x2*x8 + x2*x59 + x28; + basis_xy_eval[ipt + 4*npts] = z*(x18 + x61); + basis_xy_eval[ipt + 5*npts] = x63*y; + basis_xy_eval[ipt + 6*npts] = x*x64; + basis_xy_eval[ipt + 7*npts] = x65; + basis_xy_eval[ipt + 8*npts] = x*x67; + basis_xy_eval[ipt + 9*npts] = radial_eval_alpha_squared*x35*x9; // Evaluate second derivative of bfn wrt xz - basis_xz_eval[ipt + 0*npts] = x*x*z*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xz_eval[ipt + 1*npts] = x*y*z*(2*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xz_eval[ipt + 2*npts] = x*(2*radial_eval + radial_eval_alpha*x*x + 2*radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z); - basis_xz_eval[ipt + 3*npts] = y*y*z*(radial_eval_alpha + 
radial_eval_alpha_squared*x*x); - basis_xz_eval[ipt + 4*npts] = y*(radial_eval + radial_eval_alpha*x*x + radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z); - basis_xz_eval[ipt + 5*npts] = z*(2*radial_eval + 2*radial_eval_alpha*x*x + radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z); - basis_xz_eval[ipt + 6*npts] = radial_eval_alpha_squared*x*y*y*y*z; - basis_xz_eval[ipt + 7*npts] = x*y*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_xz_eval[ipt + 8*npts] = x*y*z*(2*radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_xz_eval[ipt + 9*npts] = x*z*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_xz_eval[ipt + 0*npts] = x56*z; + basis_xz_eval[ipt + 1*npts] = x58; + basis_xz_eval[ipt + 2*npts] = radial_eval_alpha_squared*x0*x7 + x12 + x57*x7; + basis_xz_eval[ipt + 3*npts] = x61*z; + basis_xz_eval[ipt + 4*npts] = y*(x18 + x63); + basis_xz_eval[ipt + 5*npts] = radial_eval_alpha_squared*x2*x9 + x2*x68 + x37; + basis_xz_eval[ipt + 6*npts] = radial_eval_alpha_squared*x6*x8; + basis_xz_eval[ipt + 7*npts] = x*(x29 + x66); + basis_xz_eval[ipt + 8*npts] = x69; + basis_xz_eval[ipt + 9*npts] = x*x70; // Evaluate second derivative of bfn wrt yy - basis_yy_eval[ipt + 0*npts] = x*x*x*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 1*npts] = x*x*y*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 2*npts] = x*x*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 3*npts] = x*(2*radial_eval + 5*radial_eval_alpha*y*y + radial_eval_alpha_squared*y*y*y*y); - basis_yy_eval[ipt + 4*npts] = x*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 5*npts] = x*z*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 6*npts] = y*(6*radial_eval + 7*radial_eval_alpha*y*y + radial_eval_alpha_squared*y*y*y*y); - basis_yy_eval[ipt + 7*npts] = z*(2*radial_eval + 5*radial_eval_alpha*y*y + radial_eval_alpha_squared*y*y*y*y); - basis_yy_eval[ipt + 8*npts] = y*z*z*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 9*npts] = z*z*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_yy_eval[ipt + 0*npts] = x73; + basis_yy_eval[ipt + 1*npts] = y*(x74 + x75); + basis_yy_eval[ipt + 2*npts] = x75*z; + basis_yy_eval[ipt + 3*npts] = x*x77; + basis_yy_eval[ipt + 4*npts] = x50*(x51 + x71); + basis_yy_eval[ipt + 5*npts] = x*x78; + basis_yy_eval[ipt + 6*npts] = x79; + basis_yy_eval[ipt + 7*npts] = x77*z; + basis_yy_eval[ipt + 8*npts] = y*(x52 + x78); + basis_yy_eval[ipt + 9*npts] = x80; // Evaluate second derivative of bfn wrt yz - basis_yz_eval[ipt + 0*npts] = radial_eval_alpha_squared*x*x*x*y*z; - basis_yz_eval[ipt + 1*npts] = x*x*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yz_eval[ipt + 2*npts] = x*x*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_yz_eval[ipt + 3*npts] = x*y*z*(2*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yz_eval[ipt + 4*npts] = x*(radial_eval + radial_eval_alpha*y*y + radial_eval_alpha*z*z + radial_eval_alpha_squared*y*y*z*z); - basis_yz_eval[ipt + 5*npts] = x*y*z*(2*radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_yz_eval[ipt + 6*npts] = y*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yz_eval[ipt + 7*npts] = y*(2*radial_eval + radial_eval_alpha*y*y + 2*radial_eval_alpha*z*z + radial_eval_alpha_squared*y*y*z*z); - basis_yz_eval[ipt + 8*npts] = z*(2*radial_eval + 2*radial_eval_alpha*y*y + radial_eval_alpha*z*z + 
radial_eval_alpha_squared*y*y*z*z); - basis_yz_eval[ipt + 9*npts] = y*z*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_yz_eval[ipt + 0*npts] = radial_eval_alpha_squared*x0*x16; + basis_yz_eval[ipt + 1*npts] = z*(x17 + x60); + basis_yz_eval[ipt + 2*npts] = y*(x17 + x62); + basis_yz_eval[ipt + 3*npts] = x65; + basis_yz_eval[ipt + 4*npts] = x*(x30 + x67); + basis_yz_eval[ipt + 5*npts] = x69; + basis_yz_eval[ipt + 6*npts] = x64*z; + basis_yz_eval[ipt + 7*npts] = radial_eval_alpha_squared*x7*x8 + x28 + x59*x7; + basis_yz_eval[ipt + 8*npts] = radial_eval_alpha_squared*x5*x9 + x37 + x5*x68; + basis_yz_eval[ipt + 9*npts] = x70*y; // Evaluate second derivative of bfn wrt zz - basis_zz_eval[ipt + 0*npts] = x*x*x*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 1*npts] = x*x*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 2*npts] = x*x*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 3*npts] = x*y*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 4*npts] = x*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 5*npts] = x*(2*radial_eval + 5*radial_eval_alpha*z*z + radial_eval_alpha_squared*z*z*z*z); - basis_zz_eval[ipt + 6*npts] = y*y*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 7*npts] = y*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 8*npts] = y*(2*radial_eval + 5*radial_eval_alpha*z*z + radial_eval_alpha_squared*z*z*z*z); - basis_zz_eval[ipt + 9*npts] = z*(6*radial_eval + 7*radial_eval_alpha*z*z + radial_eval_alpha_squared*z*z*z*z); + basis_zz_eval[ipt + 0*npts] = x83; + basis_zz_eval[ipt + 1*npts] = x84*y; + basis_zz_eval[ipt + 2*npts] = z*(x74 + x84); + basis_zz_eval[ipt + 3*npts] = x*x85; + basis_zz_eval[ipt + 4*npts] = x50*(x51 + x81); + basis_zz_eval[ipt + 5*npts] = x*x87; + basis_zz_eval[ipt + 6*npts] = x88; + basis_zz_eval[ipt + 7*npts] = z*(x48 + x85); + basis_zz_eval[ipt + 8*npts] = x87*y; + basis_zz_eval[ipt + 9*npts] = x89; + + @@ -246,26 +345,26 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = radial_eval*x*x*x; - ang_eval_1 = radial_eval*x*x*y; - ang_eval_2 = radial_eval*x*x*z; - ang_eval_3 = radial_eval*x*y*y; + ang_eval_0 = radial_eval*x0; + ang_eval_1 = x1*x2; + ang_eval_2 = x2*x3; + ang_eval_3 = x4*x5; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = radial_eval*x*y*z; - ang_eval_1 = radial_eval*x*z*z; - ang_eval_2 = radial_eval*y*y*y; - ang_eval_3 = radial_eval*y*y*z; + ang_eval_0 = x1*x6; + ang_eval_1 = x4*x7; + ang_eval_2 = radial_eval*x8; + ang_eval_3 = x3*x5; basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; basis_eval[ipt + 6*npts] = ang_eval_2; basis_eval[ipt + 7*npts] = ang_eval_3; - ang_eval_0 = radial_eval*y*z*z; - ang_eval_1 = radial_eval*z*z*z; + ang_eval_0 = x1*x7; + ang_eval_1 = radial_eval*x9; basis_eval[ipt + 8*npts] = ang_eval_0; basis_eval[ipt + 9*npts] = ang_eval_1; @@ -275,18 +374,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; - dang_eval_x_0 = x*x*(3*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = radial_eval_alpha*x*x*x*y; - dang_eval_z_0 = radial_eval_alpha*x*x*x*z; - 
dang_eval_x_1 = x*y*(2*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = x*x*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_1 = radial_eval_alpha*x*x*y*z; - dang_eval_x_2 = x*z*(2*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_2 = radial_eval_alpha*x*x*y*z; - dang_eval_z_2 = x*x*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_3 = y*y*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_3 = x*y*(2*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_3 = radial_eval_alpha*x*y*y*z; + dang_eval_x_0 = radial_eval_alpha*x10 + x11*x2; + dang_eval_y_0 = x0*x25; + dang_eval_z_0 = x0*x34; + dang_eval_x_1 = x12*y; + dang_eval_y_1 = x15 + x26; + dang_eval_z_1 = x27; + dang_eval_x_2 = x12*z; + dang_eval_y_2 = x27; + dang_eval_z_2 = x21 + x26; + dang_eval_x_3 = x13 + x15; + dang_eval_y_3 = x*x28; + dang_eval_z_3 = x23; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; @@ -300,18 +399,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = y*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = x*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = x*y*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_1 = z*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = radial_eval_alpha*x*y*z*z; - dang_eval_z_1 = x*z*(2*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = radial_eval_alpha*x*y*y*y; - dang_eval_y_2 = y*y*(3*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_2 = radial_eval_alpha*y*y*y*z; - dang_eval_x_3 = radial_eval_alpha*x*y*y*z; - dang_eval_y_3 = y*z*(2*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_3 = y*y*(radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x16*x18; + dang_eval_y_0 = x30*x6; + dang_eval_z_0 = x35*(radial_eval + x36); + dang_eval_x_1 = x19 + x21; + dang_eval_y_1 = x24; + dang_eval_z_1 = x*x37; + dang_eval_x_2 = x22*x8; + dang_eval_y_2 = radial_eval_alpha*x31 + x11*x5; + dang_eval_z_2 = x34*x8; + dang_eval_x_3 = x23; + dang_eval_y_3 = x28*z; + dang_eval_z_3 = x13 + x33; basis_x_eval[ipt + 4*npts] = dang_eval_x_0; basis_y_eval[ipt + 4*npts] = dang_eval_y_0; basis_z_eval[ipt + 4*npts] = dang_eval_z_0; @@ -325,12 +424,12 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 7*npts] = dang_eval_y_3; basis_z_eval[ipt + 7*npts] = dang_eval_z_3; - dang_eval_x_0 = radial_eval_alpha*x*y*z*z; - dang_eval_y_0 = z*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = y*z*(2*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_1 = radial_eval_alpha*x*z*z*z; - dang_eval_y_1 = radial_eval_alpha*y*z*z*z; - dang_eval_z_1 = z*z*(3*radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x24; + dang_eval_y_0 = x19 + x33; + dang_eval_z_0 = x37*y; + dang_eval_x_1 = x22*x9; + dang_eval_y_1 = x25*x9; + dang_eval_z_1 = radial_eval_alpha*x38 + x11*x7; basis_x_eval[ipt + 8*npts] = dang_eval_x_0; basis_y_eval[ipt + 8*npts] = dang_eval_y_0; basis_z_eval[ipt + 8*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l3_lapgrad.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l3_lapgrad.hpp new file mode 100644 index 00000000..ebeee17b --- /dev/null +++ 
b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l3_lapgrad.hpp @@ -0,0 +1,565 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#pragma once +#include "collocation_device_constants.hpp" +#include "device/xc_device_task.hpp" +#include "device_specific/cuda_device_constants.hpp" +#include "device/common/shell_to_task.hpp" +#include + +namespace GauXC { + + +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_cartesian_lapgrad_3( + uint32_t nshell, + ShellToTaskDevice* __restrict__ shell_to_task, + XCDeviceTask* __restrict__ device_tasks +) { + + + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; + double* my_alpha = alpha[threadIdx.x/32]; + double* my_coeff = coeff[threadIdx.x/32]; + + for( auto ish = blockIdx.z; ish < nshell; ish += gridDim.z ) { + const uint32_t ntasks = shell_to_task[ish].ntask; + const auto shell = shell_to_task[ish].shell_device; + const auto task_idx = shell_to_task[ish].task_idx_device; + const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device; + + + // Load Shell Data into registers / SM + const uint32_t nprim = shell->nprim(); + const double3 O = *reinterpret_cast<const double3*>(shell->O_data()); + + const int global_warp_id = (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size; + const int nwarp_global = max((blockDim.x*gridDim.x) / cuda::warp_size,1); + + // Read in coeffs/exps into SM on first warp + { + auto* coeff_gm = shell->coeff_data(); + auto* alpha_gm = shell->alpha_data(); + static_assert( detail::shell_nprim_max == cuda::warp_size ); + const int warp_rank = threadIdx.x % cuda::warp_size; + my_alpha[warp_rank] = alpha_gm[warp_rank]; + my_coeff[warp_rank] = coeff_gm[warp_rank]; + } + + // Loop over tasks assigned to shells + // Place each task on a different warp + schedule across blocks + for( int itask = global_warp_id; itask < ntasks; itask += nwarp_global ) { + + const auto* task = device_tasks + task_idx[itask]; + const auto* __restrict__ points_x = task->points_x; + const auto* __restrict__ points_y = task->points_y; + const auto* __restrict__ points_z = task->points_z; + const uint32_t npts = task->npts; + const size_t shoff = task_shell_offs[itask] * npts; + + auto* __restrict__ basis_eval = task->bf + shoff; + auto* __restrict__ basis_x_eval = task->dbfx + shoff; + auto* __restrict__ basis_y_eval = task->dbfy + shoff; + auto* __restrict__ basis_z_eval = task->dbfz + shoff; + auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; + auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; + auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; + auto* __restrict__ basis_yy_eval = task->d2bfyy + shoff; + auto* __restrict__ basis_yz_eval = task->d2bfyz + shoff; + auto* __restrict__ basis_zz_eval = task->d2bfzz + shoff; + auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; + auto* __restrict__ basis_lapl_x_eval = task->d3bflapl_x + shoff; + auto* __restrict__ basis_lapl_y_eval = task->d3bflapl_y + shoff; + auto* __restrict__ basis_lapl_z_eval = task->d3bflapl_z + shoff; + + // Loop over points in task + // Assign each point to separate thread within the warp + #pragma unroll 1 + for( int ipt = 
threadIdx.x % cuda::warp_size; ipt < npts; ipt += cuda::warp_size ) { + //const double3 point = points[ipt]; + double3 point; + point.x = points_x[ipt]; + point.y = points_y[ipt]; + point.z = points_z[ipt]; + + + const auto x = point.x - O.x; + const auto y = point.y - O.y; + const auto z = point.z - O.z; + const auto rsq = x*x + y*y + z*z; + + // Evaluate radial part of bfn + double radial_eval = 0.; + double radial_eval_alpha = 0.; + double radial_eval_alpha_squared = 0.; + double radial_eval_alpha_cubed = 0.; + + #pragma unroll 1 + for( uint32_t i = 0; i < nprim; ++i ) { + const auto a = my_alpha[i]; + const auto e = my_coeff[i] * std::exp( - a * rsq ); + + radial_eval += e; + radial_eval_alpha += a * e; + radial_eval_alpha_squared += a * a * e; + radial_eval_alpha_cubed += a * a * a * e; + } + + radial_eval_alpha *= -2; + radial_eval_alpha_squared *= 4; + radial_eval_alpha_cubed *= -8; + + // Common Subexpressions + const auto x0 = x*x*x; + const auto x1 = radial_eval*y; + const auto x2 = x*x; + const auto x3 = x2; + const auto x4 = radial_eval*z; + const auto x5 = radial_eval*x; + const auto x6 = y*y; + const auto x7 = x6; + const auto x8 = x*z; + const auto x9 = z*z; + const auto x10 = x9; + const auto x11 = y*y*y; + const auto x12 = z*z*z; + const auto x13 = x*x*x*x; + const auto x14 = 3.0*radial_eval; + const auto x15 = radial_eval_alpha*x0 + 2.0*x5; + const auto x16 = radial_eval*x7; + const auto x17 = x3*x7; + const auto x18 = radial_eval_alpha*x17; + const auto x19 = y*z; + const auto x20 = radial_eval_alpha*x3; + const auto x21 = radial_eval + x20; + const auto x22 = radial_eval*x10; + const auto x23 = x10*x3; + const auto x24 = radial_eval_alpha*x23; + const auto x25 = radial_eval_alpha*x; + const auto x26 = x25*x7*z; + const auto x27 = x10*x25*y; + const auto x28 = radial_eval_alpha*y; + const auto x29 = radial_eval*x3; + const auto x30 = radial_eval_alpha*x19*x3; + const auto x31 = radial_eval_alpha*x11 + 2.0*x1; + const auto x32 = radial_eval_alpha*x7; + const auto x33 = radial_eval + x32; + const auto x34 = y*y*y*y; + const auto x35 = x10*x7; + const auto x36 = radial_eval_alpha*x35; + const auto x37 = radial_eval_alpha*z; + const auto x38 = x*y; + const auto x39 = radial_eval_alpha*x10; + const auto x40 = radial_eval_alpha*x12 + 2.0*x4; + const auto x41 = z*z*z*z; + const auto x42 = 6.0*radial_eval_alpha; + const auto x43 = radial_eval_alpha_squared*x3; + const auto x44 = radial_eval_alpha + x43; + const auto x45 = x0*x42 + x0*x44 + 6.0*x5; + const auto x46 = 4.0*radial_eval_alpha; + const auto x47 = 2.0*radial_eval; + const auto x48 = x3*x44; + const auto x49 = x47 + x48; + const auto x50 = x3*x46 + x49; + const auto x51 = 2.0*radial_eval_alpha; + const auto x52 = x51*x7; + const auto x53 = x44*x7; + const auto x54 = x*x19; + const auto x55 = 3.0*radial_eval_alpha; + const auto x56 = x10*x51; + const auto x57 = x10*x44; + const auto x58 = x11*x44; + const auto x59 = x12*x44; + const auto x60 = radial_eval_alpha_squared*x13 + x3*x55; + const auto x61 = 2.0*x25; + const auto x62 = x19*(radial_eval_alpha_squared*x0 + x61); + const auto x63 = 2.0*x28; + const auto x64 = radial_eval_alpha_squared*x17; + const auto x65 = x32 + x64; + const auto x66 = radial_eval_alpha_squared*x23; + const auto x67 = x39 + x66; + const auto x68 = radial_eval_alpha_squared*x34 + x55*x7; + const auto x69 = x8*(radial_eval_alpha_squared*x11 + x63); + const auto x70 = radial_eval_alpha_squared*x35; + const auto x71 = x39 + x70; + const auto x72 = 2.0*x37; + const auto x73 = 
x38*(radial_eval_alpha_squared*x12 + x72); + const auto x74 = radial_eval_alpha_squared*x41 + x10*x55; + const auto x75 = radial_eval_alpha_squared*x7; + const auto x76 = radial_eval_alpha + x75; + const auto x77 = x0*x76; + const auto x78 = x3*x51; + const auto x79 = x3*x76; + const auto x80 = x7*x76; + const auto x81 = x47 + x80; + const auto x82 = x46*x7 + x81; + const auto x83 = x10*x76; + const auto x84 = 6.0*x1 + x11*x42 + x11*x76; + const auto x85 = x12*x76; + const auto x86 = radial_eval_alpha_squared*x10; + const auto x87 = radial_eval_alpha + x86; + const auto x88 = x0*x87; + const auto x89 = x3*x87; + const auto x90 = x7*x87; + const auto x91 = x10*x87; + const auto x92 = x47 + x91; + const auto x93 = x10*x46 + x92; + const auto x94 = x11*x87; + const auto x95 = x12*x42 + x12*x87 + 6.0*x4; + const auto x96 = x3*x42 + x49 + x79 + x89; + const auto x97 = x42*x7 + x53 + x81 + x90; + const auto x98 = x75 + x86; + const auto x99 = x10*x42 + x57 + x83 + x92; + const auto x100 = 6.0*radial_eval; + const auto x101 = 18.0*radial_eval_alpha; + const auto x102 = 3.0*x79; + const auto x103 = 3.0*x89; + const auto x104 = radial_eval_alpha_cubed*x7 + radial_eval_alpha_squared; + const auto x105 = x0*x104; + const auto x106 = radial_eval_alpha_cubed*x10 + radial_eval_alpha_squared; + const auto x107 = x0*x106; + const auto x108 = 3.0*radial_eval_alpha_squared; + const auto x109 = radial_eval_alpha_cubed*x0 + x*x108; + const auto x110 = 2.0*radial_eval_alpha_squared; + const auto x111 = 6.0*x; + const auto x112 = 2.0*x; + const auto x113 = x104*x3; + const auto x114 = x106*x3; + const auto x115 = x*x113 + x*x114 + x0*x110 + x109*x3 + x111*x44 + x112*x76 + x112*x87 + 10.0*x25; + const auto x116 = 3.0*x53; + const auto x117 = x109*x7; + const auto x118 = x104*x7; + const auto x119 = x106*x7; + const auto x120 = 4.0*radial_eval_alpha_squared; + const auto x121 = x120*x17; + const auto x122 = 3.0*x57; + const auto x123 = x10*x109; + const auto x124 = x10*x104; + const auto x125 = x10*x106; + const auto x126 = x120*x23; + const auto x127 = 6.0*y; + const auto x128 = x127*x25; + const auto x129 = radial_eval_alpha_squared*x111; + const auto x130 = x104*x11; + const auto x131 = x106*x11; + const auto x132 = 6.0*z; + const auto x133 = x132*x25; + const auto x134 = x104*x12; + const auto x135 = x106*x12; + const auto x136 = radial_eval_alpha_squared*x127; + const auto x137 = radial_eval_alpha_cubed*x3 + radial_eval_alpha_squared; + const auto x138 = x0*x137; + const auto x139 = radial_eval_alpha_cubed*x11 + x108*y; + const auto x140 = x139*x3; + const auto x141 = x137*x3; + const auto x142 = 2.0*y; + const auto x143 = x137*x7; + const auto x144 = x11*x110 + x119*y + x127*x76 + x139*x7 + x142*x44 + x142*x87 + x143*y + 10.0*x28; + const auto x145 = x42 + x43; + const auto x146 = x10*x137; + const auto x147 = x10*x139; + const auto x148 = 3.0*x90; + const auto x149 = x11*x137; + const auto x150 = 3.0*x83; + const auto x151 = x120*x35; + const auto x152 = x19*x42; + const auto x153 = x12*x137; + const auto x154 = radial_eval_alpha_squared*x132; + const auto x155 = radial_eval_alpha_cubed*x12 + x108*z; + const auto x156 = x155*x3; + const auto x157 = x155*x7; + const auto x158 = 2.0*z; + const auto x159 = x10*x155 + x110*x12 + x124*z + x132*x87 + x146*z + x158*x44 + x158*x76 + 10.0*x37; + + + // Evaluate basis function + basis_eval[ipt + 0*npts] = radial_eval*x0; + basis_eval[ipt + 1*npts] = x1*x3; + basis_eval[ipt + 2*npts] = x3*x4; + basis_eval[ipt + 3*npts] = x5*x7; + basis_eval[ipt + 4*npts] = x1*x8; + 
basis_eval[ipt + 5*npts] = x10*x5; + basis_eval[ipt + 6*npts] = radial_eval*x11; + basis_eval[ipt + 7*npts] = x4*x7; + basis_eval[ipt + 8*npts] = x1*x10; + basis_eval[ipt + 9*npts] = radial_eval*x12; + + + + // Evaluate first derivative of bfn wrt x + basis_x_eval[ipt + 0*npts] = radial_eval_alpha*x13 + x14*x3; + basis_x_eval[ipt + 1*npts] = x15*y; + basis_x_eval[ipt + 2*npts] = x15*z; + basis_x_eval[ipt + 3*npts] = x16 + x18; + basis_x_eval[ipt + 4*npts] = x19*x21; + basis_x_eval[ipt + 5*npts] = x22 + x24; + basis_x_eval[ipt + 6*npts] = x11*x25; + basis_x_eval[ipt + 7*npts] = x26; + basis_x_eval[ipt + 8*npts] = x27; + basis_x_eval[ipt + 9*npts] = x12*x25; + + // Evaluate first derivative of bfn wrt y + basis_y_eval[ipt + 0*npts] = x0*x28; + basis_y_eval[ipt + 1*npts] = x18 + x29; + basis_y_eval[ipt + 2*npts] = x30; + basis_y_eval[ipt + 3*npts] = x*x31; + basis_y_eval[ipt + 4*npts] = x33*x8; + basis_y_eval[ipt + 5*npts] = x27; + basis_y_eval[ipt + 6*npts] = radial_eval_alpha*x34 + x14*x7; + basis_y_eval[ipt + 7*npts] = x31*z; + basis_y_eval[ipt + 8*npts] = x22 + x36; + basis_y_eval[ipt + 9*npts] = x12*x28; + + // Evaluate first derivative of bfn wrt z + basis_z_eval[ipt + 0*npts] = x0*x37; + basis_z_eval[ipt + 1*npts] = x30; + basis_z_eval[ipt + 2*npts] = x24 + x29; + basis_z_eval[ipt + 3*npts] = x26; + basis_z_eval[ipt + 4*npts] = x38*(radial_eval + x39); + basis_z_eval[ipt + 5*npts] = x*x40; + basis_z_eval[ipt + 6*npts] = x11*x37; + basis_z_eval[ipt + 7*npts] = x16 + x36; + basis_z_eval[ipt + 8*npts] = x40*y; + basis_z_eval[ipt + 9*npts] = radial_eval_alpha*x41 + x10*x14; + + // Evaluate second derivative of bfn wrt xx + basis_xx_eval[ipt + 0*npts] = x45; + basis_xx_eval[ipt + 1*npts] = x50*y; + basis_xx_eval[ipt + 2*npts] = x50*z; + basis_xx_eval[ipt + 3*npts] = x*(x52 + x53); + basis_xx_eval[ipt + 4*npts] = x54*(x43 + x55); + basis_xx_eval[ipt + 5*npts] = x*(x56 + x57); + basis_xx_eval[ipt + 6*npts] = x58; + basis_xx_eval[ipt + 7*npts] = x53*z; + basis_xx_eval[ipt + 8*npts] = x57*y; + basis_xx_eval[ipt + 9*npts] = x59; + + // Evaluate second derivative of bfn wrt xy + basis_xy_eval[ipt + 0*npts] = x60*y; + basis_xy_eval[ipt + 1*npts] = radial_eval_alpha_squared*x0*x7 + x15 + x61*x7; + basis_xy_eval[ipt + 2*npts] = x62; + basis_xy_eval[ipt + 3*npts] = radial_eval_alpha_squared*x11*x3 + x3*x63 + x31; + basis_xy_eval[ipt + 4*npts] = z*(x21 + x65); + basis_xy_eval[ipt + 5*npts] = x67*y; + basis_xy_eval[ipt + 6*npts] = x*x68; + basis_xy_eval[ipt + 7*npts] = x69; + basis_xy_eval[ipt + 8*npts] = x*x71; + basis_xy_eval[ipt + 9*npts] = radial_eval_alpha_squared*x12*x38; + + // Evaluate second derivative of bfn wrt xz + basis_xz_eval[ipt + 0*npts] = x60*z; + basis_xz_eval[ipt + 1*npts] = x62; + basis_xz_eval[ipt + 2*npts] = radial_eval_alpha_squared*x0*x10 + x10*x61 + x15; + basis_xz_eval[ipt + 3*npts] = x65*z; + basis_xz_eval[ipt + 4*npts] = y*(x21 + x67); + basis_xz_eval[ipt + 5*npts] = radial_eval_alpha_squared*x12*x3 + x3*x72 + x40; + basis_xz_eval[ipt + 6*npts] = radial_eval_alpha_squared*x11*x8; + basis_xz_eval[ipt + 7*npts] = x*(x32 + x70); + basis_xz_eval[ipt + 8*npts] = x73; + basis_xz_eval[ipt + 9*npts] = x*x74; + + // Evaluate second derivative of bfn wrt yy + basis_yy_eval[ipt + 0*npts] = x77; + basis_yy_eval[ipt + 1*npts] = y*(x78 + x79); + basis_yy_eval[ipt + 2*npts] = x79*z; + basis_yy_eval[ipt + 3*npts] = x*x82; + basis_yy_eval[ipt + 4*npts] = x54*(x55 + x75); + basis_yy_eval[ipt + 5*npts] = x*x83; + basis_yy_eval[ipt + 6*npts] = x84; + basis_yy_eval[ipt + 7*npts] = x82*z; + 
basis_yy_eval[ipt + 8*npts] = y*(x56 + x83); + basis_yy_eval[ipt + 9*npts] = x85; + + // Evaluate second derivative of bfn wrt yz + basis_yz_eval[ipt + 0*npts] = radial_eval_alpha_squared*x0*x19; + basis_yz_eval[ipt + 1*npts] = z*(x20 + x64); + basis_yz_eval[ipt + 2*npts] = y*(x20 + x66); + basis_yz_eval[ipt + 3*npts] = x69; + basis_yz_eval[ipt + 4*npts] = x*(x33 + x71); + basis_yz_eval[ipt + 5*npts] = x73; + basis_yz_eval[ipt + 6*npts] = x68*z; + basis_yz_eval[ipt + 7*npts] = radial_eval_alpha_squared*x10*x11 + x10*x63 + x31; + basis_yz_eval[ipt + 8*npts] = radial_eval_alpha_squared*x12*x7 + x40 + x7*x72; + basis_yz_eval[ipt + 9*npts] = x74*y; + + // Evaluate second derivative of bfn wrt zz + basis_zz_eval[ipt + 0*npts] = x88; + basis_zz_eval[ipt + 1*npts] = x89*y; + basis_zz_eval[ipt + 2*npts] = z*(x78 + x89); + basis_zz_eval[ipt + 3*npts] = x*x90; + basis_zz_eval[ipt + 4*npts] = x54*(x55 + x86); + basis_zz_eval[ipt + 5*npts] = x*x93; + basis_zz_eval[ipt + 6*npts] = x94; + basis_zz_eval[ipt + 7*npts] = z*(x52 + x90); + basis_zz_eval[ipt + 8*npts] = x93*y; + basis_zz_eval[ipt + 9*npts] = x95; + + // Evaluate Laplacian of bfn + basis_lapl_eval[ipt + 0*npts] = x45 + x77 + x88; + basis_lapl_eval[ipt + 1*npts] = x96*y; + basis_lapl_eval[ipt + 2*npts] = x96*z; + basis_lapl_eval[ipt + 3*npts] = x*x97; + basis_lapl_eval[ipt + 4*npts] = x54*(9.0*radial_eval_alpha + x43 + x98); + basis_lapl_eval[ipt + 5*npts] = x*x99; + basis_lapl_eval[ipt + 6*npts] = x58 + x84 + x94; + basis_lapl_eval[ipt + 7*npts] = x97*z; + basis_lapl_eval[ipt + 8*npts] = x99*y; + basis_lapl_eval[ipt + 9*npts] = x59 + x85 + x95; + + // Evaluate Laplacian gradient of bfn (dx) + basis_lapl_x_eval[ipt + 0*npts] = x*x105 + x*x107 + x0*x109 + x100 + x101*x3 + x102 + x103 + 9.0*x48; + basis_lapl_x_eval[ipt + 1*npts] = x115*y; + basis_lapl_x_eval[ipt + 2*npts] = x115*z; + basis_lapl_x_eval[ipt + 3*npts] = x*x117 + x116 + x118*x2 + x119*x2 + x121 + x78 + x82 + x90; + basis_lapl_x_eval[ipt + 4*npts] = x19*(x*x109 + x104*x2 + x106*x2 + x120*x3 + x42 + 3.0*x44 + x98); + basis_lapl_x_eval[ipt + 5*npts] = x*x123 + x122 + x124*x2 + x125*x2 + x126 + x78 + x83 + x93; + basis_lapl_x_eval[ipt + 6*npts] = x*x130 + x*x131 + x109*x11 + x11*x129 + x128; + basis_lapl_x_eval[ipt + 7*npts] = z*(x*x118 + x*x119 + x117 + x129*x7 + x61); + basis_lapl_x_eval[ipt + 8*npts] = y*(x*x124 + x*x125 + x10*x129 + x123 + x61); + basis_lapl_x_eval[ipt + 9*npts] = x*x134 + x*x135 + x109*x12 + x12*x129 + x133; + // Evaluate Laplacian gradient of bfn (dy) + basis_lapl_y_eval[ipt + 0*npts] = x0*x136 + x0*x139 + x107*y + x128 + x138*y; + basis_lapl_y_eval[ipt + 1*npts] = x102 + x114*x6 + x121 + x140*y + x141*x6 + x50 + x52 + x89; + basis_lapl_y_eval[ipt + 2*npts] = z*(x114*y + x136*x3 + x140 + x141*y + x63); + basis_lapl_y_eval[ipt + 3*npts] = x*x144; + basis_lapl_y_eval[ipt + 4*npts] = x8*(x106*x6 + x120*x7 + x137*x6 + x139*y + x145 + 3.0*x76 + x86); + basis_lapl_y_eval[ipt + 5*npts] = x*(x10*x136 + x125*y + x146*y + x147 + x63); + basis_lapl_y_eval[ipt + 6*npts] = x100 + x101*x7 + x11*x139 + x116 + x131*y + x148 + x149*y + 9.0*x80; + basis_lapl_y_eval[ipt + 7*npts] = x144*z; + basis_lapl_y_eval[ipt + 8*npts] = x125*x6 + x146*x6 + x147*y + x150 + x151 + x52 + x57 + x93; + basis_lapl_y_eval[ipt + 9*npts] = x12*x136 + x12*x139 + x135*y + x152 + x153*y; + // Evaluate Laplacian gradient of bfn (dz) + basis_lapl_z_eval[ipt + 0*npts] = x0*x154 + x0*x155 + x105*z + x133 + x138*z; + basis_lapl_z_eval[ipt + 1*npts] = y*(x113*z + x141*z + x154*x3 + x156 + x72); + 
basis_lapl_z_eval[ipt + 2*npts] = x103 + x113*x9 + x126 + x141*x9 + x156*z + x50 + x56 + x79; + basis_lapl_z_eval[ipt + 3*npts] = x*(x118*z + x143*z + x154*x7 + x157 + x72); + basis_lapl_z_eval[ipt + 4*npts] = x38*(x10*x120 + x104*x9 + x137*x9 + x145 + x155*z + x75 + 3.0*x87); + basis_lapl_z_eval[ipt + 5*npts] = x*x159; + basis_lapl_z_eval[ipt + 6*npts] = x11*x154 + x11*x155 + x130*z + x149*z + x152; + basis_lapl_z_eval[ipt + 7*npts] = x118*x9 + x143*x9 + x148 + x151 + x157*z + x53 + x56 + x82; + basis_lapl_z_eval[ipt + 8*npts] = x159*y; + basis_lapl_z_eval[ipt + 9*npts] = x10*x101 + x100 + x12*x155 + x122 + x134*z + x150 + x153*z + 9.0*x91; + + + + +#if 0 + // Evaluate the angular part of bfn + + + + double ang_eval_0; + double ang_eval_1; + double ang_eval_2; + double ang_eval_3; + + + ang_eval_0 = radial_eval*x0; + ang_eval_1 = x1*x3; + ang_eval_2 = x3*x4; + ang_eval_3 = x5*x7; + basis_eval[ipt + 0*npts] = ang_eval_0; + basis_eval[ipt + 1*npts] = ang_eval_1; + basis_eval[ipt + 2*npts] = ang_eval_2; + basis_eval[ipt + 3*npts] = ang_eval_3; + + ang_eval_0 = x1*x8; + ang_eval_1 = x10*x5; + ang_eval_2 = radial_eval*x11; + ang_eval_3 = x4*x7; + basis_eval[ipt + 4*npts] = ang_eval_0; + basis_eval[ipt + 5*npts] = ang_eval_1; + basis_eval[ipt + 6*npts] = ang_eval_2; + basis_eval[ipt + 7*npts] = ang_eval_3; + + ang_eval_0 = x1*x10; + ang_eval_1 = radial_eval*x12; + basis_eval[ipt + 8*npts] = ang_eval_0; + basis_eval[ipt + 9*npts] = ang_eval_1; + + + double dang_eval_x_0, dang_eval_y_0, dang_eval_z_0; + double dang_eval_x_1, dang_eval_y_1, dang_eval_z_1; + double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; + double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; + + dang_eval_x_0 = radial_eval_alpha*x13 + x14*x3; + dang_eval_y_0 = x0*x28; + dang_eval_z_0 = x0*x37; + dang_eval_x_1 = x15*y; + dang_eval_y_1 = x18 + x29; + dang_eval_z_1 = x30; + dang_eval_x_2 = x15*z; + dang_eval_y_2 = x30; + dang_eval_z_2 = x24 + x29; + dang_eval_x_3 = x16 + x18; + dang_eval_y_3 = x*x31; + dang_eval_z_3 = x26; + basis_x_eval[ipt + 0*npts] = dang_eval_x_0; + basis_y_eval[ipt + 0*npts] = dang_eval_y_0; + basis_z_eval[ipt + 0*npts] = dang_eval_z_0; + basis_x_eval[ipt + 1*npts] = dang_eval_x_1; + basis_y_eval[ipt + 1*npts] = dang_eval_y_1; + basis_z_eval[ipt + 1*npts] = dang_eval_z_1; + basis_x_eval[ipt + 2*npts] = dang_eval_x_2; + basis_y_eval[ipt + 2*npts] = dang_eval_y_2; + basis_z_eval[ipt + 2*npts] = dang_eval_z_2; + basis_x_eval[ipt + 3*npts] = dang_eval_x_3; + basis_y_eval[ipt + 3*npts] = dang_eval_y_3; + basis_z_eval[ipt + 3*npts] = dang_eval_z_3; + + dang_eval_x_0 = x19*x21; + dang_eval_y_0 = x33*x8; + dang_eval_z_0 = x38*(radial_eval + x39); + dang_eval_x_1 = x22 + x24; + dang_eval_y_1 = x27; + dang_eval_z_1 = x*x40; + dang_eval_x_2 = x11*x25; + dang_eval_y_2 = radial_eval_alpha*x34 + x14*x7; + dang_eval_z_2 = x11*x37; + dang_eval_x_3 = x26; + dang_eval_y_3 = x31*z; + dang_eval_z_3 = x16 + x36; + basis_x_eval[ipt + 4*npts] = dang_eval_x_0; + basis_y_eval[ipt + 4*npts] = dang_eval_y_0; + basis_z_eval[ipt + 4*npts] = dang_eval_z_0; + basis_x_eval[ipt + 5*npts] = dang_eval_x_1; + basis_y_eval[ipt + 5*npts] = dang_eval_y_1; + basis_z_eval[ipt + 5*npts] = dang_eval_z_1; + basis_x_eval[ipt + 6*npts] = dang_eval_x_2; + basis_y_eval[ipt + 6*npts] = dang_eval_y_2; + basis_z_eval[ipt + 6*npts] = dang_eval_z_2; + basis_x_eval[ipt + 7*npts] = dang_eval_x_3; + basis_y_eval[ipt + 7*npts] = dang_eval_y_3; + basis_z_eval[ipt + 7*npts] = dang_eval_z_3; + + dang_eval_x_0 = x27; + dang_eval_y_0 = x22 + x36; + dang_eval_z_0 = 
x40*y; + dang_eval_x_1 = x12*x25; + dang_eval_y_1 = x12*x28; + dang_eval_z_1 = radial_eval_alpha*x41 + x10*x14; + basis_x_eval[ipt + 8*npts] = dang_eval_x_0; + basis_y_eval[ipt + 8*npts] = dang_eval_y_0; + basis_z_eval[ipt + 8*npts] = dang_eval_z_0; + basis_x_eval[ipt + 9*npts] = dang_eval_x_1; + basis_y_eval[ipt + 9*npts] = dang_eval_y_1; + basis_z_eval[ipt + 9*npts] = dang_eval_z_1; + +#endif + } // Loop over points within task + } // Loop over tasks + + } // Loop over shells +} // end kernel + +} // namespace GauXC diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l3_laplacian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l3_laplacian.hpp index 4811a3fd..2ef57f33 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l3_laplacian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l3_laplacian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_cartesian_laplacian_3( +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_cartesian_laplacian_3( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; // Loop over points in task @@ -103,69 +106,165 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = x*x*x; + const auto x1 = radial_eval*y; + const auto x2 = x*x; + const auto x3 = radial_eval*z; + const auto x4 = radial_eval*x; + const auto x5 = y*y; + const auto x6 = x*z; + const auto x7 = z*z; + const auto x8 = y*y*y; + const auto x9 = z*z*z; + const auto x10 = x*x*x*x; + const auto x11 = 3.0*radial_eval; + const auto x12 = radial_eval_alpha*x0 + 2.0*x4; + const auto x13 = radial_eval*x5; + const auto x14 = x2*x5; + const auto x15 = radial_eval_alpha*x14; + const auto x16 = y*z; + const auto x17 = radial_eval_alpha*x2; + const auto x18 = radial_eval + x17; + const auto x19 = radial_eval*x7; + const auto x20 = x2*x7; + const auto x21 = radial_eval_alpha*x20; + const auto x22 = radial_eval_alpha*x; + const auto x23 = x22*x5*z; + const auto x24 = x22*x7*y; + const auto x25 = 
radial_eval_alpha*y; + const auto x26 = radial_eval*x2; + const auto x27 = radial_eval_alpha*x16*x2; + const auto x28 = radial_eval_alpha*x8 + 2.0*x1; + const auto x29 = radial_eval_alpha*x5; + const auto x30 = radial_eval + x29; + const auto x31 = y*y*y*y; + const auto x32 = x5*x7; + const auto x33 = radial_eval_alpha*x32; + const auto x34 = radial_eval_alpha*z; + const auto x35 = x*y; + const auto x36 = radial_eval_alpha*x7; + const auto x37 = radial_eval_alpha*x9 + 2.0*x3; + const auto x38 = z*z*z*z; + const auto x39 = 6.0*radial_eval_alpha; + const auto x40 = radial_eval_alpha_squared*x2; + const auto x41 = radial_eval_alpha + x40; + const auto x42 = x0*x39 + x0*x41 + 6.0*x4; + const auto x43 = 4.0*radial_eval_alpha; + const auto x44 = 2.0*radial_eval; + const auto x45 = x2*x41 + x44; + const auto x46 = x2*x43 + x45; + const auto x47 = 2.0*radial_eval_alpha; + const auto x48 = x47*x5; + const auto x49 = x41*x5; + const auto x50 = x*x16; + const auto x51 = 3.0*radial_eval_alpha; + const auto x52 = x47*x7; + const auto x53 = x41*x7; + const auto x54 = x41*x8; + const auto x55 = x41*x9; + const auto x56 = radial_eval_alpha_squared*x10 + x2*x51; + const auto x57 = 2.0*x22; + const auto x58 = x16*(radial_eval_alpha_squared*x0 + x57); + const auto x59 = 2.0*x25; + const auto x60 = radial_eval_alpha_squared*x14; + const auto x61 = x29 + x60; + const auto x62 = radial_eval_alpha_squared*x20; + const auto x63 = x36 + x62; + const auto x64 = radial_eval_alpha_squared*x31 + x5*x51; + const auto x65 = x6*(radial_eval_alpha_squared*x8 + x59); + const auto x66 = radial_eval_alpha_squared*x32; + const auto x67 = x36 + x66; + const auto x68 = 2.0*x34; + const auto x69 = x35*(radial_eval_alpha_squared*x9 + x68); + const auto x70 = radial_eval_alpha_squared*x38 + x51*x7; + const auto x71 = radial_eval_alpha_squared*x5; + const auto x72 = radial_eval_alpha + x71; + const auto x73 = x0*x72; + const auto x74 = x2*x47; + const auto x75 = x2*x72; + const auto x76 = x44 + x5*x72; + const auto x77 = x43*x5 + x76; + const auto x78 = x7*x72; + const auto x79 = 6.0*x1 + x39*x8 + x72*x8; + const auto x80 = x72*x9; + const auto x81 = radial_eval_alpha_squared*x7; + const auto x82 = radial_eval_alpha + x81; + const auto x83 = x0*x82; + const auto x84 = x2*x82; + const auto x85 = x5*x82; + const auto x86 = x44 + x7*x82; + const auto x87 = x43*x7 + x86; + const auto x88 = x8*x82; + const auto x89 = 6.0*x3 + x39*x9 + x82*x9; + const auto x90 = x2*x39 + x45 + x75 + x84; + const auto x91 = x39*x5 + x49 + x76 + x85; + const auto x92 = x39*x7 + x53 + x78 + x86; + // Evaluate basis function - basis_eval[ipt + 0*npts] = radial_eval*x*x*x; - basis_eval[ipt + 1*npts] = radial_eval*x*x*y; - basis_eval[ipt + 2*npts] = radial_eval*x*x*z; - basis_eval[ipt + 3*npts] = radial_eval*x*y*y; - basis_eval[ipt + 4*npts] = radial_eval*x*y*z; - basis_eval[ipt + 5*npts] = radial_eval*x*z*z; - basis_eval[ipt + 6*npts] = radial_eval*y*y*y; - basis_eval[ipt + 7*npts] = radial_eval*y*y*z; - basis_eval[ipt + 8*npts] = radial_eval*y*z*z; - basis_eval[ipt + 9*npts] = radial_eval*z*z*z; + basis_eval[ipt + 0*npts] = radial_eval*x0; + basis_eval[ipt + 1*npts] = x1*x2; + basis_eval[ipt + 2*npts] = x2*x3; + basis_eval[ipt + 3*npts] = x4*x5; + basis_eval[ipt + 4*npts] = x1*x6; + basis_eval[ipt + 5*npts] = x4*x7; + basis_eval[ipt + 6*npts] = radial_eval*x8; + basis_eval[ipt + 7*npts] = x3*x5; + basis_eval[ipt + 8*npts] = x1*x7; + basis_eval[ipt + 9*npts] = radial_eval*x9; // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = 
x*x*(3*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 1*npts] = x*y*(2*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 2*npts] = x*z*(2*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 3*npts] = y*y*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 4*npts] = y*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 5*npts] = z*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 6*npts] = radial_eval_alpha*x*y*y*y; - basis_x_eval[ipt + 7*npts] = radial_eval_alpha*x*y*y*z; - basis_x_eval[ipt + 8*npts] = radial_eval_alpha*x*y*z*z; - basis_x_eval[ipt + 9*npts] = radial_eval_alpha*x*z*z*z; + basis_x_eval[ipt + 0*npts] = radial_eval_alpha*x10 + x11*x2; + basis_x_eval[ipt + 1*npts] = x12*y; + basis_x_eval[ipt + 2*npts] = x12*z; + basis_x_eval[ipt + 3*npts] = x13 + x15; + basis_x_eval[ipt + 4*npts] = x16*x18; + basis_x_eval[ipt + 5*npts] = x19 + x21; + basis_x_eval[ipt + 6*npts] = x22*x8; + basis_x_eval[ipt + 7*npts] = x23; + basis_x_eval[ipt + 8*npts] = x24; + basis_x_eval[ipt + 9*npts] = x22*x9; // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = radial_eval_alpha*x*x*x*y; - basis_y_eval[ipt + 1*npts] = x*x*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 2*npts] = radial_eval_alpha*x*x*y*z; - basis_y_eval[ipt + 3*npts] = x*y*(2*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 4*npts] = x*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 5*npts] = radial_eval_alpha*x*y*z*z; - basis_y_eval[ipt + 6*npts] = y*y*(3*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 7*npts] = y*z*(2*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 8*npts] = z*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 9*npts] = radial_eval_alpha*y*z*z*z; + basis_y_eval[ipt + 0*npts] = x0*x25; + basis_y_eval[ipt + 1*npts] = x15 + x26; + basis_y_eval[ipt + 2*npts] = x27; + basis_y_eval[ipt + 3*npts] = x*x28; + basis_y_eval[ipt + 4*npts] = x30*x6; + basis_y_eval[ipt + 5*npts] = x24; + basis_y_eval[ipt + 6*npts] = radial_eval_alpha*x31 + x11*x5; + basis_y_eval[ipt + 7*npts] = x28*z; + basis_y_eval[ipt + 8*npts] = x19 + x33; + basis_y_eval[ipt + 9*npts] = x25*x9; // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = radial_eval_alpha*x*x*x*z; - basis_z_eval[ipt + 1*npts] = radial_eval_alpha*x*x*y*z; - basis_z_eval[ipt + 2*npts] = x*x*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 3*npts] = radial_eval_alpha*x*y*y*z; - basis_z_eval[ipt + 4*npts] = x*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 5*npts] = x*z*(2*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 6*npts] = radial_eval_alpha*y*y*y*z; - basis_z_eval[ipt + 7*npts] = y*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 8*npts] = y*z*(2*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 9*npts] = z*z*(3*radial_eval + radial_eval_alpha*z*z); + basis_z_eval[ipt + 0*npts] = x0*x34; + basis_z_eval[ipt + 1*npts] = x27; + basis_z_eval[ipt + 2*npts] = x21 + x26; + basis_z_eval[ipt + 3*npts] = x23; + basis_z_eval[ipt + 4*npts] = x35*(radial_eval + x36); + basis_z_eval[ipt + 5*npts] = x*x37; + basis_z_eval[ipt + 6*npts] = x34*x8; + basis_z_eval[ipt + 7*npts] = x13 + x33; + basis_z_eval[ipt + 8*npts] = x37*y; + basis_z_eval[ipt + 9*npts] = radial_eval_alpha*x38 + x11*x7; + // Evaluate Laplacian of bfn - basis_lapl_eval[ipt + 0*npts] = x*(6*radial_eval + 9*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x + radial_eval_alpha_squared*x*x*y*y + 
radial_eval_alpha_squared*x*x*z*z); - basis_lapl_eval[ipt + 1*npts] = y*(2*radial_eval + 9*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x + radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*x*x*z*z); - basis_lapl_eval[ipt + 2*npts] = z*(2*radial_eval + 9*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x + radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*x*x*z*z); - basis_lapl_eval[ipt + 3*npts] = x*(2*radial_eval + 9*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*y*y*y*y + radial_eval_alpha_squared*y*y*z*z); - basis_lapl_eval[ipt + 4*npts] = x*y*z*(9*radial_eval_alpha + radial_eval_alpha_squared*x*x + radial_eval_alpha_squared*y*y + radial_eval_alpha_squared*z*z); - basis_lapl_eval[ipt + 5*npts] = x*(2*radial_eval + 9*radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z + radial_eval_alpha_squared*y*y*z*z + radial_eval_alpha_squared*z*z*z*z); - basis_lapl_eval[ipt + 6*npts] = y*(6*radial_eval + 9*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*y*y*y*y + radial_eval_alpha_squared*y*y*z*z); - basis_lapl_eval[ipt + 7*npts] = z*(2*radial_eval + 9*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*y*y*y*y + radial_eval_alpha_squared*y*y*z*z); - basis_lapl_eval[ipt + 8*npts] = y*(2*radial_eval + 9*radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z + radial_eval_alpha_squared*y*y*z*z + radial_eval_alpha_squared*z*z*z*z); - basis_lapl_eval[ipt + 9*npts] = z*(6*radial_eval + 9*radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z + radial_eval_alpha_squared*y*y*z*z + radial_eval_alpha_squared*z*z*z*z); + basis_lapl_eval[ipt + 0*npts] = x42 + x73 + x83; + basis_lapl_eval[ipt + 1*npts] = x90*y; + basis_lapl_eval[ipt + 2*npts] = x90*z; + basis_lapl_eval[ipt + 3*npts] = x*x91; + basis_lapl_eval[ipt + 4*npts] = x50*(9.0*radial_eval_alpha + x40 + x71 + x81); + basis_lapl_eval[ipt + 5*npts] = x*x92; + basis_lapl_eval[ipt + 6*npts] = x54 + x79 + x88; + basis_lapl_eval[ipt + 7*npts] = x91*z; + basis_lapl_eval[ipt + 8*npts] = x92*y; + basis_lapl_eval[ipt + 9*npts] = x55 + x80 + x89; + @@ -181,26 +280,26 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = radial_eval*x*x*x; - ang_eval_1 = radial_eval*x*x*y; - ang_eval_2 = radial_eval*x*x*z; - ang_eval_3 = radial_eval*x*y*y; + ang_eval_0 = radial_eval*x0; + ang_eval_1 = x1*x2; + ang_eval_2 = x2*x3; + ang_eval_3 = x4*x5; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = radial_eval*x*y*z; - ang_eval_1 = radial_eval*x*z*z; - ang_eval_2 = radial_eval*y*y*y; - ang_eval_3 = radial_eval*y*y*z; + ang_eval_0 = x1*x6; + ang_eval_1 = x4*x7; + ang_eval_2 = radial_eval*x8; + ang_eval_3 = x3*x5; basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; basis_eval[ipt + 6*npts] = ang_eval_2; basis_eval[ipt + 7*npts] = ang_eval_3; - ang_eval_0 = radial_eval*y*z*z; - ang_eval_1 = radial_eval*z*z*z; + ang_eval_0 = x1*x7; + ang_eval_1 = radial_eval*x9; basis_eval[ipt + 8*npts] = ang_eval_0; basis_eval[ipt + 9*npts] = ang_eval_1; @@ -210,18 +309,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; - dang_eval_x_0 = x*x*(3*radial_eval + radial_eval_alpha*x*x); - 
dang_eval_y_0 = radial_eval_alpha*x*x*x*y; - dang_eval_z_0 = radial_eval_alpha*x*x*x*z; - dang_eval_x_1 = x*y*(2*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = x*x*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_1 = radial_eval_alpha*x*x*y*z; - dang_eval_x_2 = x*z*(2*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_2 = radial_eval_alpha*x*x*y*z; - dang_eval_z_2 = x*x*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_3 = y*y*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_3 = x*y*(2*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_3 = radial_eval_alpha*x*y*y*z; + dang_eval_x_0 = radial_eval_alpha*x10 + x11*x2; + dang_eval_y_0 = x0*x25; + dang_eval_z_0 = x0*x34; + dang_eval_x_1 = x12*y; + dang_eval_y_1 = x15 + x26; + dang_eval_z_1 = x27; + dang_eval_x_2 = x12*z; + dang_eval_y_2 = x27; + dang_eval_z_2 = x21 + x26; + dang_eval_x_3 = x13 + x15; + dang_eval_y_3 = x*x28; + dang_eval_z_3 = x23; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; @@ -235,18 +334,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = y*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = x*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = x*y*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_1 = z*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = radial_eval_alpha*x*y*z*z; - dang_eval_z_1 = x*z*(2*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = radial_eval_alpha*x*y*y*y; - dang_eval_y_2 = y*y*(3*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_2 = radial_eval_alpha*y*y*y*z; - dang_eval_x_3 = radial_eval_alpha*x*y*y*z; - dang_eval_y_3 = y*z*(2*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_3 = y*y*(radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x16*x18; + dang_eval_y_0 = x30*x6; + dang_eval_z_0 = x35*(radial_eval + x36); + dang_eval_x_1 = x19 + x21; + dang_eval_y_1 = x24; + dang_eval_z_1 = x*x37; + dang_eval_x_2 = x22*x8; + dang_eval_y_2 = radial_eval_alpha*x31 + x11*x5; + dang_eval_z_2 = x34*x8; + dang_eval_x_3 = x23; + dang_eval_y_3 = x28*z; + dang_eval_z_3 = x13 + x33; basis_x_eval[ipt + 4*npts] = dang_eval_x_0; basis_y_eval[ipt + 4*npts] = dang_eval_y_0; basis_z_eval[ipt + 4*npts] = dang_eval_z_0; @@ -260,12 +359,12 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 7*npts] = dang_eval_y_3; basis_z_eval[ipt + 7*npts] = dang_eval_z_3; - dang_eval_x_0 = radial_eval_alpha*x*y*z*z; - dang_eval_y_0 = z*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = y*z*(2*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_1 = radial_eval_alpha*x*z*z*z; - dang_eval_y_1 = radial_eval_alpha*y*z*z*z; - dang_eval_z_1 = z*z*(3*radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x24; + dang_eval_y_0 = x19 + x33; + dang_eval_z_0 = x37*y; + dang_eval_x_1 = x22*x9; + dang_eval_y_1 = x25*x9; + dang_eval_z_1 = radial_eval_alpha*x38 + x11*x7; basis_x_eval[ipt + 8*npts] = dang_eval_x_0; basis_y_eval[ipt + 8*npts] = dang_eval_y_0; basis_z_eval[ipt + 8*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4.hpp index 433ecd3a..65bb118b 100644 --- 
a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -64,7 +68,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_eval = task->bf + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -93,24 +96,34 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel } - + // Common Subexpressions + const auto x0 = radial_eval*y; + const auto x1 = x*x*x; + const auto x2 = radial_eval*z; + const auto x3 = x*x; + const auto x4 = y*y; + const auto x5 = z*z; + const auto x6 = radial_eval*x; + const auto x7 = y*y*y; + const auto x8 = z*z*z; + // Evaluate basis function - basis_eval[ipt + 0*npts] = radial_eval*x*x*x*x; - basis_eval[ipt + 1*npts] = radial_eval*x*x*x*y; - basis_eval[ipt + 2*npts] = radial_eval*x*x*x*z; - basis_eval[ipt + 3*npts] = radial_eval*x*x*y*y; - basis_eval[ipt + 4*npts] = radial_eval*x*x*y*z; - basis_eval[ipt + 5*npts] = radial_eval*x*x*z*z; - basis_eval[ipt + 6*npts] = radial_eval*x*y*y*y; - basis_eval[ipt + 7*npts] = radial_eval*x*y*y*z; - basis_eval[ipt + 8*npts] = radial_eval*x*y*z*z; - basis_eval[ipt + 9*npts] = radial_eval*x*z*z*z; - basis_eval[ipt + 10*npts] = radial_eval*y*y*y*y; - basis_eval[ipt + 11*npts] = radial_eval*y*y*y*z; - basis_eval[ipt + 12*npts] = radial_eval*y*y*z*z; - basis_eval[ipt + 13*npts] = radial_eval*y*z*z*z; - basis_eval[ipt + 14*npts] = radial_eval*z*z*z*z; + basis_eval[ipt + 0*npts] = radial_eval*(x*x*x*x); + basis_eval[ipt + 1*npts] = x0*x1; + basis_eval[ipt + 2*npts] = x1*x2; + basis_eval[ipt + 3*npts] = radial_eval*x3*x4; + basis_eval[ipt + 4*npts] = x0*x3*z; + basis_eval[ipt + 5*npts] = radial_eval*x3*x5; + basis_eval[ipt + 6*npts] = x6*x7; + basis_eval[ipt + 7*npts] = x*x2*x4; + basis_eval[ipt + 8*npts] = x*x0*x5; + basis_eval[ipt + 9*npts] = x6*x8; + basis_eval[ipt + 10*npts] = radial_eval*(y*y*y*y); + basis_eval[ipt + 11*npts] = x2*x7; + basis_eval[ipt + 12*npts] = radial_eval*x4*x5; + basis_eval[ipt + 13*npts] = x0*x8; + basis_eval[ipt + 14*npts] = radial_eval*(z*z*z*z); @@ -119,6 +132,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel + + #if 0 // Evaluate the angular part of bfn @@ -130,36 +145,36 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = radial_eval*x*x*x*x; - ang_eval_1 = radial_eval*x*x*x*y; - ang_eval_2 = radial_eval*x*x*x*z; - ang_eval_3 = radial_eval*x*x*y*y; + ang_eval_0 = radial_eval*(x*x*x*x); + ang_eval_1 = x0*x1; + ang_eval_2 = x1*x2; + ang_eval_3 = radial_eval*x3*x4; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = radial_eval*x*x*y*z; - ang_eval_1 = radial_eval*x*x*z*z; - ang_eval_2 = radial_eval*x*y*y*y; - ang_eval_3 = radial_eval*x*y*y*z; + ang_eval_0 = 
x0*x3*z; + ang_eval_1 = radial_eval*x3*x5; + ang_eval_2 = x6*x7; + ang_eval_3 = x*x2*x4; basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; basis_eval[ipt + 6*npts] = ang_eval_2; basis_eval[ipt + 7*npts] = ang_eval_3; - ang_eval_0 = radial_eval*x*y*z*z; - ang_eval_1 = radial_eval*x*z*z*z; - ang_eval_2 = radial_eval*y*y*y*y; - ang_eval_3 = radial_eval*y*y*y*z; + ang_eval_0 = x*x0*x5; + ang_eval_1 = x6*x8; + ang_eval_2 = radial_eval*(y*y*y*y); + ang_eval_3 = x2*x7; basis_eval[ipt + 8*npts] = ang_eval_0; basis_eval[ipt + 9*npts] = ang_eval_1; basis_eval[ipt + 10*npts] = ang_eval_2; basis_eval[ipt + 11*npts] = ang_eval_3; - ang_eval_0 = radial_eval*y*y*z*z; - ang_eval_1 = radial_eval*y*z*z*z; - ang_eval_2 = radial_eval*z*z*z*z; + ang_eval_0 = radial_eval*x4*x5; + ang_eval_1 = x0*x8; + ang_eval_2 = radial_eval*(z*z*z*z); basis_eval[ipt + 12*npts] = ang_eval_0; basis_eval[ipt + 13*npts] = ang_eval_1; basis_eval[ipt + 14*npts] = ang_eval_2; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_gradient.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_gradient.hpp index 104fdba8..ea90a944 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_gradient.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_gradient.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_cartesian_gradient_4( +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_cartesian_gradient_4( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -67,7 +71,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -99,77 +102,125 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; - + // Common Subexpressions + const auto x0 = x*x*x*x; + const auto x1 = radial_eval*y; + const auto x2 = x*x*x; + const auto x3 = radial_eval*z; + const auto x4 = x*x; + const auto x5 = y*y; + const auto x6 = x4*x5; + const auto x7 = z*z; + const auto x8 = x4*x7; + const auto x9 = radial_eval*x; + const auto x10 = y*y*y; + const auto x11 = z*z*z; + const auto x12 = y*y*y*y; + const auto x13 = x5*x7; + const auto x14 = z*z*z*z; + const auto x15 = 4.0*radial_eval; + const auto x16 = 3.0*radial_eval; + const auto x17 = radial_eval_alpha*x0 + x16*x4; + const auto x18 = 2.0*x9; + const auto x19 = radial_eval_alpha*x2*x5; + const auto x20 = y*z; + const auto x21 = radial_eval_alpha*x2*x7; + const auto x22 = radial_eval*x10; + const auto x23 = radial_eval_alpha*x10*x4; + const auto x24 = radial_eval*x5; + const auto x25 = radial_eval_alpha*x6; + const auto x26 = radial_eval*x7; + const auto x27 = radial_eval_alpha*x8; + const auto x28 = radial_eval*x11; + const auto x29 = radial_eval_alpha*x11*x4; + const auto x30 = radial_eval_alpha*x; + const auto x31 = x10*x30*z; + const auto x32 = x11*x30*y; + const auto x33 = radial_eval_alpha*y; + const auto x34 = radial_eval*x2; + const auto x35 = radial_eval_alpha*x2*x20; + const auto x36 = 2.0*x1; + const auto x37 = radial_eval*x4; + const auto x38 = radial_eval_alpha*x12 + x16*x5; + const auto x39 = radial_eval_alpha*x13; + const auto x40 = radial_eval_alpha*x10*x7; + const auto x41 = radial_eval_alpha*x11*x5; + const auto x42 = radial_eval_alpha*z; + const auto x43 = 2.0*x3; + const auto x44 = radial_eval_alpha*x14 + x16*x7; + // Evaluate basis function - basis_eval[ipt + 0*npts] = radial_eval*x*x*x*x; - basis_eval[ipt + 1*npts] = radial_eval*x*x*x*y; - basis_eval[ipt + 2*npts] = radial_eval*x*x*x*z; - basis_eval[ipt + 3*npts] = radial_eval*x*x*y*y; - basis_eval[ipt + 4*npts] = radial_eval*x*x*y*z; - basis_eval[ipt + 5*npts] = radial_eval*x*x*z*z; - basis_eval[ipt + 6*npts] = radial_eval*x*y*y*y; - basis_eval[ipt + 7*npts] = radial_eval*x*y*y*z; - basis_eval[ipt + 8*npts] = radial_eval*x*y*z*z; - basis_eval[ipt + 9*npts] = radial_eval*x*z*z*z; - basis_eval[ipt + 10*npts] = radial_eval*y*y*y*y; - basis_eval[ipt + 11*npts] = radial_eval*y*y*y*z; - basis_eval[ipt + 12*npts] = radial_eval*y*y*z*z; - basis_eval[ipt + 13*npts] = radial_eval*y*z*z*z; - basis_eval[ipt + 14*npts] = radial_eval*z*z*z*z; + basis_eval[ipt + 0*npts] = 
radial_eval*x0; + basis_eval[ipt + 1*npts] = x1*x2; + basis_eval[ipt + 2*npts] = x2*x3; + basis_eval[ipt + 3*npts] = radial_eval*x6; + basis_eval[ipt + 4*npts] = x1*x4*z; + basis_eval[ipt + 5*npts] = radial_eval*x8; + basis_eval[ipt + 6*npts] = x10*x9; + basis_eval[ipt + 7*npts] = x*x3*x5; + basis_eval[ipt + 8*npts] = x*x1*x7; + basis_eval[ipt + 9*npts] = x11*x9; + basis_eval[ipt + 10*npts] = radial_eval*x12; + basis_eval[ipt + 11*npts] = x10*x3; + basis_eval[ipt + 12*npts] = radial_eval*x13; + basis_eval[ipt + 13*npts] = x1*x11; + basis_eval[ipt + 14*npts] = radial_eval*x14; // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = x*x*x*(4*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 1*npts] = x*x*y*(3*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 2*npts] = x*x*z*(3*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 3*npts] = x*y*y*(2*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 4*npts] = x*y*z*(2*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 5*npts] = x*z*z*(2*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 6*npts] = y*y*y*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 7*npts] = y*y*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 8*npts] = y*z*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 9*npts] = z*z*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 10*npts] = radial_eval_alpha*x*y*y*y*y; - basis_x_eval[ipt + 11*npts] = radial_eval_alpha*x*y*y*y*z; - basis_x_eval[ipt + 12*npts] = radial_eval_alpha*x*y*y*z*z; - basis_x_eval[ipt + 13*npts] = radial_eval_alpha*x*y*z*z*z; - basis_x_eval[ipt + 14*npts] = radial_eval_alpha*x*z*z*z*z; + basis_x_eval[ipt + 0*npts] = radial_eval_alpha*(x*x*x*x*x) + x15*x2; + basis_x_eval[ipt + 1*npts] = x17*y; + basis_x_eval[ipt + 2*npts] = x17*z; + basis_x_eval[ipt + 3*npts] = x18*x5 + x19; + basis_x_eval[ipt + 4*npts] = x20*(radial_eval_alpha*x2 + x18); + basis_x_eval[ipt + 5*npts] = x18*x7 + x21; + basis_x_eval[ipt + 6*npts] = x22 + x23; + basis_x_eval[ipt + 7*npts] = z*(x24 + x25); + basis_x_eval[ipt + 8*npts] = y*(x26 + x27); + basis_x_eval[ipt + 9*npts] = x28 + x29; + basis_x_eval[ipt + 10*npts] = x12*x30; + basis_x_eval[ipt + 11*npts] = x31; + basis_x_eval[ipt + 12*npts] = x13*x30; + basis_x_eval[ipt + 13*npts] = x32; + basis_x_eval[ipt + 14*npts] = x14*x30; // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = radial_eval_alpha*x*x*x*x*y; - basis_y_eval[ipt + 1*npts] = x*x*x*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 2*npts] = radial_eval_alpha*x*x*x*y*z; - basis_y_eval[ipt + 3*npts] = x*x*y*(2*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 4*npts] = x*x*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 5*npts] = radial_eval_alpha*x*x*y*z*z; - basis_y_eval[ipt + 6*npts] = x*y*y*(3*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 7*npts] = x*y*z*(2*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 8*npts] = x*z*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 9*npts] = radial_eval_alpha*x*y*z*z*z; - basis_y_eval[ipt + 10*npts] = y*y*y*(4*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 11*npts] = y*y*z*(3*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 12*npts] = y*z*z*(2*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 13*npts] = z*z*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 14*npts] = radial_eval_alpha*y*z*z*z*z; + basis_y_eval[ipt + 0*npts] = x0*x33; + basis_y_eval[ipt 
+ 1*npts] = x19 + x34; + basis_y_eval[ipt + 2*npts] = x35; + basis_y_eval[ipt + 3*npts] = x23 + x36*x4; + basis_y_eval[ipt + 4*npts] = z*(x25 + x37); + basis_y_eval[ipt + 5*npts] = x33*x8; + basis_y_eval[ipt + 6*npts] = x*x38; + basis_y_eval[ipt + 7*npts] = x*z*(radial_eval_alpha*x10 + x36); + basis_y_eval[ipt + 8*npts] = x*(x26 + x39); + basis_y_eval[ipt + 9*npts] = x32; + basis_y_eval[ipt + 10*npts] = radial_eval_alpha*(y*y*y*y*y) + x10*x15; + basis_y_eval[ipt + 11*npts] = x38*z; + basis_y_eval[ipt + 12*npts] = x36*x7 + x40; + basis_y_eval[ipt + 13*npts] = x28 + x41; + basis_y_eval[ipt + 14*npts] = x14*x33; // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = radial_eval_alpha*x*x*x*x*z; - basis_z_eval[ipt + 1*npts] = radial_eval_alpha*x*x*x*y*z; - basis_z_eval[ipt + 2*npts] = x*x*x*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 3*npts] = radial_eval_alpha*x*x*y*y*z; - basis_z_eval[ipt + 4*npts] = x*x*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 5*npts] = x*x*z*(2*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 6*npts] = radial_eval_alpha*x*y*y*y*z; - basis_z_eval[ipt + 7*npts] = x*y*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 8*npts] = x*y*z*(2*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 9*npts] = x*z*z*(3*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 10*npts] = radial_eval_alpha*y*y*y*y*z; - basis_z_eval[ipt + 11*npts] = y*y*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 12*npts] = y*y*z*(2*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 13*npts] = y*z*z*(3*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 14*npts] = z*z*z*(4*radial_eval + radial_eval_alpha*z*z); + basis_z_eval[ipt + 0*npts] = x0*x42; + basis_z_eval[ipt + 1*npts] = x35; + basis_z_eval[ipt + 2*npts] = x21 + x34; + basis_z_eval[ipt + 3*npts] = x42*x6; + basis_z_eval[ipt + 4*npts] = y*(x27 + x37); + basis_z_eval[ipt + 5*npts] = x29 + x4*x43; + basis_z_eval[ipt + 6*npts] = x31; + basis_z_eval[ipt + 7*npts] = x*(x24 + x39); + basis_z_eval[ipt + 8*npts] = x*y*(radial_eval_alpha*x11 + x43); + basis_z_eval[ipt + 9*npts] = x*x44; + basis_z_eval[ipt + 10*npts] = x12*x42; + basis_z_eval[ipt + 11*npts] = x22 + x40; + basis_z_eval[ipt + 12*npts] = x41 + x43*x5; + basis_z_eval[ipt + 13*npts] = x44*y; + basis_z_eval[ipt + 14*npts] = radial_eval_alpha*(z*z*z*z*z) + x11*x15; + + @@ -186,36 +237,36 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = radial_eval*x*x*x*x; - ang_eval_1 = radial_eval*x*x*x*y; - ang_eval_2 = radial_eval*x*x*x*z; - ang_eval_3 = radial_eval*x*x*y*y; + ang_eval_0 = radial_eval*x0; + ang_eval_1 = x1*x2; + ang_eval_2 = x2*x3; + ang_eval_3 = radial_eval*x6; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = radial_eval*x*x*y*z; - ang_eval_1 = radial_eval*x*x*z*z; - ang_eval_2 = radial_eval*x*y*y*y; - ang_eval_3 = radial_eval*x*y*y*z; + ang_eval_0 = x1*x4*z; + ang_eval_1 = radial_eval*x8; + ang_eval_2 = x10*x9; + ang_eval_3 = x*x3*x5; basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; basis_eval[ipt + 6*npts] = ang_eval_2; basis_eval[ipt + 7*npts] = ang_eval_3; - ang_eval_0 = radial_eval*x*y*z*z; - ang_eval_1 = radial_eval*x*z*z*z; - ang_eval_2 = radial_eval*y*y*y*y; - ang_eval_3 = radial_eval*y*y*y*z; + ang_eval_0 = x*x1*x7; + ang_eval_1 = x11*x9; + ang_eval_2 = 
radial_eval*x12; + ang_eval_3 = x10*x3; basis_eval[ipt + 8*npts] = ang_eval_0; basis_eval[ipt + 9*npts] = ang_eval_1; basis_eval[ipt + 10*npts] = ang_eval_2; basis_eval[ipt + 11*npts] = ang_eval_3; - ang_eval_0 = radial_eval*y*y*z*z; - ang_eval_1 = radial_eval*y*z*z*z; - ang_eval_2 = radial_eval*z*z*z*z; + ang_eval_0 = radial_eval*x13; + ang_eval_1 = x1*x11; + ang_eval_2 = radial_eval*x14; basis_eval[ipt + 12*npts] = ang_eval_0; basis_eval[ipt + 13*npts] = ang_eval_1; basis_eval[ipt + 14*npts] = ang_eval_2; @@ -226,18 +277,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; - dang_eval_x_0 = x*x*x*(4*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = radial_eval_alpha*x*x*x*x*y; - dang_eval_z_0 = radial_eval_alpha*x*x*x*x*z; - dang_eval_x_1 = x*x*y*(3*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = x*x*x*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_1 = radial_eval_alpha*x*x*x*y*z; - dang_eval_x_2 = x*x*z*(3*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_2 = radial_eval_alpha*x*x*x*y*z; - dang_eval_z_2 = x*x*x*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_3 = x*y*y*(2*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_3 = x*x*y*(2*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_3 = radial_eval_alpha*x*x*y*y*z; + dang_eval_x_0 = radial_eval_alpha*(x*x*x*x*x) + x15*x2; + dang_eval_y_0 = x0*x33; + dang_eval_z_0 = x0*x42; + dang_eval_x_1 = x17*y; + dang_eval_y_1 = x19 + x34; + dang_eval_z_1 = x35; + dang_eval_x_2 = x17*z; + dang_eval_y_2 = x35; + dang_eval_z_2 = x21 + x34; + dang_eval_x_3 = x18*x5 + x19; + dang_eval_y_3 = x23 + x36*x4; + dang_eval_z_3 = x42*x6; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; @@ -251,18 +302,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = x*y*z*(2*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = x*x*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = x*x*y*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_1 = x*z*z*(2*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = radial_eval_alpha*x*x*y*z*z; - dang_eval_z_1 = x*x*z*(2*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = y*y*y*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_2 = x*y*y*(3*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_2 = radial_eval_alpha*x*y*y*y*z; - dang_eval_x_3 = y*y*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_3 = x*y*z*(2*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_3 = x*y*y*(radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x20*(radial_eval_alpha*x2 + x18); + dang_eval_y_0 = z*(x25 + x37); + dang_eval_z_0 = y*(x27 + x37); + dang_eval_x_1 = x18*x7 + x21; + dang_eval_y_1 = x33*x8; + dang_eval_z_1 = x29 + x4*x43; + dang_eval_x_2 = x22 + x23; + dang_eval_y_2 = x*x38; + dang_eval_z_2 = x31; + dang_eval_x_3 = z*(x24 + x25); + dang_eval_y_3 = x*z*(radial_eval_alpha*x10 + x36); + dang_eval_z_3 = x*(x24 + x39); basis_x_eval[ipt + 4*npts] = dang_eval_x_0; basis_y_eval[ipt + 4*npts] = dang_eval_y_0; basis_z_eval[ipt + 4*npts] = dang_eval_z_0; @@ -276,18 +327,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 7*npts] = dang_eval_y_3; basis_z_eval[ipt + 7*npts] = dang_eval_z_3; - 
dang_eval_x_0 = y*z*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = x*z*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = x*y*z*(2*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_1 = z*z*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = radial_eval_alpha*x*y*z*z*z; - dang_eval_z_1 = x*z*z*(3*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = radial_eval_alpha*x*y*y*y*y; - dang_eval_y_2 = y*y*y*(4*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_2 = radial_eval_alpha*y*y*y*y*z; - dang_eval_x_3 = radial_eval_alpha*x*y*y*y*z; - dang_eval_y_3 = y*y*z*(3*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_3 = y*y*y*(radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = y*(x26 + x27); + dang_eval_y_0 = x*(x26 + x39); + dang_eval_z_0 = x*y*(radial_eval_alpha*x11 + x43); + dang_eval_x_1 = x28 + x29; + dang_eval_y_1 = x32; + dang_eval_z_1 = x*x44; + dang_eval_x_2 = x12*x30; + dang_eval_y_2 = radial_eval_alpha*(y*y*y*y*y) + x10*x15; + dang_eval_z_2 = x12*x42; + dang_eval_x_3 = x31; + dang_eval_y_3 = x38*z; + dang_eval_z_3 = x22 + x40; basis_x_eval[ipt + 8*npts] = dang_eval_x_0; basis_y_eval[ipt + 8*npts] = dang_eval_y_0; basis_z_eval[ipt + 8*npts] = dang_eval_z_0; @@ -301,15 +352,15 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 11*npts] = dang_eval_y_3; basis_z_eval[ipt + 11*npts] = dang_eval_z_3; - dang_eval_x_0 = radial_eval_alpha*x*y*y*z*z; - dang_eval_y_0 = y*z*z*(2*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = y*y*z*(2*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_1 = radial_eval_alpha*x*y*z*z*z; - dang_eval_y_1 = z*z*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_1 = y*z*z*(3*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = radial_eval_alpha*x*z*z*z*z; - dang_eval_y_2 = radial_eval_alpha*y*z*z*z*z; - dang_eval_z_2 = z*z*z*(4*radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x13*x30; + dang_eval_y_0 = x36*x7 + x40; + dang_eval_z_0 = x41 + x43*x5; + dang_eval_x_1 = x32; + dang_eval_y_1 = x28 + x41; + dang_eval_z_1 = x44*y; + dang_eval_x_2 = x14*x30; + dang_eval_y_2 = x14*x33; + dang_eval_z_2 = radial_eval_alpha*(z*z*z*z*z) + x11*x15; basis_x_eval[ipt + 12*npts] = dang_eval_x_0; basis_y_eval[ipt + 12*npts] = dang_eval_y_0; basis_z_eval[ipt + 12*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_hessian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_hessian.hpp index 1a9958bd..99c58bbc 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_hessian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_hessian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_cartesian_hessian_4( +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_cartesian_hessian_4( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; @@ -108,179 +111,340 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = x*x*x*x; + const auto x1 = radial_eval*y; + const auto x2 = x*x*x; + const auto x3 = radial_eval*z; + const auto x4 = x*x; + const auto x5 = y*y; + const auto x6 = x4*x5; + const auto x7 = x1*z; + const auto x8 = z*z; + const auto x9 = x4*x8; + const auto x10 = radial_eval*x; + const auto x11 = y*y*y; + const auto x12 = x*x3; + const auto x13 = x*x1; + const auto x14 = z*z*z; + const auto x15 = y*y*y*y; + const auto x16 = x5*x8; + const auto x17 = z*z*z*z; + const auto x18 = x*x*x*x*x; + const auto x19 = 4.0*radial_eval; + const auto x20 = 3.0*radial_eval; + const auto x21 = radial_eval_alpha*x0 + x20*x4; + const auto x22 = 2.0*x10; + const auto x23 = x2*x5; + const auto x24 = radial_eval_alpha*x23; + const auto x25 = y*z; + const auto x26 = radial_eval_alpha*x2; + const auto x27 = x22 + x26; + const auto x28 = x2*x8; + const auto x29 = radial_eval_alpha*x28; + const auto x30 = radial_eval*x11; + const auto x31 = x11*x4; + const auto x32 = radial_eval_alpha*x31; + const auto x33 = radial_eval*x5; + const auto x34 = radial_eval_alpha*x6; + const auto x35 = x33 + x34; + const auto x36 = radial_eval*x8; + const auto x37 = radial_eval_alpha*x9; + const auto x38 = x36 + x37; + const auto x39 = radial_eval*x14; + const auto x40 = x14*x4; + const auto x41 = radial_eval_alpha*x40; + const auto x42 = radial_eval_alpha*x; + const auto x43 = x11*x42*z; + const auto x44 = x14*x42*y; + const auto x45 = radial_eval_alpha*y; + const auto x46 = radial_eval*x2; + const auto x47 = radial_eval_alpha*x2*x25; + const auto x48 = 2.0*x1; + const auto x49 = radial_eval*x4; + const auto x50 = x34 + x49; + const auto x51 = radial_eval_alpha*x15 + x20*x5; + const auto x52 = x*z; + const auto x53 = radial_eval_alpha*x11; + const auto x54 = x48 + x53; + const auto x55 = radial_eval_alpha*x16; + const auto x56 = y*y*y*y*y; + const auto x57 = x11*x8; + const auto x58 = radial_eval_alpha*x57; + const auto x59 = x14*x5; + const auto x60 = radial_eval_alpha*x59; + const auto x61 = radial_eval_alpha*z; + const auto x62 = 2.0*x3; + const auto x63 = x*y; + const auto x64 = radial_eval_alpha*x14; + const auto x65 = x62 + x64; + const auto x66 = radial_eval_alpha*x17 + x20*x8; + const auto x67 = 
z*z*z*z*z; + const auto x68 = 12.0*radial_eval; + const auto x69 = 8.0*radial_eval_alpha; + const auto x70 = radial_eval_alpha + radial_eval_alpha_squared*x4; + const auto x71 = x0*x69 + x0*x70 + x4*x68; + const auto x72 = 6.0*radial_eval_alpha; + const auto x73 = 6.0*x10 + x2*x70; + const auto x74 = x2*x72 + x73; + const auto x75 = 4.0*radial_eval_alpha; + const auto x76 = x6*x75; + const auto x77 = 2.0*radial_eval; + const auto x78 = x5*x77; + const auto x79 = x4*x5*x70 + x78; + const auto x80 = x4*x70 + x77; + const auto x81 = x75*x9; + const auto x82 = x77*x8; + const auto x83 = x4*x70*x8 + x82; + const auto x84 = 2.0*radial_eval_alpha; + const auto x85 = x11*x84; + const auto x86 = x11*x70; + const auto x87 = x5*x84; + const auto x88 = x5*x70; + const auto x89 = x8*x84; + const auto x90 = x70*x8; + const auto x91 = x14*x84; + const auto x92 = x14*x70; + const auto x93 = x15*x70; + const auto x94 = x5*x70*x8; + const auto x95 = x17*x70; + const auto x96 = radial_eval_alpha_squared*x18 + x2*x75; + const auto x97 = 3.0*radial_eval_alpha; + const auto x98 = x6*x97; + const auto x99 = x25*(radial_eval_alpha_squared*x0 + x4*x97); + const auto x100 = 2.0*x42; + const auto x101 = 2.0*x45; + const auto x102 = radial_eval_alpha_squared*x23; + const auto x103 = x100*x5 + x102; + const auto x104 = radial_eval_alpha_squared*x28; + const auto x105 = x100*x8 + x104; + const auto x106 = radial_eval_alpha_squared*x31; + const auto x107 = x101*x4 + x106; + const auto x108 = radial_eval_alpha_squared*x4*x5*x8; + const auto x109 = x108 + x55; + const auto x110 = radial_eval_alpha_squared*x40; + const auto x111 = radial_eval_alpha_squared*x56 + x11*x75; + const auto x112 = x52*(radial_eval_alpha_squared*x15 + x5*x97); + const auto x113 = radial_eval_alpha_squared*x57; + const auto x114 = x101*x8 + x113; + const auto x115 = radial_eval_alpha_squared*x59; + const auto x116 = x9*x97; + const auto x117 = 2.0*x61; + const auto x118 = x110 + x117*x4; + const auto x119 = x115 + x117*x5; + const auto x120 = x63*(radial_eval_alpha_squared*x17 + x8*x97); + const auto x121 = radial_eval_alpha_squared*x67 + x14*x75; + const auto x122 = radial_eval_alpha + radial_eval_alpha_squared*x5; + const auto x123 = x0*x122; + const auto x124 = x2*x84; + const auto x125 = x122*x2; + const auto x126 = x4*x77; + const auto x127 = x122*x4*x5 + x126; + const auto x128 = x4*x84; + const auto x129 = x122*x4; + const auto x130 = x122*x4*x8; + const auto x131 = 6.0*x1 + x11*x122; + const auto x132 = x11*x72 + x131; + const auto x133 = x122*x5 + x77; + const auto x134 = x122*x8; + const auto x135 = x122*x14; + const auto x136 = x122*x15 + x15*x69 + x5*x68; + const auto x137 = x16*x75; + const auto x138 = x122*x5*x8 + x82; + const auto x139 = x122*x17; + const auto x140 = x16*x97; + const auto x141 = radial_eval_alpha + radial_eval_alpha_squared*x8; + const auto x142 = x0*x141; + const auto x143 = x141*x2; + const auto x144 = x141*x4*x5; + const auto x145 = x141*x4; + const auto x146 = x126 + x141*x4*x8; + const auto x147 = x11*x141; + const auto x148 = x141*x5; + const auto x149 = x141*x8 + x77; + const auto x150 = x14*x141 + 6.0*x3; + const auto x151 = x14*x72 + x150; + const auto x152 = x141*x15; + const auto x153 = x141*x5*x8 + x78; + const auto x154 = x141*x17 + x17*x69 + x68*x8; + const auto x155 = x125 + x143 + x2*x69 + x73; + const auto x156 = x11*x69 + x131 + x147 + x86; + const auto x157 = x135 + x14*x69 + x150 + x92; + // Evaluate basis function - basis_eval[ipt + 0*npts] = radial_eval*x*x*x*x; - basis_eval[ipt + 1*npts] = 
radial_eval*x*x*x*y; - basis_eval[ipt + 2*npts] = radial_eval*x*x*x*z; - basis_eval[ipt + 3*npts] = radial_eval*x*x*y*y; - basis_eval[ipt + 4*npts] = radial_eval*x*x*y*z; - basis_eval[ipt + 5*npts] = radial_eval*x*x*z*z; - basis_eval[ipt + 6*npts] = radial_eval*x*y*y*y; - basis_eval[ipt + 7*npts] = radial_eval*x*y*y*z; - basis_eval[ipt + 8*npts] = radial_eval*x*y*z*z; - basis_eval[ipt + 9*npts] = radial_eval*x*z*z*z; - basis_eval[ipt + 10*npts] = radial_eval*y*y*y*y; - basis_eval[ipt + 11*npts] = radial_eval*y*y*y*z; - basis_eval[ipt + 12*npts] = radial_eval*y*y*z*z; - basis_eval[ipt + 13*npts] = radial_eval*y*z*z*z; - basis_eval[ipt + 14*npts] = radial_eval*z*z*z*z; + basis_eval[ipt + 0*npts] = radial_eval*x0; + basis_eval[ipt + 1*npts] = x1*x2; + basis_eval[ipt + 2*npts] = x2*x3; + basis_eval[ipt + 3*npts] = radial_eval*x6; + basis_eval[ipt + 4*npts] = x4*x7; + basis_eval[ipt + 5*npts] = radial_eval*x9; + basis_eval[ipt + 6*npts] = x10*x11; + basis_eval[ipt + 7*npts] = x12*x5; + basis_eval[ipt + 8*npts] = x13*x8; + basis_eval[ipt + 9*npts] = x10*x14; + basis_eval[ipt + 10*npts] = radial_eval*x15; + basis_eval[ipt + 11*npts] = x11*x3; + basis_eval[ipt + 12*npts] = radial_eval*x16; + basis_eval[ipt + 13*npts] = x1*x14; + basis_eval[ipt + 14*npts] = radial_eval*x17; // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = x*x*x*(4*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 1*npts] = x*x*y*(3*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 2*npts] = x*x*z*(3*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 3*npts] = x*y*y*(2*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 4*npts] = x*y*z*(2*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 5*npts] = x*z*z*(2*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 6*npts] = y*y*y*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 7*npts] = y*y*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 8*npts] = y*z*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 9*npts] = z*z*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 10*npts] = radial_eval_alpha*x*y*y*y*y; - basis_x_eval[ipt + 11*npts] = radial_eval_alpha*x*y*y*y*z; - basis_x_eval[ipt + 12*npts] = radial_eval_alpha*x*y*y*z*z; - basis_x_eval[ipt + 13*npts] = radial_eval_alpha*x*y*z*z*z; - basis_x_eval[ipt + 14*npts] = radial_eval_alpha*x*z*z*z*z; + basis_x_eval[ipt + 0*npts] = radial_eval_alpha*x18 + x19*x2; + basis_x_eval[ipt + 1*npts] = x21*y; + basis_x_eval[ipt + 2*npts] = x21*z; + basis_x_eval[ipt + 3*npts] = x22*x5 + x24; + basis_x_eval[ipt + 4*npts] = x25*x27; + basis_x_eval[ipt + 5*npts] = x22*x8 + x29; + basis_x_eval[ipt + 6*npts] = x30 + x32; + basis_x_eval[ipt + 7*npts] = x35*z; + basis_x_eval[ipt + 8*npts] = x38*y; + basis_x_eval[ipt + 9*npts] = x39 + x41; + basis_x_eval[ipt + 10*npts] = x15*x42; + basis_x_eval[ipt + 11*npts] = x43; + basis_x_eval[ipt + 12*npts] = x16*x42; + basis_x_eval[ipt + 13*npts] = x44; + basis_x_eval[ipt + 14*npts] = x17*x42; // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = radial_eval_alpha*x*x*x*x*y; - basis_y_eval[ipt + 1*npts] = x*x*x*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 2*npts] = radial_eval_alpha*x*x*x*y*z; - basis_y_eval[ipt + 3*npts] = x*x*y*(2*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 4*npts] = x*x*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 5*npts] = radial_eval_alpha*x*x*y*z*z; - basis_y_eval[ipt + 6*npts] = x*y*y*(3*radial_eval + 
radial_eval_alpha*y*y); - basis_y_eval[ipt + 7*npts] = x*y*z*(2*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 8*npts] = x*z*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 9*npts] = radial_eval_alpha*x*y*z*z*z; - basis_y_eval[ipt + 10*npts] = y*y*y*(4*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 11*npts] = y*y*z*(3*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 12*npts] = y*z*z*(2*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 13*npts] = z*z*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 14*npts] = radial_eval_alpha*y*z*z*z*z; + basis_y_eval[ipt + 0*npts] = x0*x45; + basis_y_eval[ipt + 1*npts] = x24 + x46; + basis_y_eval[ipt + 2*npts] = x47; + basis_y_eval[ipt + 3*npts] = x32 + x4*x48; + basis_y_eval[ipt + 4*npts] = x50*z; + basis_y_eval[ipt + 5*npts] = x45*x9; + basis_y_eval[ipt + 6*npts] = x*x51; + basis_y_eval[ipt + 7*npts] = x52*x54; + basis_y_eval[ipt + 8*npts] = x*(x36 + x55); + basis_y_eval[ipt + 9*npts] = x44; + basis_y_eval[ipt + 10*npts] = radial_eval_alpha*x56 + x11*x19; + basis_y_eval[ipt + 11*npts] = x51*z; + basis_y_eval[ipt + 12*npts] = x48*x8 + x58; + basis_y_eval[ipt + 13*npts] = x39 + x60; + basis_y_eval[ipt + 14*npts] = x17*x45; // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = radial_eval_alpha*x*x*x*x*z; - basis_z_eval[ipt + 1*npts] = radial_eval_alpha*x*x*x*y*z; - basis_z_eval[ipt + 2*npts] = x*x*x*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 3*npts] = radial_eval_alpha*x*x*y*y*z; - basis_z_eval[ipt + 4*npts] = x*x*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 5*npts] = x*x*z*(2*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 6*npts] = radial_eval_alpha*x*y*y*y*z; - basis_z_eval[ipt + 7*npts] = x*y*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 8*npts] = x*y*z*(2*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 9*npts] = x*z*z*(3*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 10*npts] = radial_eval_alpha*y*y*y*y*z; - basis_z_eval[ipt + 11*npts] = y*y*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 12*npts] = y*y*z*(2*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 13*npts] = y*z*z*(3*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 14*npts] = z*z*z*(4*radial_eval + radial_eval_alpha*z*z); + basis_z_eval[ipt + 0*npts] = x0*x61; + basis_z_eval[ipt + 1*npts] = x47; + basis_z_eval[ipt + 2*npts] = x29 + x46; + basis_z_eval[ipt + 3*npts] = x6*x61; + basis_z_eval[ipt + 4*npts] = y*(x37 + x49); + basis_z_eval[ipt + 5*npts] = x4*x62 + x41; + basis_z_eval[ipt + 6*npts] = x43; + basis_z_eval[ipt + 7*npts] = x*(x33 + x55); + basis_z_eval[ipt + 8*npts] = x63*x65; + basis_z_eval[ipt + 9*npts] = x*x66; + basis_z_eval[ipt + 10*npts] = x15*x61; + basis_z_eval[ipt + 11*npts] = x30 + x58; + basis_z_eval[ipt + 12*npts] = x5*x62 + x60; + basis_z_eval[ipt + 13*npts] = x66*y; + basis_z_eval[ipt + 14*npts] = radial_eval_alpha*x67 + x14*x19; // Evaluate second derivative of bfn wrt xx - basis_xx_eval[ipt + 0*npts] = x*x*(12*radial_eval + 9*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x); - basis_xx_eval[ipt + 1*npts] = x*y*(6*radial_eval + 7*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x); - basis_xx_eval[ipt + 2*npts] = x*z*(6*radial_eval + 7*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x); - basis_xx_eval[ipt + 3*npts] = y*y*(2*radial_eval + 5*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x); - basis_xx_eval[ipt + 4*npts] = 
y*z*(2*radial_eval + 5*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x); - basis_xx_eval[ipt + 5*npts] = z*z*(2*radial_eval + 5*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x); - basis_xx_eval[ipt + 6*npts] = x*y*y*y*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 7*npts] = x*y*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 8*npts] = x*y*z*z*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 9*npts] = x*z*z*z*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 10*npts] = y*y*y*y*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 11*npts] = y*y*y*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 12*npts] = y*y*z*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 13*npts] = y*z*z*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 14*npts] = z*z*z*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xx_eval[ipt + 0*npts] = x71; + basis_xx_eval[ipt + 1*npts] = x74*y; + basis_xx_eval[ipt + 2*npts] = x74*z; + basis_xx_eval[ipt + 3*npts] = x76 + x79; + basis_xx_eval[ipt + 4*npts] = x25*(x4*x75 + x80); + basis_xx_eval[ipt + 5*npts] = x81 + x83; + basis_xx_eval[ipt + 6*npts] = x*(x85 + x86); + basis_xx_eval[ipt + 7*npts] = x52*(x87 + x88); + basis_xx_eval[ipt + 8*npts] = x63*(x89 + x90); + basis_xx_eval[ipt + 9*npts] = x*(x91 + x92); + basis_xx_eval[ipt + 10*npts] = x93; + basis_xx_eval[ipt + 11*npts] = x86*z; + basis_xx_eval[ipt + 12*npts] = x94; + basis_xx_eval[ipt + 13*npts] = x92*y; + basis_xx_eval[ipt + 14*npts] = x95; // Evaluate second derivative of bfn wrt xy - basis_xy_eval[ipt + 0*npts] = x*x*x*y*(4*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xy_eval[ipt + 1*npts] = x*x*(3*radial_eval + radial_eval_alpha*x*x + 3*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y); - basis_xy_eval[ipt + 2*npts] = x*x*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xy_eval[ipt + 3*npts] = x*y*(4*radial_eval + 2*radial_eval_alpha*x*x + 2*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y); - basis_xy_eval[ipt + 4*npts] = x*z*(2*radial_eval + radial_eval_alpha*x*x + 2*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y); - basis_xy_eval[ipt + 5*npts] = x*y*z*z*(2*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xy_eval[ipt + 6*npts] = y*y*(3*radial_eval + 3*radial_eval_alpha*x*x + radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y); - basis_xy_eval[ipt + 7*npts] = y*z*(2*radial_eval + 2*radial_eval_alpha*x*x + radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y); - basis_xy_eval[ipt + 8*npts] = z*z*(radial_eval + radial_eval_alpha*x*x + radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y); - basis_xy_eval[ipt + 9*npts] = y*z*z*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xy_eval[ipt + 10*npts] = x*y*y*y*(4*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_xy_eval[ipt + 11*npts] = x*y*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_xy_eval[ipt + 12*npts] = x*y*z*z*(2*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_xy_eval[ipt + 13*npts] = x*z*z*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_xy_eval[ipt + 14*npts] = radial_eval_alpha_squared*x*y*z*z*z*z; + basis_xy_eval[ipt + 0*npts] = x96*y; + basis_xy_eval[ipt + 1*npts] = radial_eval_alpha_squared*x0*x5 + x21 + x98; + basis_xy_eval[ipt + 2*npts] = x99; + 
basis_xy_eval[ipt + 3*npts] = radial_eval_alpha_squared*x11*x2 + x100*x11 + x101*x2 + 4.0*x13; + basis_xy_eval[ipt + 4*npts] = z*(x103 + x27); + basis_xy_eval[ipt + 5*npts] = x105*y; + basis_xy_eval[ipt + 6*npts] = radial_eval_alpha_squared*x15*x4 + x51 + x98; + basis_xy_eval[ipt + 7*npts] = z*(x107 + x54); + basis_xy_eval[ipt + 8*npts] = x109 + x38; + basis_xy_eval[ipt + 9*npts] = y*(x110 + x64); + basis_xy_eval[ipt + 10*npts] = x*x111; + basis_xy_eval[ipt + 11*npts] = x112; + basis_xy_eval[ipt + 12*npts] = x*x114; + basis_xy_eval[ipt + 13*npts] = x*(x115 + x64); + basis_xy_eval[ipt + 14*npts] = radial_eval_alpha_squared*x17*x63; // Evaluate second derivative of bfn wrt xz - basis_xz_eval[ipt + 0*npts] = x*x*x*z*(4*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xz_eval[ipt + 1*npts] = x*x*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xz_eval[ipt + 2*npts] = x*x*(3*radial_eval + radial_eval_alpha*x*x + 3*radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z); - basis_xz_eval[ipt + 3*npts] = x*y*y*z*(2*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xz_eval[ipt + 4*npts] = x*y*(2*radial_eval + radial_eval_alpha*x*x + 2*radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z); - basis_xz_eval[ipt + 5*npts] = x*z*(4*radial_eval + 2*radial_eval_alpha*x*x + 2*radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z); - basis_xz_eval[ipt + 6*npts] = y*y*y*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xz_eval[ipt + 7*npts] = y*y*(radial_eval + radial_eval_alpha*x*x + radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z); - basis_xz_eval[ipt + 8*npts] = y*z*(2*radial_eval + 2*radial_eval_alpha*x*x + radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z); - basis_xz_eval[ipt + 9*npts] = z*z*(3*radial_eval + 3*radial_eval_alpha*x*x + radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z); - basis_xz_eval[ipt + 10*npts] = radial_eval_alpha_squared*x*y*y*y*y*z; - basis_xz_eval[ipt + 11*npts] = x*y*y*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_xz_eval[ipt + 12*npts] = x*y*y*z*(2*radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_xz_eval[ipt + 13*npts] = x*y*z*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_xz_eval[ipt + 14*npts] = x*z*z*z*(4*radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_xz_eval[ipt + 0*npts] = x96*z; + basis_xz_eval[ipt + 1*npts] = x99; + basis_xz_eval[ipt + 2*npts] = radial_eval_alpha_squared*x0*x8 + x116 + x21; + basis_xz_eval[ipt + 3*npts] = x103*z; + basis_xz_eval[ipt + 4*npts] = y*(x105 + x27); + basis_xz_eval[ipt + 5*npts] = radial_eval_alpha_squared*x14*x2 + x100*x14 + x117*x2 + 4.0*x12; + basis_xz_eval[ipt + 6*npts] = z*(x106 + x53); + basis_xz_eval[ipt + 7*npts] = x109 + x35; + basis_xz_eval[ipt + 8*npts] = y*(x118 + x65); + basis_xz_eval[ipt + 9*npts] = radial_eval_alpha_squared*x17*x4 + x116 + x66; + basis_xz_eval[ipt + 10*npts] = radial_eval_alpha_squared*x15*x52; + basis_xz_eval[ipt + 11*npts] = x*(x113 + x53); + basis_xz_eval[ipt + 12*npts] = x*x119; + basis_xz_eval[ipt + 13*npts] = x120; + basis_xz_eval[ipt + 14*npts] = x*x121; // Evaluate second derivative of bfn wrt yy - basis_yy_eval[ipt + 0*npts] = x*x*x*x*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 1*npts] = x*x*x*y*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 2*npts] = x*x*x*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 3*npts] = x*x*(2*radial_eval + 5*radial_eval_alpha*y*y + 
radial_eval_alpha_squared*y*y*y*y); - basis_yy_eval[ipt + 4*npts] = x*x*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 5*npts] = x*x*z*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 6*npts] = x*y*(6*radial_eval + 7*radial_eval_alpha*y*y + radial_eval_alpha_squared*y*y*y*y); - basis_yy_eval[ipt + 7*npts] = x*z*(2*radial_eval + 5*radial_eval_alpha*y*y + radial_eval_alpha_squared*y*y*y*y); - basis_yy_eval[ipt + 8*npts] = x*y*z*z*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 9*npts] = x*z*z*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 10*npts] = y*y*(12*radial_eval + 9*radial_eval_alpha*y*y + radial_eval_alpha_squared*y*y*y*y); - basis_yy_eval[ipt + 11*npts] = y*z*(6*radial_eval + 7*radial_eval_alpha*y*y + radial_eval_alpha_squared*y*y*y*y); - basis_yy_eval[ipt + 12*npts] = z*z*(2*radial_eval + 5*radial_eval_alpha*y*y + radial_eval_alpha_squared*y*y*y*y); - basis_yy_eval[ipt + 13*npts] = y*z*z*z*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 14*npts] = z*z*z*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_yy_eval[ipt + 0*npts] = x123; + basis_yy_eval[ipt + 1*npts] = y*(x124 + x125); + basis_yy_eval[ipt + 2*npts] = x125*z; + basis_yy_eval[ipt + 3*npts] = x127 + x76; + basis_yy_eval[ipt + 4*npts] = x25*(x128 + x129); + basis_yy_eval[ipt + 5*npts] = x130; + basis_yy_eval[ipt + 6*npts] = x*x132; + basis_yy_eval[ipt + 7*npts] = x52*(x133 + x5*x75); + basis_yy_eval[ipt + 8*npts] = x63*(x134 + x89); + basis_yy_eval[ipt + 9*npts] = x*x135; + basis_yy_eval[ipt + 10*npts] = x136; + basis_yy_eval[ipt + 11*npts] = x132*z; + basis_yy_eval[ipt + 12*npts] = x137 + x138; + basis_yy_eval[ipt + 13*npts] = y*(x135 + x91); + basis_yy_eval[ipt + 14*npts] = x139; // Evaluate second derivative of bfn wrt yz - basis_yz_eval[ipt + 0*npts] = radial_eval_alpha_squared*x*x*x*x*y*z; - basis_yz_eval[ipt + 1*npts] = x*x*x*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yz_eval[ipt + 2*npts] = x*x*x*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_yz_eval[ipt + 3*npts] = x*x*y*z*(2*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yz_eval[ipt + 4*npts] = x*x*(radial_eval + radial_eval_alpha*y*y + radial_eval_alpha*z*z + radial_eval_alpha_squared*y*y*z*z); - basis_yz_eval[ipt + 5*npts] = x*x*y*z*(2*radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_yz_eval[ipt + 6*npts] = x*y*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yz_eval[ipt + 7*npts] = x*y*(2*radial_eval + radial_eval_alpha*y*y + 2*radial_eval_alpha*z*z + radial_eval_alpha_squared*y*y*z*z); - basis_yz_eval[ipt + 8*npts] = x*z*(2*radial_eval + 2*radial_eval_alpha*y*y + radial_eval_alpha*z*z + radial_eval_alpha_squared*y*y*z*z); - basis_yz_eval[ipt + 9*npts] = x*y*z*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_yz_eval[ipt + 10*npts] = y*y*y*z*(4*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yz_eval[ipt + 11*npts] = y*y*(3*radial_eval + radial_eval_alpha*y*y + 3*radial_eval_alpha*z*z + radial_eval_alpha_squared*y*y*z*z); - basis_yz_eval[ipt + 12*npts] = y*z*(4*radial_eval + 2*radial_eval_alpha*y*y + 2*radial_eval_alpha*z*z + radial_eval_alpha_squared*y*y*z*z); - basis_yz_eval[ipt + 13*npts] = z*z*(3*radial_eval + 3*radial_eval_alpha*y*y + radial_eval_alpha*z*z + radial_eval_alpha_squared*y*y*z*z); - basis_yz_eval[ipt + 14*npts] = y*z*z*z*(4*radial_eval_alpha + radial_eval_alpha_squared*z*z); + 
basis_yz_eval[ipt + 0*npts] = radial_eval_alpha_squared*x0*x25; + basis_yz_eval[ipt + 1*npts] = z*(x102 + x26); + basis_yz_eval[ipt + 2*npts] = y*(x104 + x26); + basis_yz_eval[ipt + 3*npts] = x107*z; + basis_yz_eval[ipt + 4*npts] = x108 + x37 + x50; + basis_yz_eval[ipt + 5*npts] = x118*y; + basis_yz_eval[ipt + 6*npts] = x112; + basis_yz_eval[ipt + 7*npts] = x*(x114 + x54); + basis_yz_eval[ipt + 8*npts] = x*(x119 + x65); + basis_yz_eval[ipt + 9*npts] = x120; + basis_yz_eval[ipt + 10*npts] = x111*z; + basis_yz_eval[ipt + 11*npts] = radial_eval_alpha_squared*x15*x8 + x140 + x51; + basis_yz_eval[ipt + 12*npts] = radial_eval_alpha_squared*x11*x14 + x101*x14 + x11*x117 + 4.0*x7; + basis_yz_eval[ipt + 13*npts] = radial_eval_alpha_squared*x17*x5 + x140 + x66; + basis_yz_eval[ipt + 14*npts] = x121*y; // Evaluate second derivative of bfn wrt zz - basis_zz_eval[ipt + 0*npts] = x*x*x*x*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 1*npts] = x*x*x*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 2*npts] = x*x*x*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 3*npts] = x*x*y*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 4*npts] = x*x*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 5*npts] = x*x*(2*radial_eval + 5*radial_eval_alpha*z*z + radial_eval_alpha_squared*z*z*z*z); - basis_zz_eval[ipt + 6*npts] = x*y*y*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 7*npts] = x*y*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 8*npts] = x*y*(2*radial_eval + 5*radial_eval_alpha*z*z + radial_eval_alpha_squared*z*z*z*z); - basis_zz_eval[ipt + 9*npts] = x*z*(6*radial_eval + 7*radial_eval_alpha*z*z + radial_eval_alpha_squared*z*z*z*z); - basis_zz_eval[ipt + 10*npts] = y*y*y*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 11*npts] = y*y*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 12*npts] = y*y*(2*radial_eval + 5*radial_eval_alpha*z*z + radial_eval_alpha_squared*z*z*z*z); - basis_zz_eval[ipt + 13*npts] = y*z*(6*radial_eval + 7*radial_eval_alpha*z*z + radial_eval_alpha_squared*z*z*z*z); - basis_zz_eval[ipt + 14*npts] = z*z*(12*radial_eval + 9*radial_eval_alpha*z*z + radial_eval_alpha_squared*z*z*z*z); + basis_zz_eval[ipt + 0*npts] = x142; + basis_zz_eval[ipt + 1*npts] = x143*y; + basis_zz_eval[ipt + 2*npts] = z*(x124 + x143); + basis_zz_eval[ipt + 3*npts] = x144; + basis_zz_eval[ipt + 4*npts] = x25*(x128 + x145); + basis_zz_eval[ipt + 5*npts] = x146 + x81; + basis_zz_eval[ipt + 6*npts] = x*x147; + basis_zz_eval[ipt + 7*npts] = x52*(x148 + x87); + basis_zz_eval[ipt + 8*npts] = x63*(x149 + x75*x8); + basis_zz_eval[ipt + 9*npts] = x*x151; + basis_zz_eval[ipt + 10*npts] = x152; + basis_zz_eval[ipt + 11*npts] = z*(x147 + x85); + basis_zz_eval[ipt + 12*npts] = x137 + x153; + basis_zz_eval[ipt + 13*npts] = x151*y; + basis_zz_eval[ipt + 14*npts] = x154; + + @@ -296,36 +460,36 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = radial_eval*x*x*x*x; - ang_eval_1 = radial_eval*x*x*x*y; - ang_eval_2 = radial_eval*x*x*x*z; - ang_eval_3 = radial_eval*x*x*y*y; + ang_eval_0 = radial_eval*x0; + ang_eval_1 = x1*x2; + ang_eval_2 = x2*x3; + ang_eval_3 = radial_eval*x6; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 
3*npts] = ang_eval_3; - ang_eval_0 = radial_eval*x*x*y*z; - ang_eval_1 = radial_eval*x*x*z*z; - ang_eval_2 = radial_eval*x*y*y*y; - ang_eval_3 = radial_eval*x*y*y*z; + ang_eval_0 = x4*x7; + ang_eval_1 = radial_eval*x9; + ang_eval_2 = x10*x11; + ang_eval_3 = x12*x5; basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; basis_eval[ipt + 6*npts] = ang_eval_2; basis_eval[ipt + 7*npts] = ang_eval_3; - ang_eval_0 = radial_eval*x*y*z*z; - ang_eval_1 = radial_eval*x*z*z*z; - ang_eval_2 = radial_eval*y*y*y*y; - ang_eval_3 = radial_eval*y*y*y*z; + ang_eval_0 = x13*x8; + ang_eval_1 = x10*x14; + ang_eval_2 = radial_eval*x15; + ang_eval_3 = x11*x3; basis_eval[ipt + 8*npts] = ang_eval_0; basis_eval[ipt + 9*npts] = ang_eval_1; basis_eval[ipt + 10*npts] = ang_eval_2; basis_eval[ipt + 11*npts] = ang_eval_3; - ang_eval_0 = radial_eval*y*y*z*z; - ang_eval_1 = radial_eval*y*z*z*z; - ang_eval_2 = radial_eval*z*z*z*z; + ang_eval_0 = radial_eval*x16; + ang_eval_1 = x1*x14; + ang_eval_2 = radial_eval*x17; basis_eval[ipt + 12*npts] = ang_eval_0; basis_eval[ipt + 13*npts] = ang_eval_1; basis_eval[ipt + 14*npts] = ang_eval_2; @@ -336,18 +500,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; - dang_eval_x_0 = x*x*x*(4*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = radial_eval_alpha*x*x*x*x*y; - dang_eval_z_0 = radial_eval_alpha*x*x*x*x*z; - dang_eval_x_1 = x*x*y*(3*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = x*x*x*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_1 = radial_eval_alpha*x*x*x*y*z; - dang_eval_x_2 = x*x*z*(3*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_2 = radial_eval_alpha*x*x*x*y*z; - dang_eval_z_2 = x*x*x*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_3 = x*y*y*(2*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_3 = x*x*y*(2*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_3 = radial_eval_alpha*x*x*y*y*z; + dang_eval_x_0 = radial_eval_alpha*x18 + x19*x2; + dang_eval_y_0 = x0*x45; + dang_eval_z_0 = x0*x61; + dang_eval_x_1 = x21*y; + dang_eval_y_1 = x24 + x46; + dang_eval_z_1 = x47; + dang_eval_x_2 = x21*z; + dang_eval_y_2 = x47; + dang_eval_z_2 = x29 + x46; + dang_eval_x_3 = x22*x5 + x24; + dang_eval_y_3 = x32 + x4*x48; + dang_eval_z_3 = x6*x61; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; @@ -361,18 +525,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = x*y*z*(2*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = x*x*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = x*x*y*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_1 = x*z*z*(2*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = radial_eval_alpha*x*x*y*z*z; - dang_eval_z_1 = x*x*z*(2*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = y*y*y*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_2 = x*y*y*(3*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_2 = radial_eval_alpha*x*y*y*y*z; - dang_eval_x_3 = y*y*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_3 = x*y*z*(2*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_3 = x*y*y*(radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x25*x27; + dang_eval_y_0 = x50*z; + dang_eval_z_0 = y*(x37 + x49); + dang_eval_x_1 = x22*x8 + x29; 
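Reference note (not part of the patch): the explicit polynomials removed in these collocation hunks and the factored common-subexpression temporaries (x0, x1, ...) that replace them evaluate the same derivative identities for a Cartesian Gaussian basis function. Writing phi_{lmn} = x^l y^m z^n R(r) and assuming a contracted radial factor R = sum_i c_i exp(-alpha_i r^2) -- so that radial_eval = R, radial_eval_alpha = R_a = -2 sum_i c_i alpha_i exp(-alpha_i r^2), and radial_eval_alpha_squared = R_aa = 4 sum_i c_i alpha_i^2 exp(-alpha_i r^2), consistent with the radial_eval_alpha *= -2 and radial_eval_alpha_squared *= 4 scalings in these kernels -- the tabulated quantities are

  \partial_x \phi_{lmn} = \bigl( l\,x^{l-1} R + x^{l+1} R_\alpha \bigr)\, y^m z^n

  \partial_x^2 \phi_{lmn} = \bigl( l(l-1)\,x^{l-2} R + (2l+1)\,x^{l} R_\alpha + x^{l+2} R_{\alpha\alpha} \bigr)\, y^m z^n

  \nabla^2 \phi_{lmn} = \bigl[ l(l-1)\,x^{l-2} y^m z^n + m(m-1)\,x^l y^{m-2} z^n + n(n-1)\,x^l y^m z^{n-2} \bigr] R + \bigl( 2(l+m+n)+3 \bigr)\, x^l y^m z^n R_\alpha + r^2\, x^l y^m z^n R_{\alpha\alpha}

with the y and z derivatives obtained by permuting (l, x), (m, y), (n, z). For example, the removed line basis_xx_eval[ipt + 0*npts] = x*x*(12*radial_eval + 9*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x) is the (l,m,n) = (4,0,0) case of the second identity, and the new x71 = x0*x69 + x0*x70 + x4*x68 expands to the same 12 x^2 R + 9 x^4 R_a + x^6 R_aa. The symbols l, m, n, R_a, R_aa are notation introduced only for this note.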
+ dang_eval_y_1 = x45*x9; + dang_eval_z_1 = x4*x62 + x41; + dang_eval_x_2 = x30 + x32; + dang_eval_y_2 = x*x51; + dang_eval_z_2 = x43; + dang_eval_x_3 = x35*z; + dang_eval_y_3 = x52*x54; + dang_eval_z_3 = x*(x33 + x55); basis_x_eval[ipt + 4*npts] = dang_eval_x_0; basis_y_eval[ipt + 4*npts] = dang_eval_y_0; basis_z_eval[ipt + 4*npts] = dang_eval_z_0; @@ -386,18 +550,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 7*npts] = dang_eval_y_3; basis_z_eval[ipt + 7*npts] = dang_eval_z_3; - dang_eval_x_0 = y*z*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = x*z*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = x*y*z*(2*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_1 = z*z*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = radial_eval_alpha*x*y*z*z*z; - dang_eval_z_1 = x*z*z*(3*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = radial_eval_alpha*x*y*y*y*y; - dang_eval_y_2 = y*y*y*(4*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_2 = radial_eval_alpha*y*y*y*y*z; - dang_eval_x_3 = radial_eval_alpha*x*y*y*y*z; - dang_eval_y_3 = y*y*z*(3*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_3 = y*y*y*(radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x38*y; + dang_eval_y_0 = x*(x36 + x55); + dang_eval_z_0 = x63*x65; + dang_eval_x_1 = x39 + x41; + dang_eval_y_1 = x44; + dang_eval_z_1 = x*x66; + dang_eval_x_2 = x15*x42; + dang_eval_y_2 = radial_eval_alpha*x56 + x11*x19; + dang_eval_z_2 = x15*x61; + dang_eval_x_3 = x43; + dang_eval_y_3 = x51*z; + dang_eval_z_3 = x30 + x58; basis_x_eval[ipt + 8*npts] = dang_eval_x_0; basis_y_eval[ipt + 8*npts] = dang_eval_y_0; basis_z_eval[ipt + 8*npts] = dang_eval_z_0; @@ -411,15 +575,15 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 11*npts] = dang_eval_y_3; basis_z_eval[ipt + 11*npts] = dang_eval_z_3; - dang_eval_x_0 = radial_eval_alpha*x*y*y*z*z; - dang_eval_y_0 = y*z*z*(2*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = y*y*z*(2*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_1 = radial_eval_alpha*x*y*z*z*z; - dang_eval_y_1 = z*z*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_1 = y*z*z*(3*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = radial_eval_alpha*x*z*z*z*z; - dang_eval_y_2 = radial_eval_alpha*y*z*z*z*z; - dang_eval_z_2 = z*z*z*(4*radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x16*x42; + dang_eval_y_0 = x48*x8 + x58; + dang_eval_z_0 = x5*x62 + x60; + dang_eval_x_1 = x44; + dang_eval_y_1 = x39 + x60; + dang_eval_z_1 = x66*y; + dang_eval_x_2 = x17*x42; + dang_eval_y_2 = x17*x45; + dang_eval_z_2 = radial_eval_alpha*x67 + x14*x19; basis_x_eval[ipt + 12*npts] = dang_eval_x_0; basis_y_eval[ipt + 12*npts] = dang_eval_y_0; basis_z_eval[ipt + 12*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_lapgrad.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_lapgrad.hpp new file mode 100644 index 00000000..50bb788d --- /dev/null +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_lapgrad.hpp @@ -0,0 +1,789 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). 
+ * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#pragma once +#include "collocation_device_constants.hpp" +#include "device/xc_device_task.hpp" +#include "device_specific/cuda_device_constants.hpp" +#include "device/common/shell_to_task.hpp" +#include + +namespace GauXC { + + +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_cartesian_lapgrad_4( + uint32_t nshell, + ShellToTaskDevice* __restrict__ shell_to_task, + XCDeviceTask* __restrict__ device_tasks +) { + + + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; + double* my_alpha = alpha[threadIdx.x/32]; + double* my_coeff = coeff[threadIdx.x/32]; + + for( auto ish = blockIdx.z; ish < nshell; ish += gridDim.z ) { + const uint32_t ntasks = shell_to_task[ish].ntask; + const auto shell = shell_to_task[ish].shell_device; + const auto task_idx = shell_to_task[ish].task_idx_device; + const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device; + + + // Load Shell Data into registers / SM + const uint32_t nprim = shell->nprim(); + const double3 O = *reinterpret_cast<const double3*>(shell->O_data()); + + const int global_warp_id = (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size; + const int nwarp_global = max((blockDim.x*gridDim.x) / cuda::warp_size,1); + + // Read in coeffs/exps into SM on first warp + { + auto* coeff_gm = shell->coeff_data(); + auto* alpha_gm = shell->alpha_data(); + static_assert( detail::shell_nprim_max == cuda::warp_size ); + const int warp_rank = threadIdx.x % cuda::warp_size; + my_alpha[warp_rank] = alpha_gm[warp_rank]; + my_coeff[warp_rank] = coeff_gm[warp_rank]; + } + + // Loop over tasks assigned to shells + // Place each task on a different warp + schedule across blocks + for( int itask = global_warp_id; itask < ntasks; itask += nwarp_global ) { + + const auto* task = device_tasks + task_idx[itask]; + const auto* __restrict__ points_x = task->points_x; + const auto* __restrict__ points_y = task->points_y; + const auto* __restrict__ points_z = task->points_z; + const uint32_t npts = task->npts; + const size_t shoff = task_shell_offs[itask] * npts; + + auto* __restrict__ basis_eval = task->bf + shoff; + auto* __restrict__ basis_x_eval = task->dbfx + shoff; + auto* __restrict__ basis_y_eval = task->dbfy + shoff; + auto* __restrict__ basis_z_eval = task->dbfz + shoff; + auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; + auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; + auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; + auto* __restrict__ basis_yy_eval = task->d2bfyy + shoff; + auto* __restrict__ basis_yz_eval = task->d2bfyz + shoff; + auto* __restrict__ basis_zz_eval = task->d2bfzz + shoff; + auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; + auto* __restrict__ basis_lapl_x_eval = task->d3bflapl_x + shoff; + auto* __restrict__ basis_lapl_y_eval = task->d3bflapl_y + shoff; + auto* __restrict__ basis_lapl_z_eval = task->d3bflapl_z + shoff; + + // Loop over points in task + // Assign each point to separate thread within the warp + #pragma unroll 1 + for( int ipt = threadIdx.x % cuda::warp_size; ipt < npts; ipt += cuda::warp_size ) { + //const double3 point = points[ipt]; + double3 point; + point.x = points_x[ipt]; + point.y = points_y[ipt]; + point.z = points_z[ipt]; + + + const auto x = point.x - O.x; + const auto y = point.y - O.y; + const auto z = point.z - O.z; + const auto rsq = x*x + y*y + z*z; + + // Evaluate radial
part of bfn + double radial_eval = 0.; + double radial_eval_alpha = 0.; + double radial_eval_alpha_squared = 0.; + double radial_eval_alpha_cubed = 0.; + + #pragma unroll 1 + for( uint32_t i = 0; i < nprim; ++i ) { + const auto a = my_alpha[i]; + const auto e = my_coeff[i] * std::exp( - a * rsq ); + + radial_eval += e; + radial_eval_alpha += a * e; + radial_eval_alpha_squared += a * a * e; + radial_eval_alpha_cubed += a * a * a * e; + } + + radial_eval_alpha *= -2; + radial_eval_alpha_squared *= 4; + radial_eval_alpha_cubed *= -8; + + // Common Subexpressions + const auto x0 = x*x*x*x; + const auto x1 = radial_eval*y; + const auto x2 = x*x*x; + const auto x3 = radial_eval*z; + const auto x4 = x*x; + const auto x5 = x4; + const auto x6 = y*y; + const auto x7 = x6; + const auto x8 = x5*x7; + const auto x9 = x1*z; + const auto x10 = z*z; + const auto x11 = x10; + const auto x12 = x11*x5; + const auto x13 = radial_eval*x; + const auto x14 = y*y*y; + const auto x15 = x*x3; + const auto x16 = x*x1; + const auto x17 = z*z*z; + const auto x18 = y*y*y*y; + const auto x19 = x11*x7; + const auto x20 = z*z*z*z; + const auto x21 = x*x*x*x*x; + const auto x22 = 4.0*radial_eval; + const auto x23 = 3.0*radial_eval; + const auto x24 = radial_eval_alpha*x0 + x23*x5; + const auto x25 = 2.0*x13; + const auto x26 = x2*x7; + const auto x27 = radial_eval_alpha*x26; + const auto x28 = y*z; + const auto x29 = radial_eval_alpha*x2; + const auto x30 = x25 + x29; + const auto x31 = x11*x2; + const auto x32 = radial_eval_alpha*x31; + const auto x33 = radial_eval*x14; + const auto x34 = x14*x5; + const auto x35 = radial_eval_alpha*x34; + const auto x36 = radial_eval*x7; + const auto x37 = radial_eval_alpha*x8; + const auto x38 = x36 + x37; + const auto x39 = radial_eval*x11; + const auto x40 = radial_eval_alpha*x12; + const auto x41 = x39 + x40; + const auto x42 = radial_eval*x17; + const auto x43 = x17*x5; + const auto x44 = radial_eval_alpha*x43; + const auto x45 = radial_eval_alpha*x; + const auto x46 = x14*x45*z; + const auto x47 = x17*x45*y; + const auto x48 = radial_eval_alpha*y; + const auto x49 = radial_eval*x2; + const auto x50 = radial_eval_alpha*x2*x28; + const auto x51 = 2.0*x1; + const auto x52 = radial_eval*x5; + const auto x53 = x37 + x52; + const auto x54 = radial_eval_alpha*x18 + x23*x7; + const auto x55 = x*z; + const auto x56 = radial_eval_alpha*x14; + const auto x57 = x51 + x56; + const auto x58 = radial_eval_alpha*x19; + const auto x59 = y*y*y*y*y; + const auto x60 = x11*x14; + const auto x61 = radial_eval_alpha*x60; + const auto x62 = x17*x7; + const auto x63 = radial_eval_alpha*x62; + const auto x64 = radial_eval_alpha*z; + const auto x65 = 2.0*x3; + const auto x66 = x*y; + const auto x67 = radial_eval_alpha*x17; + const auto x68 = x65 + x67; + const auto x69 = radial_eval_alpha*x20 + x11*x23; + const auto x70 = z*z*z*z*z; + const auto x71 = 12.0*radial_eval; + const auto x72 = 8.0*radial_eval_alpha; + const auto x73 = radial_eval_alpha + radial_eval_alpha_squared*x5; + const auto x74 = x0*x72 + x0*x73 + x5*x71; + const auto x75 = 6.0*radial_eval_alpha; + const auto x76 = x2*x73; + const auto x77 = 6.0*x13 + x76; + const auto x78 = x2*x75 + x77; + const auto x79 = 4.0*radial_eval_alpha; + const auto x80 = x79*x8; + const auto x81 = 2.0*radial_eval; + const auto x82 = x7*x81; + const auto x83 = x5*x7*x73 + x82; + const auto x84 = x5*x73; + const auto x85 = x81 + x84; + const auto x86 = x12*x79; + const auto x87 = x11*x81; + const auto x88 = x11*x5*x73 + x87; + const auto x89 = 
2.0*radial_eval_alpha; + const auto x90 = x14*x89; + const auto x91 = x14*x73; + const auto x92 = x7*x89; + const auto x93 = x7*x73; + const auto x94 = x11*x89; + const auto x95 = x11*x73; + const auto x96 = x17*x89; + const auto x97 = x17*x73; + const auto x98 = x18*x73; + const auto x99 = x11*x7*x73; + const auto x100 = x20*x73; + const auto x101 = radial_eval_alpha_squared*x21 + x2*x79; + const auto x102 = 3.0*radial_eval_alpha; + const auto x103 = x102*x8; + const auto x104 = x28*(radial_eval_alpha_squared*x0 + x102*x5); + const auto x105 = 2.0*x45; + const auto x106 = 2.0*x48; + const auto x107 = x105*x7; + const auto x108 = radial_eval_alpha_squared*x26; + const auto x109 = x107 + x108; + const auto x110 = x105*x11; + const auto x111 = radial_eval_alpha_squared*x31; + const auto x112 = x110 + x111; + const auto x113 = x106*x5; + const auto x114 = radial_eval_alpha_squared*x34; + const auto x115 = x113 + x114; + const auto x116 = radial_eval_alpha_squared*x11*x5*x7; + const auto x117 = x116 + x58; + const auto x118 = radial_eval_alpha_squared*x43; + const auto x119 = radial_eval_alpha_squared*x59 + x14*x79; + const auto x120 = x55*(radial_eval_alpha_squared*x18 + x102*x7); + const auto x121 = x106*x11; + const auto x122 = radial_eval_alpha_squared*x60; + const auto x123 = x121 + x122; + const auto x124 = radial_eval_alpha_squared*x62; + const auto x125 = x102*x12; + const auto x126 = 2.0*x64; + const auto x127 = x126*x5; + const auto x128 = x118 + x127; + const auto x129 = x126*x7; + const auto x130 = x124 + x129; + const auto x131 = x66*(radial_eval_alpha_squared*x20 + x102*x11); + const auto x132 = radial_eval_alpha_squared*x70 + x17*x79; + const auto x133 = radial_eval_alpha + radial_eval_alpha_squared*x7; + const auto x134 = x0*x133; + const auto x135 = x2*x89; + const auto x136 = x133*x2; + const auto x137 = x5*x81; + const auto x138 = x133*x5*x7 + x137; + const auto x139 = x5*x89; + const auto x140 = x133*x5; + const auto x141 = x11*x133*x5; + const auto x142 = x133*x14; + const auto x143 = 6.0*x1 + x142; + const auto x144 = x14*x75 + x143; + const auto x145 = x133*x7; + const auto x146 = x145 + x81; + const auto x147 = x11*x133; + const auto x148 = x133*x17; + const auto x149 = x133*x18 + x18*x72 + x7*x71; + const auto x150 = x19*x79; + const auto x151 = x11*x133*x7 + x87; + const auto x152 = x133*x20; + const auto x153 = x102*x19; + const auto x154 = radial_eval_alpha + radial_eval_alpha_squared*x11; + const auto x155 = x0*x154; + const auto x156 = x154*x2; + const auto x157 = x154*x5*x7; + const auto x158 = x154*x5; + const auto x159 = x11*x154*x5 + x137; + const auto x160 = x14*x154; + const auto x161 = x154*x7; + const auto x162 = x11*x154; + const auto x163 = x162 + x81; + const auto x164 = x154*x17; + const auto x165 = x164 + 6.0*x3; + const auto x166 = x165 + x17*x75; + const auto x167 = x154*x18; + const auto x168 = x11*x154*x7 + x82; + const auto x169 = x11*x71 + x154*x20 + x20*x72; + const auto x170 = x136 + x156 + x2*x72 + x77; + const auto x171 = x158 + x85; + const auto x172 = x14*x72 + x143 + x160 + x91; + const auto x173 = x146 + x161; + const auto x174 = x147 + x163; + const auto x175 = x148 + x165 + x17*x72 + x97; + const auto x176 = 36.0*radial_eval_alpha; + const auto x177 = radial_eval_alpha_cubed*x7 + radial_eval_alpha_squared; + const auto x178 = x0*x177; + const auto x179 = radial_eval_alpha_cubed*x11 + radial_eval_alpha_squared; + const auto x180 = x0*x179; + const auto x181 = radial_eval_alpha_squared*x; + const auto x182 = radial_eval_alpha_cubed*x2 + 
3.0*x181; + const auto x183 = 6.0*radial_eval; + const auto x184 = 24.0*radial_eval_alpha; + const auto x185 = 2.0*radial_eval_alpha_squared; + const auto x186 = 3.0*x140; + const auto x187 = 3.0*x158; + const auto x188 = x177*x2; + const auto x189 = x179*x2; + const auto x190 = x*x188 + x*x189 + x0*x185 + x182*x2 + x183 + x184*x5 + x186 + x187 + 9.0*x84; + const auto x191 = 2.0*x; + const auto x192 = 4.0*radial_eval_alpha_squared; + const auto x193 = 6.0*x; + const auto x194 = 14.0*x45; + const auto x195 = x177*x5*x7; + const auto x196 = x179*x5*x7; + const auto x197 = 4.0*x13 + x135; + const auto x198 = x177*x5; + const auto x199 = x179*x5; + const auto x200 = x11*x177*x5; + const auto x201 = x11*x179*x5; + const auto x202 = x14*x182; + const auto x203 = x14*x177; + const auto x204 = x14*x179; + const auto x205 = 6.0*x48; + const auto x206 = 6.0*radial_eval_alpha_squared; + const auto x207 = 3.0*x93; + const auto x208 = x7*x75; + const auto x209 = x177*x7; + const auto x210 = x179*x7; + const auto x211 = x206*x8; + const auto x212 = 3.0*x95; + const auto x213 = x11*x75; + const auto x214 = x11*x177; + const auto x215 = x11*x179; + const auto x216 = x12*x206; + const auto x217 = x17*x182; + const auto x218 = x17*x177; + const auto x219 = x17*x179; + const auto x220 = 6.0*x64; + const auto x221 = 12.0*x45; + const auto x222 = 8.0*x181; + const auto x223 = x177*x18; + const auto x224 = x179*x18; + const auto x225 = 6.0*y; + const auto x226 = x225*x45; + const auto x227 = x11*x177*x7; + const auto x228 = x11*x179*x7; + const auto x229 = 6.0*z; + const auto x230 = x229*x45; + const auto x231 = x177*x20; + const auto x232 = x179*x20; + const auto x233 = 12.0*x48; + const auto x234 = radial_eval_alpha_squared*y; + const auto x235 = 8.0*x234; + const auto x236 = radial_eval_alpha_cubed*x5 + radial_eval_alpha_squared; + const auto x237 = x0*x236; + const auto x238 = radial_eval_alpha_cubed*x14 + 3.0*x234; + const auto x239 = x2*x238; + const auto x240 = x2*x236; + const auto x241 = 6.0*x45; + const auto x242 = 2.0*y; + const auto x243 = 14.0*x48; + const auto x244 = x236*x5*x7; + const auto x245 = 4.0*x1 + x90; + const auto x246 = x5*x75; + const auto x247 = x236*x5; + const auto x248 = x11*x236*x5; + const auto x249 = 3.0*x161; + const auto x250 = x14*x236; + const auto x251 = x14*x238 + 9.0*x145 + x18*x185 + x183 + x184*x7 + x204*y + x207 + x249 + x250*y; + const auto x252 = x236*x7; + const auto x253 = 3.0*x147; + const auto x254 = x11*x236; + const auto x255 = x19*x206; + const auto x256 = x28*x75; + const auto x257 = x17*x236; + const auto x258 = x17*x238; + const auto x259 = x18*x236; + const auto x260 = x11*x236*x7; + const auto x261 = x20*x236; + const auto x262 = 12.0*x64; + const auto x263 = radial_eval_alpha_squared*z; + const auto x264 = 8.0*x263; + const auto x265 = radial_eval_alpha_cubed*x17 + 3.0*x263; + const auto x266 = x2*x265; + const auto x267 = 2.0*z; + const auto x268 = 14.0*x64; + const auto x269 = 4.0*x3 + x96; + const auto x270 = x14*x265; + const auto x271 = x11*x184 + 9.0*x162 + x17*x265 + x183 + x185*x20 + x212 + x218*z + x253 + x257*z; + + + // Evaluate basis function + basis_eval[ipt + 0*npts] = radial_eval*x0; + basis_eval[ipt + 1*npts] = x1*x2; + basis_eval[ipt + 2*npts] = x2*x3; + basis_eval[ipt + 3*npts] = radial_eval*x8; + basis_eval[ipt + 4*npts] = x5*x9; + basis_eval[ipt + 5*npts] = radial_eval*x12; + basis_eval[ipt + 6*npts] = x13*x14; + basis_eval[ipt + 7*npts] = x15*x7; + basis_eval[ipt + 8*npts] = x11*x16; + basis_eval[ipt + 9*npts] = x13*x17; + 
basis_eval[ipt + 10*npts] = radial_eval*x18; + basis_eval[ipt + 11*npts] = x14*x3; + basis_eval[ipt + 12*npts] = radial_eval*x19; + basis_eval[ipt + 13*npts] = x1*x17; + basis_eval[ipt + 14*npts] = radial_eval*x20; + + + + // Evaluate first derivative of bfn wrt x + basis_x_eval[ipt + 0*npts] = radial_eval_alpha*x21 + x2*x22; + basis_x_eval[ipt + 1*npts] = x24*y; + basis_x_eval[ipt + 2*npts] = x24*z; + basis_x_eval[ipt + 3*npts] = x25*x7 + x27; + basis_x_eval[ipt + 4*npts] = x28*x30; + basis_x_eval[ipt + 5*npts] = x11*x25 + x32; + basis_x_eval[ipt + 6*npts] = x33 + x35; + basis_x_eval[ipt + 7*npts] = x38*z; + basis_x_eval[ipt + 8*npts] = x41*y; + basis_x_eval[ipt + 9*npts] = x42 + x44; + basis_x_eval[ipt + 10*npts] = x18*x45; + basis_x_eval[ipt + 11*npts] = x46; + basis_x_eval[ipt + 12*npts] = x19*x45; + basis_x_eval[ipt + 13*npts] = x47; + basis_x_eval[ipt + 14*npts] = x20*x45; + + // Evaluate first derivative of bfn wrt y + basis_y_eval[ipt + 0*npts] = x0*x48; + basis_y_eval[ipt + 1*npts] = x27 + x49; + basis_y_eval[ipt + 2*npts] = x50; + basis_y_eval[ipt + 3*npts] = x35 + x5*x51; + basis_y_eval[ipt + 4*npts] = x53*z; + basis_y_eval[ipt + 5*npts] = x12*x48; + basis_y_eval[ipt + 6*npts] = x*x54; + basis_y_eval[ipt + 7*npts] = x55*x57; + basis_y_eval[ipt + 8*npts] = x*(x39 + x58); + basis_y_eval[ipt + 9*npts] = x47; + basis_y_eval[ipt + 10*npts] = radial_eval_alpha*x59 + x14*x22; + basis_y_eval[ipt + 11*npts] = x54*z; + basis_y_eval[ipt + 12*npts] = x11*x51 + x61; + basis_y_eval[ipt + 13*npts] = x42 + x63; + basis_y_eval[ipt + 14*npts] = x20*x48; + + // Evaluate first derivative of bfn wrt z + basis_z_eval[ipt + 0*npts] = x0*x64; + basis_z_eval[ipt + 1*npts] = x50; + basis_z_eval[ipt + 2*npts] = x32 + x49; + basis_z_eval[ipt + 3*npts] = x64*x8; + basis_z_eval[ipt + 4*npts] = y*(x40 + x52); + basis_z_eval[ipt + 5*npts] = x44 + x5*x65; + basis_z_eval[ipt + 6*npts] = x46; + basis_z_eval[ipt + 7*npts] = x*(x36 + x58); + basis_z_eval[ipt + 8*npts] = x66*x68; + basis_z_eval[ipt + 9*npts] = x*x69; + basis_z_eval[ipt + 10*npts] = x18*x64; + basis_z_eval[ipt + 11*npts] = x33 + x61; + basis_z_eval[ipt + 12*npts] = x63 + x65*x7; + basis_z_eval[ipt + 13*npts] = x69*y; + basis_z_eval[ipt + 14*npts] = radial_eval_alpha*x70 + x17*x22; + + // Evaluate second derivative of bfn wrt xx + basis_xx_eval[ipt + 0*npts] = x74; + basis_xx_eval[ipt + 1*npts] = x78*y; + basis_xx_eval[ipt + 2*npts] = x78*z; + basis_xx_eval[ipt + 3*npts] = x80 + x83; + basis_xx_eval[ipt + 4*npts] = x28*(x5*x79 + x85); + basis_xx_eval[ipt + 5*npts] = x86 + x88; + basis_xx_eval[ipt + 6*npts] = x*(x90 + x91); + basis_xx_eval[ipt + 7*npts] = x55*(x92 + x93); + basis_xx_eval[ipt + 8*npts] = x66*(x94 + x95); + basis_xx_eval[ipt + 9*npts] = x*(x96 + x97); + basis_xx_eval[ipt + 10*npts] = x98; + basis_xx_eval[ipt + 11*npts] = x91*z; + basis_xx_eval[ipt + 12*npts] = x99; + basis_xx_eval[ipt + 13*npts] = x97*y; + basis_xx_eval[ipt + 14*npts] = x100; + + // Evaluate second derivative of bfn wrt xy + basis_xy_eval[ipt + 0*npts] = x101*y; + basis_xy_eval[ipt + 1*npts] = radial_eval_alpha_squared*x0*x7 + x103 + x24; + basis_xy_eval[ipt + 2*npts] = x104; + basis_xy_eval[ipt + 3*npts] = radial_eval_alpha_squared*x14*x2 + x105*x14 + x106*x2 + 4.0*x16; + basis_xy_eval[ipt + 4*npts] = z*(x109 + x30); + basis_xy_eval[ipt + 5*npts] = x112*y; + basis_xy_eval[ipt + 6*npts] = radial_eval_alpha_squared*x18*x5 + x103 + x54; + basis_xy_eval[ipt + 7*npts] = z*(x115 + x57); + basis_xy_eval[ipt + 8*npts] = x117 + x41; + basis_xy_eval[ipt + 9*npts] = y*(x118 + 
x67); + basis_xy_eval[ipt + 10*npts] = x*x119; + basis_xy_eval[ipt + 11*npts] = x120; + basis_xy_eval[ipt + 12*npts] = x*x123; + basis_xy_eval[ipt + 13*npts] = x*(x124 + x67); + basis_xy_eval[ipt + 14*npts] = radial_eval_alpha_squared*x20*x66; + + // Evaluate second derivative of bfn wrt xz + basis_xz_eval[ipt + 0*npts] = x101*z; + basis_xz_eval[ipt + 1*npts] = x104; + basis_xz_eval[ipt + 2*npts] = radial_eval_alpha_squared*x0*x11 + x125 + x24; + basis_xz_eval[ipt + 3*npts] = x109*z; + basis_xz_eval[ipt + 4*npts] = y*(x112 + x30); + basis_xz_eval[ipt + 5*npts] = radial_eval_alpha_squared*x17*x2 + x105*x17 + x126*x2 + 4.0*x15; + basis_xz_eval[ipt + 6*npts] = z*(x114 + x56); + basis_xz_eval[ipt + 7*npts] = x117 + x38; + basis_xz_eval[ipt + 8*npts] = y*(x128 + x68); + basis_xz_eval[ipt + 9*npts] = radial_eval_alpha_squared*x20*x5 + x125 + x69; + basis_xz_eval[ipt + 10*npts] = radial_eval_alpha_squared*x18*x55; + basis_xz_eval[ipt + 11*npts] = x*(x122 + x56); + basis_xz_eval[ipt + 12*npts] = x*x130; + basis_xz_eval[ipt + 13*npts] = x131; + basis_xz_eval[ipt + 14*npts] = x*x132; + + // Evaluate second derivative of bfn wrt yy + basis_yy_eval[ipt + 0*npts] = x134; + basis_yy_eval[ipt + 1*npts] = y*(x135 + x136); + basis_yy_eval[ipt + 2*npts] = x136*z; + basis_yy_eval[ipt + 3*npts] = x138 + x80; + basis_yy_eval[ipt + 4*npts] = x28*(x139 + x140); + basis_yy_eval[ipt + 5*npts] = x141; + basis_yy_eval[ipt + 6*npts] = x*x144; + basis_yy_eval[ipt + 7*npts] = x55*(x146 + x7*x79); + basis_yy_eval[ipt + 8*npts] = x66*(x147 + x94); + basis_yy_eval[ipt + 9*npts] = x*x148; + basis_yy_eval[ipt + 10*npts] = x149; + basis_yy_eval[ipt + 11*npts] = x144*z; + basis_yy_eval[ipt + 12*npts] = x150 + x151; + basis_yy_eval[ipt + 13*npts] = y*(x148 + x96); + basis_yy_eval[ipt + 14*npts] = x152; + + // Evaluate second derivative of bfn wrt yz + basis_yz_eval[ipt + 0*npts] = radial_eval_alpha_squared*x0*x28; + basis_yz_eval[ipt + 1*npts] = z*(x108 + x29); + basis_yz_eval[ipt + 2*npts] = y*(x111 + x29); + basis_yz_eval[ipt + 3*npts] = x115*z; + basis_yz_eval[ipt + 4*npts] = x116 + x40 + x53; + basis_yz_eval[ipt + 5*npts] = x128*y; + basis_yz_eval[ipt + 6*npts] = x120; + basis_yz_eval[ipt + 7*npts] = x*(x123 + x57); + basis_yz_eval[ipt + 8*npts] = x*(x130 + x68); + basis_yz_eval[ipt + 9*npts] = x131; + basis_yz_eval[ipt + 10*npts] = x119*z; + basis_yz_eval[ipt + 11*npts] = radial_eval_alpha_squared*x11*x18 + x153 + x54; + basis_yz_eval[ipt + 12*npts] = radial_eval_alpha_squared*x14*x17 + x106*x17 + x126*x14 + 4.0*x9; + basis_yz_eval[ipt + 13*npts] = radial_eval_alpha_squared*x20*x7 + x153 + x69; + basis_yz_eval[ipt + 14*npts] = x132*y; + + // Evaluate second derivative of bfn wrt zz + basis_zz_eval[ipt + 0*npts] = x155; + basis_zz_eval[ipt + 1*npts] = x156*y; + basis_zz_eval[ipt + 2*npts] = z*(x135 + x156); + basis_zz_eval[ipt + 3*npts] = x157; + basis_zz_eval[ipt + 4*npts] = x28*(x139 + x158); + basis_zz_eval[ipt + 5*npts] = x159 + x86; + basis_zz_eval[ipt + 6*npts] = x*x160; + basis_zz_eval[ipt + 7*npts] = x55*(x161 + x92); + basis_zz_eval[ipt + 8*npts] = x66*(x11*x79 + x163); + basis_zz_eval[ipt + 9*npts] = x*x166; + basis_zz_eval[ipt + 10*npts] = x167; + basis_zz_eval[ipt + 11*npts] = z*(x160 + x90); + basis_zz_eval[ipt + 12*npts] = x150 + x168; + basis_zz_eval[ipt + 13*npts] = x166*y; + basis_zz_eval[ipt + 14*npts] = x169; + + // Evaluate Laplacian of bfn + basis_lapl_eval[ipt + 0*npts] = x134 + x155 + x74; + basis_lapl_eval[ipt + 1*npts] = x170*y; + basis_lapl_eval[ipt + 2*npts] = x170*z; + basis_lapl_eval[ipt + 
3*npts] = x138 + x157 + x72*x8 + x83; + basis_lapl_eval[ipt + 4*npts] = x28*(x140 + x171 + x5*x72); + basis_lapl_eval[ipt + 5*npts] = x12*x72 + x141 + x159 + x88; + basis_lapl_eval[ipt + 6*npts] = x*x172; + basis_lapl_eval[ipt + 7*npts] = x55*(x173 + x7*x72 + x93); + basis_lapl_eval[ipt + 8*npts] = x66*(x11*x72 + x174 + x95); + basis_lapl_eval[ipt + 9*npts] = x*x175; + basis_lapl_eval[ipt + 10*npts] = x149 + x167 + x98; + basis_lapl_eval[ipt + 11*npts] = x172*z; + basis_lapl_eval[ipt + 12*npts] = x151 + x168 + x19*x72 + x99; + basis_lapl_eval[ipt + 13*npts] = x175*y; + basis_lapl_eval[ipt + 14*npts] = x100 + x152 + x169; + + // Evaluate Laplacian gradient of bfn (dx) + basis_lapl_x_eval[ipt + 0*npts] = x*x178 + x*x180 + x0*x182 + 24.0*x13 + 4.0*x136 + 4.0*x156 + x176*x2 + 12.0*x76; + basis_lapl_x_eval[ipt + 1*npts] = x190*y; + basis_lapl_x_eval[ipt + 2*npts] = x190*z; + basis_lapl_x_eval[ipt + 3*npts] = x*x195 + x*x196 + x145*x191 + x161*x191 + x182*x5*x7 + x192*x26 + x193*x93 + x194*x7 + x197; + basis_lapl_x_eval[ipt + 4*npts] = x28*(x*x198 + x*x199 + x133*x191 + x154*x191 + x182*x5 + x192*x2 + x193*x73 + x194); + basis_lapl_x_eval[ipt + 5*npts] = x*x200 + x*x201 + x11*x182*x5 + x11*x194 + x147*x191 + x162*x191 + x192*x31 + x193*x95 + x197; + basis_lapl_x_eval[ipt + 6*npts] = x*x202 + x144 + x160 + x203*x4 + x204*x4 + x205*x5 + x206*x34 + 3.0*x91; + basis_lapl_x_eval[ipt + 7*npts] = z*(x*x182*x7 + x139 + x173 + x207 + x208 + x209*x4 + x210*x4 + x211); + basis_lapl_x_eval[ipt + 8*npts] = y*(x*x11*x182 + x139 + x174 + x212 + x213 + x214*x4 + x215*x4 + x216); + basis_lapl_x_eval[ipt + 9*npts] = x*x217 + x148 + x166 + x206*x43 + x218*x4 + x219*x4 + x220*x5 + 3.0*x97; + basis_lapl_x_eval[ipt + 10*npts] = x*x223 + x*x224 + x18*x182 + x18*x222 + x221*x7; + basis_lapl_x_eval[ipt + 11*npts] = z*(x*x203 + x*x204 + x14*x222 + x202 + x226); + basis_lapl_x_eval[ipt + 12*npts] = x*x227 + x*x228 + x107 + x11*x182*x7 + x110 + x19*x222; + basis_lapl_x_eval[ipt + 13*npts] = y*(x*x218 + x*x219 + x17*x222 + x217 + x230); + basis_lapl_x_eval[ipt + 14*npts] = x*x231 + x*x232 + x11*x221 + x182*x20 + x20*x222; + // Evaluate Laplacian gradient of bfn (dy) + basis_lapl_y_eval[ipt + 0*npts] = x0*x235 + x0*x238 + x180*y + x233*x5 + x237*y; + basis_lapl_y_eval[ipt + 1*npts] = 3.0*x136 + x156 + x189*x6 + x206*x26 + x239*y + x240*x6 + x241*x7 + x78; + basis_lapl_y_eval[ipt + 2*npts] = z*(x189*y + x2*x235 + x226 + x239 + x240*y); + basis_lapl_y_eval[ipt + 3*npts] = x140*x225 + x158*x242 + x192*x34 + x196*y + x238*x5*x7 + x242*x84 + x243*x5 + x244*y + x245; + basis_lapl_y_eval[ipt + 4*npts] = z*(x171 + x186 + x199*x6 + x211 + x238*x5*y + x246 + x247*x6 + x92); + basis_lapl_y_eval[ipt + 5*npts] = x11*x238*x5 + x113 + x12*x235 + x121 + x201*y + x248*y; + basis_lapl_y_eval[ipt + 6*npts] = x*x251; + basis_lapl_y_eval[ipt + 7*npts] = x55*(x133*x225 + x14*x192 + x154*x242 + x210*y + x238*x7 + x242*x73 + x243 + x252*y); + basis_lapl_y_eval[ipt + 8*npts] = x*(x11*x238*y + x163 + x213 + x215*x6 + x253 + x254*x6 + x255 + x92 + x95); + basis_lapl_y_eval[ipt + 9*npts] = x*(x17*x235 + x219*y + x256 + x257*y + x258); + basis_lapl_y_eval[ipt + 10*npts] = 24.0*x1 + x14*x176 + 12.0*x142 + 4.0*x160 + x18*x238 + x224*y + x259*y + 4.0*x91; + basis_lapl_y_eval[ipt + 11*npts] = x251*z; + basis_lapl_y_eval[ipt + 12*npts] = x11*x238*x7 + x11*x243 + x147*x225 + x162*x242 + x192*x60 + x228*y + x242*x95 + x245 + x260*y; + basis_lapl_y_eval[ipt + 13*npts] = 3.0*x148 + x166 + x206*x62 + x219*x6 + x220*x7 + x257*x6 + x258*y + x97; + 
basis_lapl_y_eval[ipt + 14*npts] = x11*x233 + x20*x235 + x20*x238 + x232*y + x261*y; + // Evaluate Laplacian gradient of bfn (dz) + basis_lapl_z_eval[ipt + 0*npts] = x0*x264 + x0*x265 + x178*z + x237*z + x262*x5; + basis_lapl_z_eval[ipt + 1*npts] = y*(x188*z + x2*x264 + x230 + x240*z + x266); + basis_lapl_z_eval[ipt + 2*npts] = x10*x188 + x10*x240 + x11*x241 + x136 + 3.0*x156 + x206*x31 + x266*z + x78; + basis_lapl_z_eval[ipt + 3*npts] = x127 + x129 + x195*z + x244*z + x264*x8 + x265*x5*x7; + basis_lapl_z_eval[ipt + 4*npts] = y*(x10*x198 + x10*x247 + x140 + x187 + x216 + x246 + x265*x5*z + x85 + x94); + basis_lapl_z_eval[ipt + 5*npts] = x11*x265*x5 + x140*x267 + x158*x229 + x192*x43 + x200*z + x248*z + x267*x84 + x268*x5 + x269; + basis_lapl_z_eval[ipt + 6*npts] = x*(x14*x264 + x203*z + x250*z + x256 + x270); + basis_lapl_z_eval[ipt + 7*npts] = x*(x10*x209 + x10*x252 + x146 + x208 + x249 + x255 + x265*x7*z + x93 + x94); + basis_lapl_z_eval[ipt + 8*npts] = x66*(x11*x265 + x133*x267 + x154*x229 + x17*x192 + x214*z + x254*z + x267*x73 + x268); + basis_lapl_z_eval[ipt + 9*npts] = x*x271; + basis_lapl_z_eval[ipt + 10*npts] = x18*x264 + x18*x265 + x223*z + x259*z + x262*x7; + basis_lapl_z_eval[ipt + 11*npts] = x10*x203 + x10*x250 + x11*x205 + x144 + 3.0*x160 + x206*x60 + x270*z + x91; + basis_lapl_z_eval[ipt + 12*npts] = x11*x265*x7 + x145*x267 + x161*x229 + x192*x62 + x227*z + x260*z + x267*x93 + x268*x7 + x269; + basis_lapl_z_eval[ipt + 13*npts] = x271*y; + basis_lapl_z_eval[ipt + 14*npts] = 4.0*x148 + 12.0*x164 + x17*x176 + x20*x265 + x231*z + x261*z + 24.0*x3 + 4.0*x97; + + + + +#if 0 + // Evaluate the angular part of bfn + + + + double ang_eval_0; + double ang_eval_1; + double ang_eval_2; + double ang_eval_3; + + + ang_eval_0 = radial_eval*x0; + ang_eval_1 = x1*x2; + ang_eval_2 = x2*x3; + ang_eval_3 = radial_eval*x8; + basis_eval[ipt + 0*npts] = ang_eval_0; + basis_eval[ipt + 1*npts] = ang_eval_1; + basis_eval[ipt + 2*npts] = ang_eval_2; + basis_eval[ipt + 3*npts] = ang_eval_3; + + ang_eval_0 = x5*x9; + ang_eval_1 = radial_eval*x12; + ang_eval_2 = x13*x14; + ang_eval_3 = x15*x7; + basis_eval[ipt + 4*npts] = ang_eval_0; + basis_eval[ipt + 5*npts] = ang_eval_1; + basis_eval[ipt + 6*npts] = ang_eval_2; + basis_eval[ipt + 7*npts] = ang_eval_3; + + ang_eval_0 = x11*x16; + ang_eval_1 = x13*x17; + ang_eval_2 = radial_eval*x18; + ang_eval_3 = x14*x3; + basis_eval[ipt + 8*npts] = ang_eval_0; + basis_eval[ipt + 9*npts] = ang_eval_1; + basis_eval[ipt + 10*npts] = ang_eval_2; + basis_eval[ipt + 11*npts] = ang_eval_3; + + ang_eval_0 = radial_eval*x19; + ang_eval_1 = x1*x17; + ang_eval_2 = radial_eval*x20; + basis_eval[ipt + 12*npts] = ang_eval_0; + basis_eval[ipt + 13*npts] = ang_eval_1; + basis_eval[ipt + 14*npts] = ang_eval_2; + + + double dang_eval_x_0, dang_eval_y_0, dang_eval_z_0; + double dang_eval_x_1, dang_eval_y_1, dang_eval_z_1; + double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; + double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; + + dang_eval_x_0 = radial_eval_alpha*x21 + x2*x22; + dang_eval_y_0 = x0*x48; + dang_eval_z_0 = x0*x64; + dang_eval_x_1 = x24*y; + dang_eval_y_1 = x27 + x49; + dang_eval_z_1 = x50; + dang_eval_x_2 = x24*z; + dang_eval_y_2 = x50; + dang_eval_z_2 = x32 + x49; + dang_eval_x_3 = x25*x7 + x27; + dang_eval_y_3 = x35 + x5*x51; + dang_eval_z_3 = x64*x8; + basis_x_eval[ipt + 0*npts] = dang_eval_x_0; + basis_y_eval[ipt + 0*npts] = dang_eval_y_0; + basis_z_eval[ipt + 0*npts] = dang_eval_z_0; + basis_x_eval[ipt + 1*npts] = dang_eval_x_1; + basis_y_eval[ipt + 1*npts] = 
dang_eval_y_1; + basis_z_eval[ipt + 1*npts] = dang_eval_z_1; + basis_x_eval[ipt + 2*npts] = dang_eval_x_2; + basis_y_eval[ipt + 2*npts] = dang_eval_y_2; + basis_z_eval[ipt + 2*npts] = dang_eval_z_2; + basis_x_eval[ipt + 3*npts] = dang_eval_x_3; + basis_y_eval[ipt + 3*npts] = dang_eval_y_3; + basis_z_eval[ipt + 3*npts] = dang_eval_z_3; + + dang_eval_x_0 = x28*x30; + dang_eval_y_0 = x53*z; + dang_eval_z_0 = y*(x40 + x52); + dang_eval_x_1 = x11*x25 + x32; + dang_eval_y_1 = x12*x48; + dang_eval_z_1 = x44 + x5*x65; + dang_eval_x_2 = x33 + x35; + dang_eval_y_2 = x*x54; + dang_eval_z_2 = x46; + dang_eval_x_3 = x38*z; + dang_eval_y_3 = x55*x57; + dang_eval_z_3 = x*(x36 + x58); + basis_x_eval[ipt + 4*npts] = dang_eval_x_0; + basis_y_eval[ipt + 4*npts] = dang_eval_y_0; + basis_z_eval[ipt + 4*npts] = dang_eval_z_0; + basis_x_eval[ipt + 5*npts] = dang_eval_x_1; + basis_y_eval[ipt + 5*npts] = dang_eval_y_1; + basis_z_eval[ipt + 5*npts] = dang_eval_z_1; + basis_x_eval[ipt + 6*npts] = dang_eval_x_2; + basis_y_eval[ipt + 6*npts] = dang_eval_y_2; + basis_z_eval[ipt + 6*npts] = dang_eval_z_2; + basis_x_eval[ipt + 7*npts] = dang_eval_x_3; + basis_y_eval[ipt + 7*npts] = dang_eval_y_3; + basis_z_eval[ipt + 7*npts] = dang_eval_z_3; + + dang_eval_x_0 = x41*y; + dang_eval_y_0 = x*(x39 + x58); + dang_eval_z_0 = x66*x68; + dang_eval_x_1 = x42 + x44; + dang_eval_y_1 = x47; + dang_eval_z_1 = x*x69; + dang_eval_x_2 = x18*x45; + dang_eval_y_2 = radial_eval_alpha*x59 + x14*x22; + dang_eval_z_2 = x18*x64; + dang_eval_x_3 = x46; + dang_eval_y_3 = x54*z; + dang_eval_z_3 = x33 + x61; + basis_x_eval[ipt + 8*npts] = dang_eval_x_0; + basis_y_eval[ipt + 8*npts] = dang_eval_y_0; + basis_z_eval[ipt + 8*npts] = dang_eval_z_0; + basis_x_eval[ipt + 9*npts] = dang_eval_x_1; + basis_y_eval[ipt + 9*npts] = dang_eval_y_1; + basis_z_eval[ipt + 9*npts] = dang_eval_z_1; + basis_x_eval[ipt + 10*npts] = dang_eval_x_2; + basis_y_eval[ipt + 10*npts] = dang_eval_y_2; + basis_z_eval[ipt + 10*npts] = dang_eval_z_2; + basis_x_eval[ipt + 11*npts] = dang_eval_x_3; + basis_y_eval[ipt + 11*npts] = dang_eval_y_3; + basis_z_eval[ipt + 11*npts] = dang_eval_z_3; + + dang_eval_x_0 = x19*x45; + dang_eval_y_0 = x11*x51 + x61; + dang_eval_z_0 = x63 + x65*x7; + dang_eval_x_1 = x47; + dang_eval_y_1 = x42 + x63; + dang_eval_z_1 = x69*y; + dang_eval_x_2 = x20*x45; + dang_eval_y_2 = x20*x48; + dang_eval_z_2 = radial_eval_alpha*x70 + x17*x22; + basis_x_eval[ipt + 12*npts] = dang_eval_x_0; + basis_y_eval[ipt + 12*npts] = dang_eval_y_0; + basis_z_eval[ipt + 12*npts] = dang_eval_z_0; + basis_x_eval[ipt + 13*npts] = dang_eval_x_1; + basis_y_eval[ipt + 13*npts] = dang_eval_y_1; + basis_z_eval[ipt + 13*npts] = dang_eval_z_1; + basis_x_eval[ipt + 14*npts] = dang_eval_x_2; + basis_y_eval[ipt + 14*npts] = dang_eval_y_2; + basis_z_eval[ipt + 14*npts] = dang_eval_z_2; + +#endif + } // Loop over points within task + } // Loop over tasks + + } // Loop over shells +} // end kernel + +} // namespace GauXC diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_laplacian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_laplacian.hpp index cbd77a2e..52f08f34 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_laplacian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_laplacian.hpp @@ -1,7 
+1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_cartesian_laplacian_4( +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_cartesian_laplacian_4( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; // Loop over points in task @@ -103,94 +106,255 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = x*x*x*x; + const auto x1 = radial_eval*y; + const auto x2 = x*x*x; + const auto x3 = radial_eval*z; + const auto x4 = x*x; + const auto x5 = y*y; + const auto x6 = x4*x5; + const auto x7 = x1*z; + const auto x8 = z*z; + const auto x9 = x4*x8; + const auto x10 = radial_eval*x; + const auto x11 = y*y*y; + const auto x12 = x*x3; + const auto x13 = x*x1; + const auto x14 = z*z*z; + const auto x15 = y*y*y*y; + const auto x16 = x5*x8; + const auto x17 = z*z*z*z; + const auto x18 = x*x*x*x*x; + const auto x19 = 4.0*radial_eval; + const auto x20 = 3.0*radial_eval; + const auto x21 = radial_eval_alpha*x0 + x20*x4; + const auto x22 = 2.0*x10; + const auto x23 = x2*x5; + const auto x24 = radial_eval_alpha*x23; + const auto x25 = y*z; + const auto x26 = radial_eval_alpha*x2; + const auto x27 = x22 + x26; + const auto x28 = x2*x8; + const auto x29 = radial_eval_alpha*x28; + const auto x30 = radial_eval*x11; + const auto x31 = x11*x4; + const auto x32 = radial_eval_alpha*x31; + const auto x33 = radial_eval*x5; + const auto x34 = radial_eval_alpha*x6; + const auto x35 = x33 + x34; + const auto x36 = radial_eval*x8; + const auto x37 = radial_eval_alpha*x9; + const auto x38 = x36 + x37; + const auto x39 = radial_eval*x14; + const auto x40 = x14*x4; + const auto x41 = radial_eval_alpha*x40; + const auto x42 = radial_eval_alpha*x; + const auto x43 = x11*x42*z; + const auto x44 = x14*x42*y; + const auto x45 = radial_eval_alpha*y; + const auto x46 = radial_eval*x2; + const auto x47 = radial_eval_alpha*x2*x25; + const auto x48 = 2.0*x1; + const auto x49 = radial_eval*x4; + const auto x50 = x34 + x49; + const auto x51 = radial_eval_alpha*x15 + x20*x5; + const auto x52 = x*z; + const auto x53 = radial_eval_alpha*x11; + const auto x54 = x48 + x53; + const auto x55 = radial_eval_alpha*x16; + const auto x56 = y*y*y*y*y; + const auto x57 = x11*x8; + const auto x58 = radial_eval_alpha*x57; + const auto 
x59 = x14*x5; + const auto x60 = radial_eval_alpha*x59; + const auto x61 = radial_eval_alpha*z; + const auto x62 = 2.0*x3; + const auto x63 = x*y; + const auto x64 = radial_eval_alpha*x14; + const auto x65 = x62 + x64; + const auto x66 = radial_eval_alpha*x17 + x20*x8; + const auto x67 = z*z*z*z*z; + const auto x68 = 12.0*radial_eval; + const auto x69 = 8.0*radial_eval_alpha; + const auto x70 = radial_eval_alpha + radial_eval_alpha_squared*x4; + const auto x71 = x0*x69 + x0*x70 + x4*x68; + const auto x72 = 6.0*radial_eval_alpha; + const auto x73 = 6.0*x10 + x2*x70; + const auto x74 = x2*x72 + x73; + const auto x75 = 4.0*radial_eval_alpha; + const auto x76 = x6*x75; + const auto x77 = 2.0*radial_eval; + const auto x78 = x5*x77; + const auto x79 = x4*x5*x70 + x78; + const auto x80 = x4*x70 + x77; + const auto x81 = x75*x9; + const auto x82 = x77*x8; + const auto x83 = x4*x70*x8 + x82; + const auto x84 = 2.0*radial_eval_alpha; + const auto x85 = x11*x84; + const auto x86 = x11*x70; + const auto x87 = x5*x84; + const auto x88 = x5*x70; + const auto x89 = x8*x84; + const auto x90 = x70*x8; + const auto x91 = x14*x84; + const auto x92 = x14*x70; + const auto x93 = x15*x70; + const auto x94 = x5*x70*x8; + const auto x95 = x17*x70; + const auto x96 = radial_eval_alpha_squared*x18 + x2*x75; + const auto x97 = 3.0*radial_eval_alpha; + const auto x98 = x6*x97; + const auto x99 = x25*(radial_eval_alpha_squared*x0 + x4*x97); + const auto x100 = 2.0*x42; + const auto x101 = 2.0*x45; + const auto x102 = radial_eval_alpha_squared*x23; + const auto x103 = x100*x5 + x102; + const auto x104 = radial_eval_alpha_squared*x28; + const auto x105 = x100*x8 + x104; + const auto x106 = radial_eval_alpha_squared*x31; + const auto x107 = x101*x4 + x106; + const auto x108 = radial_eval_alpha_squared*x4*x5*x8; + const auto x109 = x108 + x55; + const auto x110 = radial_eval_alpha_squared*x40; + const auto x111 = radial_eval_alpha_squared*x56 + x11*x75; + const auto x112 = x52*(radial_eval_alpha_squared*x15 + x5*x97); + const auto x113 = radial_eval_alpha_squared*x57; + const auto x114 = x101*x8 + x113; + const auto x115 = radial_eval_alpha_squared*x59; + const auto x116 = x9*x97; + const auto x117 = 2.0*x61; + const auto x118 = x110 + x117*x4; + const auto x119 = x115 + x117*x5; + const auto x120 = x63*(radial_eval_alpha_squared*x17 + x8*x97); + const auto x121 = radial_eval_alpha_squared*x67 + x14*x75; + const auto x122 = radial_eval_alpha + radial_eval_alpha_squared*x5; + const auto x123 = x0*x122; + const auto x124 = x2*x84; + const auto x125 = x122*x2; + const auto x126 = x4*x77; + const auto x127 = x122*x4*x5 + x126; + const auto x128 = x4*x84; + const auto x129 = x122*x4; + const auto x130 = x122*x4*x8; + const auto x131 = 6.0*x1 + x11*x122; + const auto x132 = x11*x72 + x131; + const auto x133 = x122*x5 + x77; + const auto x134 = x122*x8; + const auto x135 = x122*x14; + const auto x136 = x122*x15 + x15*x69 + x5*x68; + const auto x137 = x16*x75; + const auto x138 = x122*x5*x8 + x82; + const auto x139 = x122*x17; + const auto x140 = x16*x97; + const auto x141 = radial_eval_alpha + radial_eval_alpha_squared*x8; + const auto x142 = x0*x141; + const auto x143 = x141*x2; + const auto x144 = x141*x4*x5; + const auto x145 = x141*x4; + const auto x146 = x126 + x141*x4*x8; + const auto x147 = x11*x141; + const auto x148 = x141*x5; + const auto x149 = x141*x8 + x77; + const auto x150 = x14*x141 + 6.0*x3; + const auto x151 = x14*x72 + x150; + const auto x152 = x141*x15; + const auto x153 = x141*x5*x8 + x78; + const auto x154 
= x141*x17 + x17*x69 + x68*x8; + const auto x155 = x125 + x143 + x2*x69 + x73; + const auto x156 = x11*x69 + x131 + x147 + x86; + const auto x157 = x135 + x14*x69 + x150 + x92; + // Evaluate basis function - basis_eval[ipt + 0*npts] = radial_eval*x*x*x*x; - basis_eval[ipt + 1*npts] = radial_eval*x*x*x*y; - basis_eval[ipt + 2*npts] = radial_eval*x*x*x*z; - basis_eval[ipt + 3*npts] = radial_eval*x*x*y*y; - basis_eval[ipt + 4*npts] = radial_eval*x*x*y*z; - basis_eval[ipt + 5*npts] = radial_eval*x*x*z*z; - basis_eval[ipt + 6*npts] = radial_eval*x*y*y*y; - basis_eval[ipt + 7*npts] = radial_eval*x*y*y*z; - basis_eval[ipt + 8*npts] = radial_eval*x*y*z*z; - basis_eval[ipt + 9*npts] = radial_eval*x*z*z*z; - basis_eval[ipt + 10*npts] = radial_eval*y*y*y*y; - basis_eval[ipt + 11*npts] = radial_eval*y*y*y*z; - basis_eval[ipt + 12*npts] = radial_eval*y*y*z*z; - basis_eval[ipt + 13*npts] = radial_eval*y*z*z*z; - basis_eval[ipt + 14*npts] = radial_eval*z*z*z*z; + basis_eval[ipt + 0*npts] = radial_eval*x0; + basis_eval[ipt + 1*npts] = x1*x2; + basis_eval[ipt + 2*npts] = x2*x3; + basis_eval[ipt + 3*npts] = radial_eval*x6; + basis_eval[ipt + 4*npts] = x4*x7; + basis_eval[ipt + 5*npts] = radial_eval*x9; + basis_eval[ipt + 6*npts] = x10*x11; + basis_eval[ipt + 7*npts] = x12*x5; + basis_eval[ipt + 8*npts] = x13*x8; + basis_eval[ipt + 9*npts] = x10*x14; + basis_eval[ipt + 10*npts] = radial_eval*x15; + basis_eval[ipt + 11*npts] = x11*x3; + basis_eval[ipt + 12*npts] = radial_eval*x16; + basis_eval[ipt + 13*npts] = x1*x14; + basis_eval[ipt + 14*npts] = radial_eval*x17; // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = x*x*x*(4*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 1*npts] = x*x*y*(3*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 2*npts] = x*x*z*(3*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 3*npts] = x*y*y*(2*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 4*npts] = x*y*z*(2*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 5*npts] = x*z*z*(2*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 6*npts] = y*y*y*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 7*npts] = y*y*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 8*npts] = y*z*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 9*npts] = z*z*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 10*npts] = radial_eval_alpha*x*y*y*y*y; - basis_x_eval[ipt + 11*npts] = radial_eval_alpha*x*y*y*y*z; - basis_x_eval[ipt + 12*npts] = radial_eval_alpha*x*y*y*z*z; - basis_x_eval[ipt + 13*npts] = radial_eval_alpha*x*y*z*z*z; - basis_x_eval[ipt + 14*npts] = radial_eval_alpha*x*z*z*z*z; + basis_x_eval[ipt + 0*npts] = radial_eval_alpha*x18 + x19*x2; + basis_x_eval[ipt + 1*npts] = x21*y; + basis_x_eval[ipt + 2*npts] = x21*z; + basis_x_eval[ipt + 3*npts] = x22*x5 + x24; + basis_x_eval[ipt + 4*npts] = x25*x27; + basis_x_eval[ipt + 5*npts] = x22*x8 + x29; + basis_x_eval[ipt + 6*npts] = x30 + x32; + basis_x_eval[ipt + 7*npts] = x35*z; + basis_x_eval[ipt + 8*npts] = x38*y; + basis_x_eval[ipt + 9*npts] = x39 + x41; + basis_x_eval[ipt + 10*npts] = x15*x42; + basis_x_eval[ipt + 11*npts] = x43; + basis_x_eval[ipt + 12*npts] = x16*x42; + basis_x_eval[ipt + 13*npts] = x44; + basis_x_eval[ipt + 14*npts] = x17*x42; // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = radial_eval_alpha*x*x*x*x*y; - basis_y_eval[ipt + 1*npts] = x*x*x*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 2*npts] = radial_eval_alpha*x*x*x*y*z; 
- basis_y_eval[ipt + 3*npts] = x*x*y*(2*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 4*npts] = x*x*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 5*npts] = radial_eval_alpha*x*x*y*z*z; - basis_y_eval[ipt + 6*npts] = x*y*y*(3*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 7*npts] = x*y*z*(2*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 8*npts] = x*z*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 9*npts] = radial_eval_alpha*x*y*z*z*z; - basis_y_eval[ipt + 10*npts] = y*y*y*(4*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 11*npts] = y*y*z*(3*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 12*npts] = y*z*z*(2*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 13*npts] = z*z*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 14*npts] = radial_eval_alpha*y*z*z*z*z; + basis_y_eval[ipt + 0*npts] = x0*x45; + basis_y_eval[ipt + 1*npts] = x24 + x46; + basis_y_eval[ipt + 2*npts] = x47; + basis_y_eval[ipt + 3*npts] = x32 + x4*x48; + basis_y_eval[ipt + 4*npts] = x50*z; + basis_y_eval[ipt + 5*npts] = x45*x9; + basis_y_eval[ipt + 6*npts] = x*x51; + basis_y_eval[ipt + 7*npts] = x52*x54; + basis_y_eval[ipt + 8*npts] = x*(x36 + x55); + basis_y_eval[ipt + 9*npts] = x44; + basis_y_eval[ipt + 10*npts] = radial_eval_alpha*x56 + x11*x19; + basis_y_eval[ipt + 11*npts] = x51*z; + basis_y_eval[ipt + 12*npts] = x48*x8 + x58; + basis_y_eval[ipt + 13*npts] = x39 + x60; + basis_y_eval[ipt + 14*npts] = x17*x45; // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = radial_eval_alpha*x*x*x*x*z; - basis_z_eval[ipt + 1*npts] = radial_eval_alpha*x*x*x*y*z; - basis_z_eval[ipt + 2*npts] = x*x*x*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 3*npts] = radial_eval_alpha*x*x*y*y*z; - basis_z_eval[ipt + 4*npts] = x*x*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 5*npts] = x*x*z*(2*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 6*npts] = radial_eval_alpha*x*y*y*y*z; - basis_z_eval[ipt + 7*npts] = x*y*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 8*npts] = x*y*z*(2*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 9*npts] = x*z*z*(3*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 10*npts] = radial_eval_alpha*y*y*y*y*z; - basis_z_eval[ipt + 11*npts] = y*y*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 12*npts] = y*y*z*(2*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 13*npts] = y*z*z*(3*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 14*npts] = z*z*z*(4*radial_eval + radial_eval_alpha*z*z); + basis_z_eval[ipt + 0*npts] = x0*x61; + basis_z_eval[ipt + 1*npts] = x47; + basis_z_eval[ipt + 2*npts] = x29 + x46; + basis_z_eval[ipt + 3*npts] = x6*x61; + basis_z_eval[ipt + 4*npts] = y*(x37 + x49); + basis_z_eval[ipt + 5*npts] = x4*x62 + x41; + basis_z_eval[ipt + 6*npts] = x43; + basis_z_eval[ipt + 7*npts] = x*(x33 + x55); + basis_z_eval[ipt + 8*npts] = x63*x65; + basis_z_eval[ipt + 9*npts] = x*x66; + basis_z_eval[ipt + 10*npts] = x15*x61; + basis_z_eval[ipt + 11*npts] = x30 + x58; + basis_z_eval[ipt + 12*npts] = x5*x62 + x60; + basis_z_eval[ipt + 13*npts] = x66*y; + basis_z_eval[ipt + 14*npts] = radial_eval_alpha*x67 + x14*x19; + // Evaluate Laplacian of bfn - basis_lapl_eval[ipt + 0*npts] = x*x*(12*radial_eval + 11*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x + radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*x*x*z*z); - basis_lapl_eval[ipt + 1*npts] = x*y*(6*radial_eval + 
11*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x + radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*x*x*z*z); - basis_lapl_eval[ipt + 2*npts] = x*z*(6*radial_eval + 11*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x + radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*x*x*z*z); - basis_lapl_eval[ipt + 3*npts] = x*x*y*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z) + x*x*(2*radial_eval + 5*radial_eval_alpha*y*y + radial_eval_alpha_squared*y*y*y*y) + y*y*(2*radial_eval + 5*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x); - basis_lapl_eval[ipt + 4*npts] = y*z*(2*radial_eval + 11*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x + radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*x*x*z*z); - basis_lapl_eval[ipt + 5*npts] = x*x*z*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y) + x*x*(2*radial_eval + 5*radial_eval_alpha*z*z + radial_eval_alpha_squared*z*z*z*z) + z*z*(2*radial_eval + 5*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x); - basis_lapl_eval[ipt + 6*npts] = x*y*(6*radial_eval + 11*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*y*y*y*y + radial_eval_alpha_squared*y*y*z*z); - basis_lapl_eval[ipt + 7*npts] = x*z*(2*radial_eval + 11*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*y*y*y*y + radial_eval_alpha_squared*y*y*z*z); - basis_lapl_eval[ipt + 8*npts] = x*y*(2*radial_eval + 11*radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z + radial_eval_alpha_squared*y*y*z*z + radial_eval_alpha_squared*z*z*z*z); - basis_lapl_eval[ipt + 9*npts] = x*z*(6*radial_eval + 11*radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z + radial_eval_alpha_squared*y*y*z*z + radial_eval_alpha_squared*z*z*z*z); - basis_lapl_eval[ipt + 10*npts] = y*y*(12*radial_eval + 11*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*y*y*y*y + radial_eval_alpha_squared*y*y*z*z); - basis_lapl_eval[ipt + 11*npts] = y*z*(6*radial_eval + 11*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*y*y*y*y + radial_eval_alpha_squared*y*y*z*z); - basis_lapl_eval[ipt + 12*npts] = y*y*z*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x) + y*y*(2*radial_eval + 5*radial_eval_alpha*z*z + radial_eval_alpha_squared*z*z*z*z) + z*z*(2*radial_eval + 5*radial_eval_alpha*y*y + radial_eval_alpha_squared*y*y*y*y); - basis_lapl_eval[ipt + 13*npts] = y*z*(6*radial_eval + 11*radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z + radial_eval_alpha_squared*y*y*z*z + radial_eval_alpha_squared*z*z*z*z); - basis_lapl_eval[ipt + 14*npts] = z*z*(12*radial_eval + 11*radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z + radial_eval_alpha_squared*y*y*z*z + radial_eval_alpha_squared*z*z*z*z); + basis_lapl_eval[ipt + 0*npts] = x123 + x142 + x71; + basis_lapl_eval[ipt + 1*npts] = x155*y; + basis_lapl_eval[ipt + 2*npts] = x155*z; + basis_lapl_eval[ipt + 3*npts] = x127 + x144 + x6*x69 + x79; + basis_lapl_eval[ipt + 4*npts] = x25*(x129 + x145 + x4*x69 + x80); + basis_lapl_eval[ipt + 5*npts] = x130 + x146 + x69*x9 + x83; + basis_lapl_eval[ipt + 6*npts] = x*x156; + basis_lapl_eval[ipt + 7*npts] = x52*(x133 + x148 + x5*x69 + x88); + basis_lapl_eval[ipt + 8*npts] = x63*(x134 + x149 + x69*x8 + x90); + basis_lapl_eval[ipt + 9*npts] = x*x157; + basis_lapl_eval[ipt + 10*npts] = x136 + x152 + x93; + basis_lapl_eval[ipt + 11*npts] = x156*z; + basis_lapl_eval[ipt + 12*npts] = x138 + x153 + x16*x69 + x94; + 
basis_lapl_eval[ipt + 13*npts] = x157*y; + basis_lapl_eval[ipt + 14*npts] = x139 + x154 + x95; + @@ -206,36 +370,36 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = radial_eval*x*x*x*x; - ang_eval_1 = radial_eval*x*x*x*y; - ang_eval_2 = radial_eval*x*x*x*z; - ang_eval_3 = radial_eval*x*x*y*y; + ang_eval_0 = radial_eval*x0; + ang_eval_1 = x1*x2; + ang_eval_2 = x2*x3; + ang_eval_3 = radial_eval*x6; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = radial_eval*x*x*y*z; - ang_eval_1 = radial_eval*x*x*z*z; - ang_eval_2 = radial_eval*x*y*y*y; - ang_eval_3 = radial_eval*x*y*y*z; + ang_eval_0 = x4*x7; + ang_eval_1 = radial_eval*x9; + ang_eval_2 = x10*x11; + ang_eval_3 = x12*x5; basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; basis_eval[ipt + 6*npts] = ang_eval_2; basis_eval[ipt + 7*npts] = ang_eval_3; - ang_eval_0 = radial_eval*x*y*z*z; - ang_eval_1 = radial_eval*x*z*z*z; - ang_eval_2 = radial_eval*y*y*y*y; - ang_eval_3 = radial_eval*y*y*y*z; + ang_eval_0 = x13*x8; + ang_eval_1 = x10*x14; + ang_eval_2 = radial_eval*x15; + ang_eval_3 = x11*x3; basis_eval[ipt + 8*npts] = ang_eval_0; basis_eval[ipt + 9*npts] = ang_eval_1; basis_eval[ipt + 10*npts] = ang_eval_2; basis_eval[ipt + 11*npts] = ang_eval_3; - ang_eval_0 = radial_eval*y*y*z*z; - ang_eval_1 = radial_eval*y*z*z*z; - ang_eval_2 = radial_eval*z*z*z*z; + ang_eval_0 = radial_eval*x16; + ang_eval_1 = x1*x14; + ang_eval_2 = radial_eval*x17; basis_eval[ipt + 12*npts] = ang_eval_0; basis_eval[ipt + 13*npts] = ang_eval_1; basis_eval[ipt + 14*npts] = ang_eval_2; @@ -246,18 +410,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; - dang_eval_x_0 = x*x*x*(4*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = radial_eval_alpha*x*x*x*x*y; - dang_eval_z_0 = radial_eval_alpha*x*x*x*x*z; - dang_eval_x_1 = x*x*y*(3*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = x*x*x*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_1 = radial_eval_alpha*x*x*x*y*z; - dang_eval_x_2 = x*x*z*(3*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_2 = radial_eval_alpha*x*x*x*y*z; - dang_eval_z_2 = x*x*x*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_3 = x*y*y*(2*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_3 = x*x*y*(2*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_3 = radial_eval_alpha*x*x*y*y*z; + dang_eval_x_0 = radial_eval_alpha*x18 + x19*x2; + dang_eval_y_0 = x0*x45; + dang_eval_z_0 = x0*x61; + dang_eval_x_1 = x21*y; + dang_eval_y_1 = x24 + x46; + dang_eval_z_1 = x47; + dang_eval_x_2 = x21*z; + dang_eval_y_2 = x47; + dang_eval_z_2 = x29 + x46; + dang_eval_x_3 = x22*x5 + x24; + dang_eval_y_3 = x32 + x4*x48; + dang_eval_z_3 = x6*x61; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; @@ -271,18 +435,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = x*y*z*(2*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = x*x*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = x*x*y*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_1 = x*z*z*(2*radial_eval + 
radial_eval_alpha*x*x); - dang_eval_y_1 = radial_eval_alpha*x*x*y*z*z; - dang_eval_z_1 = x*x*z*(2*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = y*y*y*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_2 = x*y*y*(3*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_2 = radial_eval_alpha*x*y*y*y*z; - dang_eval_x_3 = y*y*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_3 = x*y*z*(2*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_3 = x*y*y*(radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x25*x27; + dang_eval_y_0 = x50*z; + dang_eval_z_0 = y*(x37 + x49); + dang_eval_x_1 = x22*x8 + x29; + dang_eval_y_1 = x45*x9; + dang_eval_z_1 = x4*x62 + x41; + dang_eval_x_2 = x30 + x32; + dang_eval_y_2 = x*x51; + dang_eval_z_2 = x43; + dang_eval_x_3 = x35*z; + dang_eval_y_3 = x52*x54; + dang_eval_z_3 = x*(x33 + x55); basis_x_eval[ipt + 4*npts] = dang_eval_x_0; basis_y_eval[ipt + 4*npts] = dang_eval_y_0; basis_z_eval[ipt + 4*npts] = dang_eval_z_0; @@ -296,18 +460,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 7*npts] = dang_eval_y_3; basis_z_eval[ipt + 7*npts] = dang_eval_z_3; - dang_eval_x_0 = y*z*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = x*z*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = x*y*z*(2*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_1 = z*z*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = radial_eval_alpha*x*y*z*z*z; - dang_eval_z_1 = x*z*z*(3*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = radial_eval_alpha*x*y*y*y*y; - dang_eval_y_2 = y*y*y*(4*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_2 = radial_eval_alpha*y*y*y*y*z; - dang_eval_x_3 = radial_eval_alpha*x*y*y*y*z; - dang_eval_y_3 = y*y*z*(3*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_3 = y*y*y*(radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x38*y; + dang_eval_y_0 = x*(x36 + x55); + dang_eval_z_0 = x63*x65; + dang_eval_x_1 = x39 + x41; + dang_eval_y_1 = x44; + dang_eval_z_1 = x*x66; + dang_eval_x_2 = x15*x42; + dang_eval_y_2 = radial_eval_alpha*x56 + x11*x19; + dang_eval_z_2 = x15*x61; + dang_eval_x_3 = x43; + dang_eval_y_3 = x51*z; + dang_eval_z_3 = x30 + x58; basis_x_eval[ipt + 8*npts] = dang_eval_x_0; basis_y_eval[ipt + 8*npts] = dang_eval_y_0; basis_z_eval[ipt + 8*npts] = dang_eval_z_0; @@ -321,15 +485,15 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 11*npts] = dang_eval_y_3; basis_z_eval[ipt + 11*npts] = dang_eval_z_3; - dang_eval_x_0 = radial_eval_alpha*x*y*y*z*z; - dang_eval_y_0 = y*z*z*(2*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = y*y*z*(2*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_1 = radial_eval_alpha*x*y*z*z*z; - dang_eval_y_1 = z*z*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_1 = y*z*z*(3*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = radial_eval_alpha*x*z*z*z*z; - dang_eval_y_2 = radial_eval_alpha*y*z*z*z*z; - dang_eval_z_2 = z*z*z*(4*radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x16*x42; + dang_eval_y_0 = x48*x8 + x58; + dang_eval_z_0 = x5*x62 + x60; + dang_eval_x_1 = x44; + dang_eval_y_1 = x39 + x60; + dang_eval_z_1 = x66*y; + dang_eval_x_2 = x17*x42; + dang_eval_y_2 = x17*x45; + dang_eval_z_2 = radial_eval_alpha*x67 + x14*x19; basis_x_eval[ipt + 12*npts] = dang_eval_x_0; basis_y_eval[ipt + 12*npts] = dang_eval_y_0; basis_z_eval[ipt + 12*npts] = dang_eval_z_0; diff --git 
a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l0.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l0.hpp index 0788c8ce..2dd909dc 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l0.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l0.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -64,7 +68,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_eval = task->bf + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -93,7 +96,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel } - + // Common Subexpressions + // Evaluate basis function basis_eval[ipt + 0*npts] = radial_eval; @@ -105,6 +109,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel + + #if 0 // Evaluate the angular part of bfn diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l0_gradient.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l0_gradient.hpp index 4dd7dac5..2b74a4f0 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l0_gradient.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l0_gradient.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -67,7 +71,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -99,7 +102,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; - + // Common Subexpressions + // Evaluate basis function basis_eval[ipt + 0*npts] = radial_eval; @@ -119,6 +123,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel + + #if 0 // Evaluate the angular part of bfn diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l0_hessian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l0_hessian.hpp index 6dff65ca..bb156174 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l0_hessian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l0_hessian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_spherical_hessian_0( +__global__ __launch_bounds__(256,2) void collocation_device_shell_to_task_kernel_spherical_hessian_0( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[8][detail::shell_nprim_max + 1]; + __shared__ double coeff[8][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; @@ -108,7 +111,12 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = radial_eval_alpha_squared*(x*x); + const auto x1 = radial_eval_alpha_squared*x; + const auto x2 = radial_eval_alpha_squared*(y*y); + const auto x3 = radial_eval_alpha_squared*(z*z); + // Evaluate basis function basis_eval[ipt + 0*npts] = radial_eval; @@ -125,22 +133,24 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_z_eval[ipt + 0*npts] = radial_eval_alpha*z; // Evaluate second derivative of bfn wrt xx - basis_xx_eval[ipt + 0*npts] = radial_eval_alpha + radial_eval_alpha_squared*x*x; + 
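The common-subexpression form introduced here is algebraically identical to the removed expressions: for an s-type contracted Gaussian R(r) = sum_i c_i exp(-alpha_i r^2), the diagonal Hessian entry is d^2R/dx^2 = sum_i c_i (-2 alpha_i + 4 alpha_i^2 x^2) exp(-alpha_i r^2), which is why the accumulated radial sums are scaled by -2 (radial_eval_alpha) and 4 (radial_eval_alpha_squared) before x0 = radial_eval_alpha_squared*(x*x) is formed. A minimal host-side reference sketch, not part of the generated kernel (the names c, a and nprim are illustrative):

#include <cmath>

// Reference d^2/dx^2 of an s-type contracted Gaussian; equals the kernel's
// radial_eval_alpha + radial_eval_alpha_squared*x*x at the same point.
double s_shell_d2dx2(const double* c, const double* a, int nprim,
                     double x, double y, double z) {
  const double rsq = x*x + y*y + z*z;
  double d2 = 0.0;
  for (int i = 0; i < nprim; ++i) {
    const double e = c[i] * std::exp(-a[i] * rsq);
    d2 += (-2.0*a[i] + 4.0*a[i]*a[i]*x*x) * e;  // (-2a + 4 a^2 x^2) * e
  }
  return d2;
}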
basis_xx_eval[ipt + 0*npts] = radial_eval_alpha + x0; // Evaluate second derivative of bfn wrt xy - basis_xy_eval[ipt + 0*npts] = radial_eval_alpha_squared*x*y; + basis_xy_eval[ipt + 0*npts] = x1*y; // Evaluate second derivative of bfn wrt xz - basis_xz_eval[ipt + 0*npts] = radial_eval_alpha_squared*x*z; + basis_xz_eval[ipt + 0*npts] = x1*z; // Evaluate second derivative of bfn wrt yy - basis_yy_eval[ipt + 0*npts] = radial_eval_alpha + radial_eval_alpha_squared*y*y; + basis_yy_eval[ipt + 0*npts] = radial_eval_alpha + x2; // Evaluate second derivative of bfn wrt yz basis_yz_eval[ipt + 0*npts] = radial_eval_alpha_squared*y*z; // Evaluate second derivative of bfn wrt zz - basis_zz_eval[ipt + 0*npts] = radial_eval_alpha + radial_eval_alpha_squared*z*z; + basis_zz_eval[ipt + 0*npts] = radial_eval_alpha + x3; + + diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l0_lapgrad.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l0_lapgrad.hpp new file mode 100644 index 00000000..a6f5542c --- /dev/null +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l0_lapgrad.hpp @@ -0,0 +1,208 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#pragma once +#include "collocation_device_constants.hpp" +#include "device/xc_device_task.hpp" +#include "device_specific/cuda_device_constants.hpp" +#include "device/common/shell_to_task.hpp" +#include + +namespace GauXC { + + +__global__ __launch_bounds__(256,2) void collocation_device_shell_to_task_kernel_spherical_lapgrad_0( + uint32_t nshell, + ShellToTaskDevice* __restrict__ shell_to_task, + XCDeviceTask* __restrict__ device_tasks +) { + + + __shared__ double alpha[8][detail::shell_nprim_max + 1]; + __shared__ double coeff[8][detail::shell_nprim_max + 1]; + double* my_alpha = alpha[threadIdx.x/32]; + double* my_coeff = coeff[threadIdx.x/32]; + + for( auto ish = blockIdx.z; ish < nshell; ish += gridDim.z ) { + const uint32_t ntasks = shell_to_task[ish].ntask; + const auto shell = shell_to_task[ish].shell_device; + const auto task_idx = shell_to_task[ish].task_idx_device; + const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device; + + + // Load Shell Data into registers / SM + const uint32_t nprim = shell->nprim(); + const double3 O = *reinterpret_cast(shell->O_data()); + + const int global_warp_id = (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size; + const int nwarp_global = max((blockDim.x*gridDim.x) / cuda::warp_size,1); + + // Read in coeffs/exps into SM on first warp + { + auto* coeff_gm = shell->coeff_data(); + auto* alpha_gm = shell->alpha_data(); + static_assert( detail::shell_nprim_max == cuda::warp_size ); + const int warp_rank = threadIdx.x % cuda::warp_size; + my_alpha[warp_rank] = alpha_gm[warp_rank]; + my_coeff[warp_rank] = coeff_gm[warp_rank]; + } + + // Loop over tasks assigned to shells + // Place each task on a different warp + schedule across blocks + for( int itask = global_warp_id; itask < ntasks; itask += nwarp_global ) { + + const auto* task = device_tasks + task_idx[itask]; + const auto* __restrict__ points_x = task->points_x; + const auto* 
__restrict__ points_y = task->points_y; + const auto* __restrict__ points_z = task->points_z; + const uint32_t npts = task->npts; + const size_t shoff = task_shell_offs[itask] * npts; + + auto* __restrict__ basis_eval = task->bf + shoff; + auto* __restrict__ basis_x_eval = task->dbfx + shoff; + auto* __restrict__ basis_y_eval = task->dbfy + shoff; + auto* __restrict__ basis_z_eval = task->dbfz + shoff; + auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; + auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; + auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; + auto* __restrict__ basis_yy_eval = task->d2bfyy + shoff; + auto* __restrict__ basis_yz_eval = task->d2bfyz + shoff; + auto* __restrict__ basis_zz_eval = task->d2bfzz + shoff; + auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; + auto* __restrict__ basis_lapl_x_eval = task->d3bflapl_x + shoff; + auto* __restrict__ basis_lapl_y_eval = task->d3bflapl_y + shoff; + auto* __restrict__ basis_lapl_z_eval = task->d3bflapl_z + shoff; + + // Loop over points in task + // Assign each point to separate thread within the warp + #pragma unroll 1 + for( int ipt = threadIdx.x % cuda::warp_size; ipt < npts; ipt += cuda::warp_size ) { + //const double3 point = points[ipt]; + double3 point; + point.x = points_x[ipt]; + point.y = points_y[ipt]; + point.z = points_z[ipt]; + + + const auto x = point.x - O.x; + const auto y = point.y - O.y; + const auto z = point.z - O.z; + const auto rsq = x*x + y*y + z*z; + + // Evaluate radial part of bfn + double radial_eval = 0.; + double radial_eval_alpha = 0.; + double radial_eval_alpha_squared = 0.; + double radial_eval_alpha_cubed = 0.; + + #pragma unroll 1 + for( uint32_t i = 0; i < nprim; ++i ) { + const auto a = my_alpha[i]; + const auto e = my_coeff[i] * std::exp( - a * rsq ); + + radial_eval += e; + radial_eval_alpha += a * e; + radial_eval_alpha_squared += a * a * e; + radial_eval_alpha_cubed += a * a * a * e; + } + + radial_eval_alpha *= -2; + radial_eval_alpha_squared *= 4; + radial_eval_alpha_cubed *= -8; + + // Common Subexpressions + const auto x0 = x*x; + const auto x1 = radial_eval_alpha_squared*x0; + const auto x2 = radial_eval_alpha_squared*x; + const auto x3 = y*y; + const auto x4 = radial_eval_alpha_squared*x3; + const auto x5 = radial_eval_alpha_squared*y; + const auto x6 = z*z; + const auto x7 = radial_eval_alpha_squared*x6; + const auto x8 = radial_eval_alpha_cubed*x; + const auto x9 = radial_eval_alpha_cubed*y; + const auto x10 = radial_eval_alpha_cubed*z; + + + // Evaluate basis function + basis_eval[ipt + 0*npts] = radial_eval; + + + + // Evaluate first derivative of bfn wrt x + basis_x_eval[ipt + 0*npts] = radial_eval_alpha*x; + + // Evaluate first derivative of bfn wrt y + basis_y_eval[ipt + 0*npts] = radial_eval_alpha*y; + + // Evaluate first derivative of bfn wrt z + basis_z_eval[ipt + 0*npts] = radial_eval_alpha*z; + + // Evaluate second derivative of bfn wrt xx + basis_xx_eval[ipt + 0*npts] = radial_eval_alpha + x1; + + // Evaluate second derivative of bfn wrt xy + basis_xy_eval[ipt + 0*npts] = x2*y; + + // Evaluate second derivative of bfn wrt xz + basis_xz_eval[ipt + 0*npts] = x2*z; + + // Evaluate second derivative of bfn wrt yy + basis_yy_eval[ipt + 0*npts] = radial_eval_alpha + x4; + + // Evaluate second derivative of bfn wrt yz + basis_yz_eval[ipt + 0*npts] = x5*z; + + // Evaluate second derivative of bfn wrt zz + basis_zz_eval[ipt + 0*npts] = radial_eval_alpha + x7; + + // Evaluate Laplacian of bfn + basis_lapl_eval[ipt + 0*npts] = 
3.0*radial_eval_alpha + x1 + x4 + x7; + + // Evaluate Laplacian gradient of bfn (dx) + basis_lapl_x_eval[ipt + 0*npts] = radial_eval_alpha_cubed*(x*x*x) + 5.0*x2 + x3*x8 + x6*x8; + // Evaluate Laplacian gradient of bfn (dy) + basis_lapl_y_eval[ipt + 0*npts] = radial_eval_alpha_cubed*(y*y*y) + x0*x9 + 5.0*x5 + x6*x9; + // Evaluate Laplacian gradient of bfn (dz) + basis_lapl_z_eval[ipt + 0*npts] = radial_eval_alpha_cubed*(z*z*z) + 5.0*radial_eval_alpha_squared*z + x0*x10 + x10*x3; + + + + +#if 0 + // Evaluate the angular part of bfn + + + + double ang_eval_0; + + + ang_eval_0 = radial_eval; + basis_eval[ipt + 0*npts] = ang_eval_0; + + + double dang_eval_x_0, dang_eval_y_0, dang_eval_z_0; + + dang_eval_x_0 = radial_eval_alpha*x; + dang_eval_y_0 = radial_eval_alpha*y; + dang_eval_z_0 = radial_eval_alpha*z; + basis_x_eval[ipt + 0*npts] = dang_eval_x_0; + basis_y_eval[ipt + 0*npts] = dang_eval_y_0; + basis_z_eval[ipt + 0*npts] = dang_eval_z_0; + +#endif + } // Loop over points within task + } // Loop over tasks + + } // Loop over shells +} // end kernel + +} // namespace GauXC diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l0_laplacian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l0_laplacian.hpp index de0353b5..fad0a511 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l0_laplacian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l0_laplacian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_spherical_laplacian_0( +__global__ __launch_bounds__(256,2) void collocation_device_shell_to_task_kernel_spherical_laplacian_0( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[8][detail::shell_nprim_max + 1]; + __shared__ double coeff[8][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; // Loop over points in task @@ -103,7 +106,12 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = radial_eval_alpha_squared*(x*x); + const auto x1 = radial_eval_alpha_squared*x; + const auto x2 = radial_eval_alpha_squared*(y*y); + const auto x3 = radial_eval_alpha_squared*(z*z); + // Evaluate basis function basis_eval[ipt + 0*npts] = radial_eval; @@ -119,8 +127,10 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel // Evaluate first derivative of bfn wrt z basis_z_eval[ipt + 0*npts] = radial_eval_alpha*z; + // Evaluate Laplacian of bfn - basis_lapl_eval[ipt + 0*npts] = 3*radial_eval_alpha + radial_eval_alpha_squared*x*x + radial_eval_alpha_squared*y*y + radial_eval_alpha_squared*z*z; + basis_lapl_eval[ipt + 0*npts] = 3.0*radial_eval_alpha + x0 + x2 + x3; + diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l1.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l1.hpp index 709c0298..9e867997 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l1.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l1.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -64,7 +68,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_eval = task->bf + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -93,7 +96,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel } - + // Common Subexpressions + // Evaluate basis function basis_eval[ipt + 0*npts] = radial_eval*y; @@ -107,6 +111,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel + + #if 0 // Evaluate the angular part of bfn diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l1_gradient.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l1_gradient.hpp index 0fe5eb9d..bed3c691 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l1_gradient.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l1_gradient.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_spherical_gradient_1( +__global__ __launch_bounds__(256,2) void collocation_device_shell_to_task_kernel_spherical_gradient_1( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[8][detail::shell_nprim_max + 1]; + __shared__ double coeff[8][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -67,7 +71,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -99,7 +102,12 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; - + // Common Subexpressions + const auto x0 = radial_eval_alpha*x; + const auto x1 = x0*y; + const auto x2 = x0*z; + const auto x3 = radial_eval_alpha*y*z; + // Evaluate basis function basis_eval[ipt + 0*npts] = radial_eval*y; @@ -109,19 +117,21 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = radial_eval_alpha*x*y; - basis_x_eval[ipt + 1*npts] = radial_eval_alpha*x*z; - basis_x_eval[ipt + 2*npts] = radial_eval + radial_eval_alpha*x*x; + basis_x_eval[ipt + 0*npts] = x1; + basis_x_eval[ipt + 1*npts] = x2; + basis_x_eval[ipt + 2*npts] = radial_eval + radial_eval_alpha*(x*x); // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = radial_eval + radial_eval_alpha*y*y; - basis_y_eval[ipt 
+ 1*npts] = radial_eval_alpha*y*z; - basis_y_eval[ipt + 2*npts] = radial_eval_alpha*x*y; + basis_y_eval[ipt + 0*npts] = radial_eval + radial_eval_alpha*(y*y); + basis_y_eval[ipt + 1*npts] = x3; + basis_y_eval[ipt + 2*npts] = x1; // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = radial_eval_alpha*y*z; - basis_z_eval[ipt + 1*npts] = radial_eval + radial_eval_alpha*z*z; - basis_z_eval[ipt + 2*npts] = radial_eval_alpha*x*z; + basis_z_eval[ipt + 0*npts] = x3; + basis_z_eval[ipt + 1*npts] = radial_eval + radial_eval_alpha*(z*z); + basis_z_eval[ipt + 2*npts] = x2; + + @@ -149,15 +159,15 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_1, dang_eval_y_1, dang_eval_z_1; double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; - dang_eval_x_0 = radial_eval_alpha*x*y; - dang_eval_y_0 = radial_eval + radial_eval_alpha*y*y; - dang_eval_z_0 = radial_eval_alpha*y*z; - dang_eval_x_1 = radial_eval_alpha*x*z; - dang_eval_y_1 = radial_eval_alpha*y*z; - dang_eval_z_1 = radial_eval + radial_eval_alpha*z*z; - dang_eval_x_2 = radial_eval + radial_eval_alpha*x*x; - dang_eval_y_2 = radial_eval_alpha*x*y; - dang_eval_z_2 = radial_eval_alpha*x*z; + dang_eval_x_0 = x1; + dang_eval_y_0 = radial_eval + radial_eval_alpha*(y*y); + dang_eval_z_0 = x3; + dang_eval_x_1 = x2; + dang_eval_y_1 = x3; + dang_eval_z_1 = radial_eval + radial_eval_alpha*(z*z); + dang_eval_x_2 = radial_eval + radial_eval_alpha*(x*x); + dang_eval_y_2 = x1; + dang_eval_z_2 = x2; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l1_hessian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l1_hessian.hpp index e70d24f8..273f5df5 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l1_hessian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l1_hessian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_spherical_hessian_1( +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_spherical_hessian_1( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; @@ -108,7 +111,31 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = radial_eval_alpha*x; + const auto x1 = x0*y; + const auto x2 = x0*z; + const auto x3 = x*x; + const auto x4 = y*y; + const auto x5 = y*z; + const auto x6 = radial_eval_alpha*x5; + const auto x7 = z*z; + const auto x8 = radial_eval_alpha_squared*x3; + const auto x9 = radial_eval_alpha + x8; + const auto x10 = x9*y; + const auto x11 = x9*z; + const auto x12 = 3.0*radial_eval_alpha; + const auto x13 = radial_eval_alpha_squared*x4; + const auto x14 = radial_eval_alpha + x13; + const auto x15 = x*x14; + const auto x16 = radial_eval_alpha_squared*x*x5; + const auto x17 = radial_eval_alpha_squared*x7; + const auto x18 = radial_eval_alpha + x17; + const auto x19 = x*x18; + const auto x20 = x14*z; + const auto x21 = x18*y; + const auto x22 = 5.0*radial_eval_alpha + x13 + x17 + x8; + // Evaluate basis function basis_eval[ipt + 0*npts] = radial_eval*y; @@ -118,49 +145,51 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = radial_eval_alpha*x*y; - basis_x_eval[ipt + 1*npts] = radial_eval_alpha*x*z; - basis_x_eval[ipt + 2*npts] = radial_eval + radial_eval_alpha*x*x; + basis_x_eval[ipt + 0*npts] = x1; + basis_x_eval[ipt + 1*npts] = x2; + basis_x_eval[ipt + 2*npts] = radial_eval + radial_eval_alpha*x3; // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = radial_eval + radial_eval_alpha*y*y; - basis_y_eval[ipt + 1*npts] = radial_eval_alpha*y*z; - basis_y_eval[ipt + 2*npts] = radial_eval_alpha*x*y; + basis_y_eval[ipt + 0*npts] = radial_eval + radial_eval_alpha*x4; + basis_y_eval[ipt + 1*npts] = x6; + basis_y_eval[ipt + 2*npts] = x1; // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = radial_eval_alpha*y*z; - basis_z_eval[ipt + 1*npts] = radial_eval + radial_eval_alpha*z*z; - basis_z_eval[ipt + 2*npts] = radial_eval_alpha*x*z; + basis_z_eval[ipt + 0*npts] = x6; + basis_z_eval[ipt + 1*npts] = radial_eval + radial_eval_alpha*x7; + basis_z_eval[ipt + 2*npts] = x2; // Evaluate second derivative of bfn wrt xx - basis_xx_eval[ipt + 0*npts] = y*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 1*npts] = z*(radial_eval_alpha 
+ radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 2*npts] = x*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xx_eval[ipt + 0*npts] = x10; + basis_xx_eval[ipt + 1*npts] = x11; + basis_xx_eval[ipt + 2*npts] = x*(x12 + x8); // Evaluate second derivative of bfn wrt xy - basis_xy_eval[ipt + 0*npts] = x*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_xy_eval[ipt + 1*npts] = radial_eval_alpha_squared*x*y*z; - basis_xy_eval[ipt + 2*npts] = y*(radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xy_eval[ipt + 0*npts] = x15; + basis_xy_eval[ipt + 1*npts] = x16; + basis_xy_eval[ipt + 2*npts] = x10; // Evaluate second derivative of bfn wrt xz - basis_xz_eval[ipt + 0*npts] = radial_eval_alpha_squared*x*y*z; - basis_xz_eval[ipt + 1*npts] = x*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_xz_eval[ipt + 2*npts] = z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xz_eval[ipt + 0*npts] = x16; + basis_xz_eval[ipt + 1*npts] = x19; + basis_xz_eval[ipt + 2*npts] = x11; // Evaluate second derivative of bfn wrt yy - basis_yy_eval[ipt + 0*npts] = y*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 1*npts] = z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 2*npts] = x*(radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_yy_eval[ipt + 0*npts] = y*(x12 + x13); + basis_yy_eval[ipt + 1*npts] = x20; + basis_yy_eval[ipt + 2*npts] = x15; // Evaluate second derivative of bfn wrt yz - basis_yz_eval[ipt + 0*npts] = z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yz_eval[ipt + 1*npts] = y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_yz_eval[ipt + 2*npts] = radial_eval_alpha_squared*x*y*z; + basis_yz_eval[ipt + 0*npts] = x20; + basis_yz_eval[ipt + 1*npts] = x21; + basis_yz_eval[ipt + 2*npts] = x16; // Evaluate second derivative of bfn wrt zz - basis_zz_eval[ipt + 0*npts] = y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 1*npts] = z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 2*npts] = x*(radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_zz_eval[ipt + 0*npts] = x21; + basis_zz_eval[ipt + 1*npts] = z*(x12 + x17); + basis_zz_eval[ipt + 2*npts] = x19; + + @@ -187,15 +216,15 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_1, dang_eval_y_1, dang_eval_z_1; double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; - dang_eval_x_0 = radial_eval_alpha*x*y; - dang_eval_y_0 = radial_eval + radial_eval_alpha*y*y; - dang_eval_z_0 = radial_eval_alpha*y*z; - dang_eval_x_1 = radial_eval_alpha*x*z; - dang_eval_y_1 = radial_eval_alpha*y*z; - dang_eval_z_1 = radial_eval + radial_eval_alpha*z*z; - dang_eval_x_2 = radial_eval + radial_eval_alpha*x*x; - dang_eval_y_2 = radial_eval_alpha*x*y; - dang_eval_z_2 = radial_eval_alpha*x*z; + dang_eval_x_0 = x1; + dang_eval_y_0 = radial_eval + radial_eval_alpha*x4; + dang_eval_z_0 = x6; + dang_eval_x_1 = x2; + dang_eval_y_1 = x6; + dang_eval_z_1 = radial_eval + radial_eval_alpha*x7; + dang_eval_x_2 = radial_eval + radial_eval_alpha*x3; + dang_eval_y_2 = x1; + dang_eval_z_2 = x2; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l1_lapgrad.hpp 
b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l1_lapgrad.hpp new file mode 100644 index 00000000..e0983fed --- /dev/null +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l1_lapgrad.hpp @@ -0,0 +1,285 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#pragma once +#include "collocation_device_constants.hpp" +#include "device/xc_device_task.hpp" +#include "device_specific/cuda_device_constants.hpp" +#include "device/common/shell_to_task.hpp" +#include + +namespace GauXC { + + +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_spherical_lapgrad_1( + uint32_t nshell, + ShellToTaskDevice* __restrict__ shell_to_task, + XCDeviceTask* __restrict__ device_tasks +) { + + + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; + double* my_alpha = alpha[threadIdx.x/32]; + double* my_coeff = coeff[threadIdx.x/32]; + + for( auto ish = blockIdx.z; ish < nshell; ish += gridDim.z ) { + const uint32_t ntasks = shell_to_task[ish].ntask; + const auto shell = shell_to_task[ish].shell_device; + const auto task_idx = shell_to_task[ish].task_idx_device; + const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device; + + + // Load Shell Data into registers / SM + const uint32_t nprim = shell->nprim(); + const double3 O = *reinterpret_cast(shell->O_data()); + + const int global_warp_id = (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size; + const int nwarp_global = max((blockDim.x*gridDim.x) / cuda::warp_size,1); + + // Read in coeffs/exps into SM on first warp + { + auto* coeff_gm = shell->coeff_data(); + auto* alpha_gm = shell->alpha_data(); + static_assert( detail::shell_nprim_max == cuda::warp_size ); + const int warp_rank = threadIdx.x % cuda::warp_size; + my_alpha[warp_rank] = alpha_gm[warp_rank]; + my_coeff[warp_rank] = coeff_gm[warp_rank]; + } + + // Loop over tasks assigned to shells + // Place each task on a different warp + schedule across blocks + for( int itask = global_warp_id; itask < ntasks; itask += nwarp_global ) { + + const auto* task = device_tasks + task_idx[itask]; + const auto* __restrict__ points_x = task->points_x; + const auto* __restrict__ points_y = task->points_y; + const auto* __restrict__ points_z = task->points_z; + const uint32_t npts = task->npts; + const size_t shoff = task_shell_offs[itask] * npts; + + auto* __restrict__ basis_eval = task->bf + shoff; + auto* __restrict__ basis_x_eval = task->dbfx + shoff; + auto* __restrict__ basis_y_eval = task->dbfy + shoff; + auto* __restrict__ basis_z_eval = task->dbfz + shoff; + auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; + auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; + auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; + auto* __restrict__ basis_yy_eval = task->d2bfyy + shoff; + auto* __restrict__ basis_yz_eval = task->d2bfyz + shoff; + auto* __restrict__ basis_zz_eval = task->d2bfzz + shoff; + auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; + auto* __restrict__ basis_lapl_x_eval = task->d3bflapl_x + shoff; + auto* __restrict__ basis_lapl_y_eval = task->d3bflapl_y + shoff; + 
auto* __restrict__ basis_lapl_z_eval = task->d3bflapl_z + shoff; + + // Loop over points in task + // Assign each point to separate thread within the warp + #pragma unroll 1 + for( int ipt = threadIdx.x % cuda::warp_size; ipt < npts; ipt += cuda::warp_size ) { + //const double3 point = points[ipt]; + double3 point; + point.x = points_x[ipt]; + point.y = points_y[ipt]; + point.z = points_z[ipt]; + + + const auto x = point.x - O.x; + const auto y = point.y - O.y; + const auto z = point.z - O.z; + const auto rsq = x*x + y*y + z*z; + + // Evaluate radial part of bfn + double radial_eval = 0.; + double radial_eval_alpha = 0.; + double radial_eval_alpha_squared = 0.; + double radial_eval_alpha_cubed = 0.; + + #pragma unroll 1 + for( uint32_t i = 0; i < nprim; ++i ) { + const auto a = my_alpha[i]; + const auto e = my_coeff[i] * std::exp( - a * rsq ); + + radial_eval += e; + radial_eval_alpha += a * e; + radial_eval_alpha_squared += a * a * e; + radial_eval_alpha_cubed += a * a * a * e; + } + + radial_eval_alpha *= -2; + radial_eval_alpha_squared *= 4; + radial_eval_alpha_cubed *= -8; + + // Common Subexpressions + const auto x0 = radial_eval_alpha*x; + const auto x1 = x0*y; + const auto x2 = x0*z; + const auto x3 = x*x; + const auto x4 = x3; + const auto x5 = y*y; + const auto x6 = x5; + const auto x7 = y*z; + const auto x8 = radial_eval_alpha*x7; + const auto x9 = z*z; + const auto x10 = x9; + const auto x11 = radial_eval_alpha_squared*x4; + const auto x12 = radial_eval_alpha + x11; + const auto x13 = x12*y; + const auto x14 = x12*z; + const auto x15 = 3.0*radial_eval_alpha; + const auto x16 = radial_eval_alpha_squared*x6; + const auto x17 = radial_eval_alpha + x16; + const auto x18 = x*x17; + const auto x19 = radial_eval_alpha_squared*x*x7; + const auto x20 = radial_eval_alpha_squared*x10; + const auto x21 = radial_eval_alpha + x20; + const auto x22 = x*x21; + const auto x23 = x17*z; + const auto x24 = x21*y; + const auto x25 = 5.0*radial_eval_alpha; + const auto x26 = x16 + x20 + x25; + const auto x27 = x11 + x26; + const auto x28 = 5.0*radial_eval_alpha_squared; + const auto x29 = radial_eval_alpha_cubed*(x*x*x); + const auto x30 = radial_eval_alpha_cubed*x6 + radial_eval_alpha_squared; + const auto x31 = radial_eval_alpha_cubed*x10 + radial_eval_alpha_squared; + const auto x32 = x*x28 + x*x30 + x*x31 + x29; + const auto x33 = 3.0*radial_eval_alpha_squared; + const auto x34 = radial_eval_alpha_cubed*(y*y*y); + const auto x35 = radial_eval_alpha_cubed*x4 + radial_eval_alpha_squared; + const auto x36 = x11 + x25; + const auto x37 = x28*y + x31*y + x34 + x35*y; + const auto x38 = radial_eval_alpha_cubed*(z*z*z); + const auto x39 = x28*z + x30*z + x35*z + x38; + + + // Evaluate basis function + basis_eval[ipt + 0*npts] = radial_eval*y; + basis_eval[ipt + 1*npts] = radial_eval*z; + basis_eval[ipt + 2*npts] = radial_eval*x; + + + + // Evaluate first derivative of bfn wrt x + basis_x_eval[ipt + 0*npts] = x1; + basis_x_eval[ipt + 1*npts] = x2; + basis_x_eval[ipt + 2*npts] = radial_eval + radial_eval_alpha*x4; + + // Evaluate first derivative of bfn wrt y + basis_y_eval[ipt + 0*npts] = radial_eval + radial_eval_alpha*x6; + basis_y_eval[ipt + 1*npts] = x8; + basis_y_eval[ipt + 2*npts] = x1; + + // Evaluate first derivative of bfn wrt z + basis_z_eval[ipt + 0*npts] = x8; + basis_z_eval[ipt + 1*npts] = radial_eval + radial_eval_alpha*x10; + basis_z_eval[ipt + 2*npts] = x2; + + // Evaluate second derivative of bfn wrt xx + basis_xx_eval[ipt + 0*npts] = x13; + basis_xx_eval[ipt + 1*npts] = x14; + 
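The subexpressions x26 through x39 defined above feed the Laplacian and Laplacian-gradient stores below. For the radial factor alone, d/dx of the Laplacian of R(r) = sum_i c_i exp(-alpha_i r^2) is sum_i c_i (20 alpha_i^2 x - 8 alpha_i^3 x r^2) exp(-alpha_i r^2), i.e. 5*radial_eval_alpha_squared*x + radial_eval_alpha_cubed*x*rsq with the -8 scaling applied to radial_eval_alpha_cubed above; the l = 1 kernels then add the product-rule terms coming from the x, y and z angular prefactors. A minimal host-side sketch of the radial part only, not part of the generated code (c, a and nprim are illustrative names):

#include <cmath>

// d/dx of the Laplacian of the radial factor R(r); this is the s-shell
// basis_lapl_x value, and the p-shell values add product-rule terms to it.
double s_shell_dlapl_dx(const double* c, const double* a, int nprim,
                        double x, double y, double z) {
  const double rsq = x*x + y*y + z*z;
  double v = 0.0;
  for (int i = 0; i < nprim; ++i) {
    const double e = c[i] * std::exp(-a[i] * rsq);
    v += (20.0*a[i]*a[i]*x - 8.0*a[i]*a[i]*a[i]*x*rsq) * e;  // (20 a^2 - 8 a^3 r^2) x e
  }
  return v;
}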
basis_xx_eval[ipt + 2*npts] = x*(x11 + x15); + + // Evaluate second derivative of bfn wrt xy + basis_xy_eval[ipt + 0*npts] = x18; + basis_xy_eval[ipt + 1*npts] = x19; + basis_xy_eval[ipt + 2*npts] = x13; + + // Evaluate second derivative of bfn wrt xz + basis_xz_eval[ipt + 0*npts] = x19; + basis_xz_eval[ipt + 1*npts] = x22; + basis_xz_eval[ipt + 2*npts] = x14; + + // Evaluate second derivative of bfn wrt yy + basis_yy_eval[ipt + 0*npts] = y*(x15 + x16); + basis_yy_eval[ipt + 1*npts] = x23; + basis_yy_eval[ipt + 2*npts] = x18; + + // Evaluate second derivative of bfn wrt yz + basis_yz_eval[ipt + 0*npts] = x23; + basis_yz_eval[ipt + 1*npts] = x24; + basis_yz_eval[ipt + 2*npts] = x19; + + // Evaluate second derivative of bfn wrt zz + basis_zz_eval[ipt + 0*npts] = x24; + basis_zz_eval[ipt + 1*npts] = z*(x15 + x20); + basis_zz_eval[ipt + 2*npts] = x22; + + // Evaluate Laplacian of bfn + basis_lapl_eval[ipt + 0*npts] = x27*y; + basis_lapl_eval[ipt + 1*npts] = x27*z; + basis_lapl_eval[ipt + 2*npts] = x*x27; + + // Evaluate Laplacian gradient of bfn (dx) + basis_lapl_x_eval[ipt + 0*npts] = x32*y; + basis_lapl_x_eval[ipt + 1*npts] = x32*z; + basis_lapl_x_eval[ipt + 2*npts] = x*(x*x33 + x29) + x26 + x3*x30 + x3*x31 + x33*x4; + // Evaluate Laplacian gradient of bfn (dy) + basis_lapl_y_eval[ipt + 0*npts] = x20 + x31*x5 + x33*x6 + x35*x5 + x36 + y*(x33*y + x34); + basis_lapl_y_eval[ipt + 1*npts] = x37*z; + basis_lapl_y_eval[ipt + 2*npts] = x*x37; + // Evaluate Laplacian gradient of bfn (dz) + basis_lapl_z_eval[ipt + 0*npts] = x39*y; + basis_lapl_z_eval[ipt + 1*npts] = x10*x33 + x16 + x30*x9 + x35*x9 + x36 + z*(x33*z + x38); + basis_lapl_z_eval[ipt + 2*npts] = x*x39; + + + + +#if 0 + // Evaluate the angular part of bfn + + + + double ang_eval_0; + double ang_eval_1; + double ang_eval_2; + + + ang_eval_0 = radial_eval*y; + ang_eval_1 = radial_eval*z; + ang_eval_2 = radial_eval*x; + basis_eval[ipt + 0*npts] = ang_eval_0; + basis_eval[ipt + 1*npts] = ang_eval_1; + basis_eval[ipt + 2*npts] = ang_eval_2; + + + double dang_eval_x_0, dang_eval_y_0, dang_eval_z_0; + double dang_eval_x_1, dang_eval_y_1, dang_eval_z_1; + double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; + + dang_eval_x_0 = x1; + dang_eval_y_0 = radial_eval + radial_eval_alpha*x6; + dang_eval_z_0 = x8; + dang_eval_x_1 = x2; + dang_eval_y_1 = x8; + dang_eval_z_1 = radial_eval + radial_eval_alpha*x10; + dang_eval_x_2 = radial_eval + radial_eval_alpha*x4; + dang_eval_y_2 = x1; + dang_eval_z_2 = x2; + basis_x_eval[ipt + 0*npts] = dang_eval_x_0; + basis_y_eval[ipt + 0*npts] = dang_eval_y_0; + basis_z_eval[ipt + 0*npts] = dang_eval_z_0; + basis_x_eval[ipt + 1*npts] = dang_eval_x_1; + basis_y_eval[ipt + 1*npts] = dang_eval_y_1; + basis_z_eval[ipt + 1*npts] = dang_eval_z_1; + basis_x_eval[ipt + 2*npts] = dang_eval_x_2; + basis_y_eval[ipt + 2*npts] = dang_eval_y_2; + basis_z_eval[ipt + 2*npts] = dang_eval_z_2; + +#endif + } // Loop over points within task + } // Loop over tasks + + } // Loop over shells +} // end kernel + +} // namespace GauXC diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l1_laplacian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l1_laplacian.hpp index 32575b6e..2da0a731 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l1_laplacian.hpp +++ 
b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l1_laplacian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_spherical_laplacian_1( +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_spherical_laplacian_1( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; // Loop over points in task @@ -103,7 +106,31 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = radial_eval_alpha*x; + const auto x1 = x0*y; + const auto x2 = x0*z; + const auto x3 = x*x; + const auto x4 = y*y; + const auto x5 = y*z; + const auto x6 = radial_eval_alpha*x5; + const auto x7 = z*z; + const auto x8 = radial_eval_alpha_squared*x3; + const auto x9 = radial_eval_alpha + x8; + const auto x10 = x9*y; + const auto x11 = x9*z; + const auto x12 = 3.0*radial_eval_alpha; + const auto x13 = radial_eval_alpha_squared*x4; + const auto x14 = radial_eval_alpha + x13; + const auto x15 = x*x14; + const auto x16 = radial_eval_alpha_squared*x*x5; + const auto x17 = radial_eval_alpha_squared*x7; + const auto x18 = radial_eval_alpha + x17; + const auto x19 = x*x18; + const auto x20 = x14*z; + const auto x21 = x18*y; + const auto x22 = 5.0*radial_eval_alpha + x13 + x17 + x8; + // Evaluate basis function basis_eval[ipt + 0*npts] = radial_eval*y; @@ -113,24 +140,26 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = radial_eval_alpha*x*y; - basis_x_eval[ipt + 1*npts] = radial_eval_alpha*x*z; - basis_x_eval[ipt + 2*npts] = radial_eval + radial_eval_alpha*x*x; + basis_x_eval[ipt + 0*npts] = x1; + basis_x_eval[ipt + 1*npts] = x2; + basis_x_eval[ipt + 2*npts] = radial_eval + radial_eval_alpha*x3; // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = radial_eval + radial_eval_alpha*y*y; - basis_y_eval[ipt + 1*npts] = radial_eval_alpha*y*z; - basis_y_eval[ipt + 2*npts] = radial_eval_alpha*x*y; + basis_y_eval[ipt + 0*npts] = radial_eval + radial_eval_alpha*x4; + basis_y_eval[ipt + 1*npts] = x6; + basis_y_eval[ipt + 2*npts] = x1; // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = radial_eval_alpha*y*z; - 
basis_z_eval[ipt + 1*npts] = radial_eval + radial_eval_alpha*z*z; - basis_z_eval[ipt + 2*npts] = radial_eval_alpha*x*z; + basis_z_eval[ipt + 0*npts] = x6; + basis_z_eval[ipt + 1*npts] = radial_eval + radial_eval_alpha*x7; + basis_z_eval[ipt + 2*npts] = x2; + // Evaluate Laplacian of bfn - basis_lapl_eval[ipt + 0*npts] = y*(5*radial_eval_alpha + radial_eval_alpha_squared*x*x + radial_eval_alpha_squared*y*y + radial_eval_alpha_squared*z*z); - basis_lapl_eval[ipt + 1*npts] = z*(5*radial_eval_alpha + radial_eval_alpha_squared*x*x + radial_eval_alpha_squared*y*y + radial_eval_alpha_squared*z*z); - basis_lapl_eval[ipt + 2*npts] = x*(5*radial_eval_alpha + radial_eval_alpha_squared*x*x + radial_eval_alpha_squared*y*y + radial_eval_alpha_squared*z*z); + basis_lapl_eval[ipt + 0*npts] = x22*y; + basis_lapl_eval[ipt + 1*npts] = x22*z; + basis_lapl_eval[ipt + 2*npts] = x*x22; + @@ -157,15 +186,15 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_1, dang_eval_y_1, dang_eval_z_1; double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; - dang_eval_x_0 = radial_eval_alpha*x*y; - dang_eval_y_0 = radial_eval + radial_eval_alpha*y*y; - dang_eval_z_0 = radial_eval_alpha*y*z; - dang_eval_x_1 = radial_eval_alpha*x*z; - dang_eval_y_1 = radial_eval_alpha*y*z; - dang_eval_z_1 = radial_eval + radial_eval_alpha*z*z; - dang_eval_x_2 = radial_eval + radial_eval_alpha*x*x; - dang_eval_y_2 = radial_eval_alpha*x*y; - dang_eval_z_2 = radial_eval_alpha*x*z; + dang_eval_x_0 = x1; + dang_eval_y_0 = radial_eval + radial_eval_alpha*x4; + dang_eval_z_0 = x6; + dang_eval_x_1 = x2; + dang_eval_y_1 = x6; + dang_eval_z_1 = radial_eval + radial_eval_alpha*x7; + dang_eval_x_2 = radial_eval + radial_eval_alpha*x3; + dang_eval_y_2 = x1; + dang_eval_z_2 = x2; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2.hpp index f29e2496..38e16774 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -64,7 +68,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_eval = task->bf + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -93,14 +96,19 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel } - + // Common Subexpressions + const auto x0 = radial_eval*sqrt_3*y; + const auto x1 = 0.5*radial_eval; + const auto x2 = x*x; + const auto x3 = y*y; + // Evaluate basis function - basis_eval[ipt + 0*npts] = sqrt_3*radial_eval*x*y; - basis_eval[ipt + 1*npts] = sqrt_3*radial_eval*y*z; - basis_eval[ipt + 2*npts] = radial_eval*(-x*x - y*y + 2*z*z)/2; - basis_eval[ipt + 3*npts] = sqrt_3*radial_eval*x*z; - basis_eval[ipt + 4*npts] = sqrt_3*radial_eval*(x*x - y*y)/2; + basis_eval[ipt + 0*npts] = x*x0; + basis_eval[ipt + 1*npts] = x0*z; + basis_eval[ipt + 2*npts] = -x1*(x2 + x3 - 2.0*z*z); + basis_eval[ipt + 3*npts] = radial_eval*sqrt_3*x*z; + basis_eval[ipt + 4*npts] = sqrt_3*x1*(x2 - x3); @@ -109,6 +117,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel + + #if 0 // Evaluate the angular part of bfn @@ -120,16 +130,16 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = sqrt_3*radial_eval*x*y; - ang_eval_1 = sqrt_3*radial_eval*y*z; - ang_eval_2 = radial_eval*(-x*x - y*y + 2*z*z)/2; - ang_eval_3 = sqrt_3*radial_eval*x*z; + ang_eval_0 = x*x0; + ang_eval_1 = x0*z; + ang_eval_2 = -x1*(x2 + x3 - 2.0*z*z); + ang_eval_3 = radial_eval*sqrt_3*x*z; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = sqrt_3*radial_eval*(x*x - y*y)/2; + ang_eval_0 = sqrt_3*x1*(x2 - x3); basis_eval[ipt + 4*npts] = ang_eval_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_gradient.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_gradient.hpp index c14931c8..52ddc601 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_gradient.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_gradient.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_spherical_gradient_2( +__global__ __launch_bounds__(256,2) void collocation_device_shell_to_task_kernel_spherical_gradient_2( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[8][detail::shell_nprim_max + 1]; + __shared__ double coeff[8][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -67,7 +71,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -99,37 +102,60 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; - + // Common Subexpressions + const auto x0 = sqrt_3*y; + const auto x1 = radial_eval*x0; + const auto x2 = 0.5*radial_eval; + const auto x3 = x*x; + const auto x4 = y*y; + const auto x5 = z*z; + const auto x6 = -x3 - x4 + 2.0*x5; + const auto x7 = sqrt_3*z; + const auto x8 = x3 - x4; + const auto x9 = radial_eval + radial_eval_alpha*x3; + const auto x10 = radial_eval_alpha*x*x0*z; + const auto x11 = 0.5*x; + const auto x12 = 2.0*radial_eval; + const auto x13 = -x12; + const auto x14 = radial_eval_alpha*x6; + const auto x15 = x13 + x14; + const auto x16 = radial_eval_alpha*x8; + const auto x17 = sqrt_3*x; + const auto x18 = radial_eval + radial_eval_alpha*x4; + const auto x19 = radial_eval + radial_eval_alpha*x5; + // Evaluate basis function - basis_eval[ipt + 0*npts] = sqrt_3*radial_eval*x*y; - basis_eval[ipt + 1*npts] = sqrt_3*radial_eval*y*z; - basis_eval[ipt + 2*npts] = radial_eval*(-x*x - y*y + 2*z*z)/2; - basis_eval[ipt + 3*npts] = sqrt_3*radial_eval*x*z; - basis_eval[ipt + 4*npts] = sqrt_3*radial_eval*(x*x - y*y)/2; + basis_eval[ipt + 0*npts] = x*x1; + basis_eval[ipt + 1*npts] = x1*z; + basis_eval[ipt + 2*npts] = x2*x6; + basis_eval[ipt + 3*npts] = radial_eval*x*x7; + basis_eval[ipt + 4*npts] = sqrt_3*x2*x8; // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = sqrt_3*y*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 1*npts] = sqrt_3*radial_eval_alpha*x*y*z; - basis_x_eval[ipt + 2*npts] = x*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - basis_x_eval[ipt + 3*npts] = sqrt_3*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 4*npts] = sqrt_3*x*(2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; + basis_x_eval[ipt + 0*npts] = x0*x9; + basis_x_eval[ipt + 1*npts] = x10; + basis_x_eval[ipt + 2*npts] = x11*x15; + basis_x_eval[ipt + 3*npts] = x7*x9; + basis_x_eval[ipt + 4*npts] = sqrt_3*x11*(x12 + x16); // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = sqrt_3*x*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 1*npts] = sqrt_3*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 2*npts] = y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - basis_y_eval[ipt + 3*npts] = sqrt_3*radial_eval_alpha*x*y*z; - basis_y_eval[ipt + 4*npts] = sqrt_3*y*(-2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; + basis_y_eval[ipt + 0*npts] = x17*x18; + basis_y_eval[ipt + 1*npts] 
= x18*x7; + basis_y_eval[ipt + 2*npts] = 0.5*x15*y; + basis_y_eval[ipt + 3*npts] = x10; + basis_y_eval[ipt + 4*npts] = 0.5*x0*(x13 + x16); // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = sqrt_3*radial_eval_alpha*x*y*z; - basis_z_eval[ipt + 1*npts] = sqrt_3*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 2*npts] = z*(4*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - basis_z_eval[ipt + 3*npts] = sqrt_3*x*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 4*npts] = sqrt_3*radial_eval_alpha*z*(x*x - y*y)/2; + basis_z_eval[ipt + 0*npts] = x10; + basis_z_eval[ipt + 1*npts] = x0*x19; + basis_z_eval[ipt + 2*npts] = 0.5*z*(4.0*radial_eval + x14); + basis_z_eval[ipt + 3*npts] = x17*x19; + basis_z_eval[ipt + 4*npts] = 0.5*radial_eval_alpha*x7*x8; + + @@ -146,16 +172,16 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = sqrt_3*radial_eval*x*y; - ang_eval_1 = sqrt_3*radial_eval*y*z; - ang_eval_2 = radial_eval*(-x*x - y*y + 2*z*z)/2; - ang_eval_3 = sqrt_3*radial_eval*x*z; + ang_eval_0 = x*x1; + ang_eval_1 = x1*z; + ang_eval_2 = x2*x6; + ang_eval_3 = radial_eval*x*x7; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = sqrt_3*radial_eval*(x*x - y*y)/2; + ang_eval_0 = sqrt_3*x2*x8; basis_eval[ipt + 4*npts] = ang_eval_0; @@ -164,18 +190,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; - dang_eval_x_0 = sqrt_3*y*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = sqrt_3*x*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = sqrt_3*radial_eval_alpha*x*y*z; - dang_eval_x_1 = sqrt_3*radial_eval_alpha*x*y*z; - dang_eval_y_1 = sqrt_3*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_1 = sqrt_3*y*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = x*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - dang_eval_y_2 = y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - dang_eval_z_2 = z*(4*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - dang_eval_x_3 = sqrt_3*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_3 = sqrt_3*radial_eval_alpha*x*y*z; - dang_eval_z_3 = sqrt_3*x*(radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x0*x9; + dang_eval_y_0 = x17*x18; + dang_eval_z_0 = x10; + dang_eval_x_1 = x10; + dang_eval_y_1 = x18*x7; + dang_eval_z_1 = x0*x19; + dang_eval_x_2 = x11*x15; + dang_eval_y_2 = 0.5*x15*y; + dang_eval_z_2 = 0.5*z*(4.0*radial_eval + x14); + dang_eval_x_3 = x7*x9; + dang_eval_y_3 = x10; + dang_eval_z_3 = x17*x19; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; @@ -189,9 +215,9 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = sqrt_3*x*(2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; - dang_eval_y_0 = sqrt_3*y*(-2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; - dang_eval_z_0 = sqrt_3*radial_eval_alpha*z*(x*x - y*y)/2; + dang_eval_x_0 = sqrt_3*x11*(x12 + x16); + dang_eval_y_0 = 0.5*x0*(x13 + x16); + dang_eval_z_0 = 0.5*radial_eval_alpha*x7*x8; basis_x_eval[ipt + 4*npts] = dang_eval_x_0; basis_y_eval[ipt + 4*npts] = dang_eval_y_0; 
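For reference, the five l = 2 angular factors used in these spherical kernels are sqrt(3)*x*y, sqrt(3)*y*z, (2z^2 - x^2 - y^2)/2, sqrt(3)*x*z and sqrt(3)*(x^2 - y^2)/2, in the basis_eval[0..4] ordering above; every gradient and Hessian entry is one of these factors, or its Cartesian derivative, combined with the shared radial sums by the product rule. An illustrative host-side sketch of the angular part only, not taken from the generated kernels:

#include <cmath>

// Angular prefactors of the five real spherical d functions, in the same
// ordering as basis_eval[0..4] (xy, yz, z^2, xz, x^2 - y^2).
void d_shell_angular(double x, double y, double z, double ang[5]) {
  const double sqrt_3 = std::sqrt(3.0);  // plays the role of the kernels' sqrt_3 constant
  ang[0] = sqrt_3 * x * y;
  ang[1] = sqrt_3 * y * z;
  ang[2] = 0.5 * (2.0*z*z - x*x - y*y);
  ang[3] = sqrt_3 * x * z;
  ang[4] = 0.5 * sqrt_3 * (x*x - y*y);
}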
basis_z_eval[ipt + 4*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_hessian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_hessian.hpp index 400ee30e..329138f3 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_hessian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_hessian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_spherical_hessian_2( +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_spherical_hessian_2( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; @@ -108,79 +111,137 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = sqrt_3*y; + const auto x1 = x*x0; + const auto x2 = x0*z; + const auto x3 = 0.5*radial_eval; + const auto x4 = x*x; + const auto x5 = y*y; + const auto x6 = z*z; + const auto x7 = -x4 - x5 + 2.0*x6; + const auto x8 = sqrt_3*z; + const auto x9 = x*x8; + const auto x10 = x4 - x5; + const auto x11 = radial_eval + radial_eval_alpha*x4; + const auto x12 = radial_eval_alpha*x1*z; + const auto x13 = 0.5*x; + const auto x14 = 2.0*radial_eval; + const auto x15 = -x14; + const auto x16 = radial_eval_alpha*x7; + const auto x17 = x15 + x16; + const auto x18 = radial_eval_alpha*x10; + const auto x19 = sqrt_3*x; + const auto x20 = radial_eval_alpha*x5; + const auto x21 = radial_eval + x20; + const auto x22 = 0.5*y; + const auto x23 = radial_eval_alpha*x6; + const auto x24 = radial_eval + x23; + const auto x25 = 0.5*z; + const auto x26 = 4.0*radial_eval; + const auto x27 = 3.0*radial_eval_alpha; + const auto x28 = radial_eval_alpha_squared*x4; + const auto x29 = x27 + x28; + const auto x30 = radial_eval_alpha + x28; + const auto x31 = x2*x30; + const auto x32 = 4.0*radial_eval_alpha; + const auto x33 = x32*x4; + const auto x34 = x14 + x33; + const auto x35 = 0.5*sqrt_3; + const auto x36 = x10*x30; + const 
auto x37 = radial_eval_alpha_squared*x5; + const auto x38 = radial_eval_alpha + x37; + const auto x39 = x38*x9; + const auto x40 = radial_eval_alpha_squared*x7; + const auto x41 = radial_eval_alpha_squared*x6; + const auto x42 = radial_eval_alpha + x41; + const auto x43 = x1*x42; + const auto x44 = 2.0*radial_eval_alpha; + const auto x45 = x40 + x44; + const auto x46 = radial_eval_alpha_squared*x10; + const auto x47 = x27 + x37; + const auto x48 = x32*x5; + const auto x49 = x14 + x48; + const auto x50 = x27 + x41; + const auto x51 = 8.0*radial_eval_alpha*x6 + x42*x7; + const auto x52 = x10*x42; + const auto x53 = 7.0*radial_eval_alpha + x28 + x37 + x41; + const auto x54 = -x48; + // Evaluate basis function - basis_eval[ipt + 0*npts] = sqrt_3*radial_eval*x*y; - basis_eval[ipt + 1*npts] = sqrt_3*radial_eval*y*z; - basis_eval[ipt + 2*npts] = radial_eval*(-x*x - y*y + 2*z*z)/2; - basis_eval[ipt + 3*npts] = sqrt_3*radial_eval*x*z; - basis_eval[ipt + 4*npts] = sqrt_3*radial_eval*(x*x - y*y)/2; + basis_eval[ipt + 0*npts] = radial_eval*x1; + basis_eval[ipt + 1*npts] = radial_eval*x2; + basis_eval[ipt + 2*npts] = x3*x7; + basis_eval[ipt + 3*npts] = radial_eval*x9; + basis_eval[ipt + 4*npts] = sqrt_3*x10*x3; // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = sqrt_3*y*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 1*npts] = sqrt_3*radial_eval_alpha*x*y*z; - basis_x_eval[ipt + 2*npts] = x*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - basis_x_eval[ipt + 3*npts] = sqrt_3*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 4*npts] = sqrt_3*x*(2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; + basis_x_eval[ipt + 0*npts] = x0*x11; + basis_x_eval[ipt + 1*npts] = x12; + basis_x_eval[ipt + 2*npts] = x13*x17; + basis_x_eval[ipt + 3*npts] = x11*x8; + basis_x_eval[ipt + 4*npts] = sqrt_3*x13*(x14 + x18); // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = sqrt_3*x*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 1*npts] = sqrt_3*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 2*npts] = y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - basis_y_eval[ipt + 3*npts] = sqrt_3*radial_eval_alpha*x*y*z; - basis_y_eval[ipt + 4*npts] = sqrt_3*y*(-2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; + basis_y_eval[ipt + 0*npts] = x19*x21; + basis_y_eval[ipt + 1*npts] = x21*x8; + basis_y_eval[ipt + 2*npts] = x17*x22; + basis_y_eval[ipt + 3*npts] = x12; + basis_y_eval[ipt + 4*npts] = 0.5*x0*(x15 + x18); // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = sqrt_3*radial_eval_alpha*x*y*z; - basis_z_eval[ipt + 1*npts] = sqrt_3*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 2*npts] = z*(4*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - basis_z_eval[ipt + 3*npts] = sqrt_3*x*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 4*npts] = sqrt_3*radial_eval_alpha*z*(x*x - y*y)/2; + basis_z_eval[ipt + 0*npts] = x12; + basis_z_eval[ipt + 1*npts] = x0*x24; + basis_z_eval[ipt + 2*npts] = x25*(x16 + x26); + basis_z_eval[ipt + 3*npts] = x19*x24; + basis_z_eval[ipt + 4*npts] = 0.5*radial_eval_alpha*x10*x8; // Evaluate second derivative of bfn wrt xx - basis_xx_eval[ipt + 0*npts] = sqrt_3*x*y*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 1*npts] = sqrt_3*y*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 2*npts] = -radial_eval - 2*radial_eval_alpha*x*x - (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x + y*y 
- 2*z*z)/2; - basis_xx_eval[ipt + 3*npts] = sqrt_3*x*z*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 4*npts] = sqrt_3*(radial_eval + 2*radial_eval_alpha*x*x + (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x - y*y)/2); + basis_xx_eval[ipt + 0*npts] = x1*x29; + basis_xx_eval[ipt + 1*npts] = x31; + basis_xx_eval[ipt + 2*npts] = 0.5*x30*x7 - 0.5*x34; + basis_xx_eval[ipt + 3*npts] = x29*x9; + basis_xx_eval[ipt + 4*npts] = x35*(x34 + x36); // Evaluate second derivative of bfn wrt xy - basis_xy_eval[ipt + 0*npts] = sqrt_3*(radial_eval + radial_eval_alpha*x*x + radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y); - basis_xy_eval[ipt + 1*npts] = sqrt_3*x*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_xy_eval[ipt + 2*npts] = x*y*(-4*radial_eval_alpha - radial_eval_alpha_squared*(x*x + y*y - 2*z*z))/2; - basis_xy_eval[ipt + 3*npts] = sqrt_3*y*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xy_eval[ipt + 4*npts] = sqrt_3*radial_eval_alpha_squared*x*y*(x*x - y*y)/2; + basis_xy_eval[ipt + 0*npts] = sqrt_3*(radial_eval_alpha_squared*x4*x5 + x11 + x20); + basis_xy_eval[ipt + 1*npts] = x39; + basis_xy_eval[ipt + 2*npts] = x13*y*(-x32 + x40); + basis_xy_eval[ipt + 3*npts] = x31; + basis_xy_eval[ipt + 4*npts] = radial_eval_alpha_squared*x0*x10*x13; // Evaluate second derivative of bfn wrt xz - basis_xz_eval[ipt + 0*npts] = sqrt_3*y*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xz_eval[ipt + 1*npts] = sqrt_3*x*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_xz_eval[ipt + 2*npts] = x*z*(2*radial_eval_alpha - radial_eval_alpha_squared*(x*x + y*y - 2*z*z))/2; - basis_xz_eval[ipt + 3*npts] = sqrt_3*(radial_eval + radial_eval_alpha*x*x + radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z); - basis_xz_eval[ipt + 4*npts] = sqrt_3*x*z*(2*radial_eval_alpha + radial_eval_alpha_squared*(x*x - y*y))/2; + basis_xz_eval[ipt + 0*npts] = x31; + basis_xz_eval[ipt + 1*npts] = x43; + basis_xz_eval[ipt + 2*npts] = x13*x45*z; + basis_xz_eval[ipt + 3*npts] = sqrt_3*(radial_eval_alpha_squared*x4*x6 + x11 + x23); + basis_xz_eval[ipt + 4*npts] = x13*x8*(x44 + x46); // Evaluate second derivative of bfn wrt yy - basis_yy_eval[ipt + 0*npts] = sqrt_3*x*y*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 1*npts] = sqrt_3*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 2*npts] = -radial_eval - 2*radial_eval_alpha*y*y - (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x + y*y - 2*z*z)/2; - basis_yy_eval[ipt + 3*npts] = sqrt_3*x*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 4*npts] = sqrt_3*(-radial_eval - 2*radial_eval_alpha*y*y + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x - y*y)/2); + basis_yy_eval[ipt + 0*npts] = x1*x47; + basis_yy_eval[ipt + 1*npts] = x2*x47; + basis_yy_eval[ipt + 2*npts] = 0.5*x38*x7 - 0.5*x49; + basis_yy_eval[ipt + 3*npts] = x39; + basis_yy_eval[ipt + 4*npts] = x35*(x10*x38 - x49); // Evaluate second derivative of bfn wrt yz - basis_yz_eval[ipt + 0*npts] = sqrt_3*x*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yz_eval[ipt + 1*npts] = sqrt_3*(radial_eval + radial_eval_alpha*y*y + radial_eval_alpha*z*z + radial_eval_alpha_squared*y*y*z*z); - basis_yz_eval[ipt + 2*npts] = y*z*(2*radial_eval_alpha - radial_eval_alpha_squared*(x*x + y*y - 2*z*z))/2; - basis_yz_eval[ipt + 3*npts] = sqrt_3*x*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_yz_eval[ipt + 4*npts] = 
sqrt_3*y*z*(-2*radial_eval_alpha + radial_eval_alpha_squared*(x*x - y*y))/2; + basis_yz_eval[ipt + 0*npts] = x39; + basis_yz_eval[ipt + 1*npts] = sqrt_3*(radial_eval_alpha_squared*x5*x6 + x21 + x23); + basis_yz_eval[ipt + 2*npts] = x22*x45*z; + basis_yz_eval[ipt + 3*npts] = x43; + basis_yz_eval[ipt + 4*npts] = x0*x25*(-x44 + x46); // Evaluate second derivative of bfn wrt zz - basis_zz_eval[ipt + 0*npts] = sqrt_3*x*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 1*npts] = sqrt_3*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 2*npts] = 2*radial_eval + 4*radial_eval_alpha*z*z - (radial_eval_alpha + radial_eval_alpha_squared*z*z)*(x*x + y*y - 2*z*z)/2; - basis_zz_eval[ipt + 3*npts] = sqrt_3*x*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 4*npts] = sqrt_3*(radial_eval_alpha + radial_eval_alpha_squared*z*z)*(x*x - y*y)/2; + basis_zz_eval[ipt + 0*npts] = x43; + basis_zz_eval[ipt + 1*npts] = x2*x50; + basis_zz_eval[ipt + 2*npts] = 0.5*x26 + 0.5*x51; + basis_zz_eval[ipt + 3*npts] = x50*x9; + basis_zz_eval[ipt + 4*npts] = x35*x52; + + @@ -196,16 +257,16 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = sqrt_3*radial_eval*x*y; - ang_eval_1 = sqrt_3*radial_eval*y*z; - ang_eval_2 = radial_eval*(-x*x - y*y + 2*z*z)/2; - ang_eval_3 = sqrt_3*radial_eval*x*z; + ang_eval_0 = radial_eval*x1; + ang_eval_1 = radial_eval*x2; + ang_eval_2 = x3*x7; + ang_eval_3 = radial_eval*x9; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = sqrt_3*radial_eval*(x*x - y*y)/2; + ang_eval_0 = sqrt_3*x10*x3; basis_eval[ipt + 4*npts] = ang_eval_0; @@ -214,18 +275,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; - dang_eval_x_0 = sqrt_3*y*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = sqrt_3*x*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = sqrt_3*radial_eval_alpha*x*y*z; - dang_eval_x_1 = sqrt_3*radial_eval_alpha*x*y*z; - dang_eval_y_1 = sqrt_3*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_1 = sqrt_3*y*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = x*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - dang_eval_y_2 = y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - dang_eval_z_2 = z*(4*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - dang_eval_x_3 = sqrt_3*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_3 = sqrt_3*radial_eval_alpha*x*y*z; - dang_eval_z_3 = sqrt_3*x*(radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x0*x11; + dang_eval_y_0 = x19*x21; + dang_eval_z_0 = x12; + dang_eval_x_1 = x12; + dang_eval_y_1 = x21*x8; + dang_eval_z_1 = x0*x24; + dang_eval_x_2 = x13*x17; + dang_eval_y_2 = x17*x22; + dang_eval_z_2 = x25*(x16 + x26); + dang_eval_x_3 = x11*x8; + dang_eval_y_3 = x12; + dang_eval_z_3 = x19*x24; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; @@ -239,9 +300,9 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = sqrt_3*x*(2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; - dang_eval_y_0 = 
sqrt_3*y*(-2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; - dang_eval_z_0 = sqrt_3*radial_eval_alpha*z*(x*x - y*y)/2; + dang_eval_x_0 = sqrt_3*x13*(x14 + x18); + dang_eval_y_0 = 0.5*x0*(x15 + x18); + dang_eval_z_0 = 0.5*radial_eval_alpha*x10*x8; basis_x_eval[ipt + 4*npts] = dang_eval_x_0; basis_y_eval[ipt + 4*npts] = dang_eval_y_0; basis_z_eval[ipt + 4*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_lapgrad.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_lapgrad.hpp new file mode 100644 index 00000000..9a284576 --- /dev/null +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_lapgrad.hpp @@ -0,0 +1,386 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#pragma once +#include "collocation_device_constants.hpp" +#include "device/xc_device_task.hpp" +#include "device_specific/cuda_device_constants.hpp" +#include "device/common/shell_to_task.hpp" +#include + +namespace GauXC { + + +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_spherical_lapgrad_2( + uint32_t nshell, + ShellToTaskDevice* __restrict__ shell_to_task, + XCDeviceTask* __restrict__ device_tasks +) { + + + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; + double* my_alpha = alpha[threadIdx.x/32]; + double* my_coeff = coeff[threadIdx.x/32]; + + for( auto ish = blockIdx.z; ish < nshell; ish += gridDim.z ) { + const uint32_t ntasks = shell_to_task[ish].ntask; + const auto shell = shell_to_task[ish].shell_device; + const auto task_idx = shell_to_task[ish].task_idx_device; + const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device; + + + // Load Shell Data into registers / SM + const uint32_t nprim = shell->nprim(); + const double3 O = *reinterpret_cast<const double3*>(shell->O_data()); + + const int global_warp_id = (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size; + const int nwarp_global = max((blockDim.x*gridDim.x) / cuda::warp_size,1); + + // Read in coeffs/exps into SM on first warp + { + auto* coeff_gm = shell->coeff_data(); + auto* alpha_gm = shell->alpha_data(); + static_assert( detail::shell_nprim_max == cuda::warp_size ); + const int warp_rank = threadIdx.x % cuda::warp_size; + my_alpha[warp_rank] = alpha_gm[warp_rank]; + my_coeff[warp_rank] = coeff_gm[warp_rank]; + } + + // Loop over tasks assigned to shells + // Place each task on a different warp + schedule across blocks + for( int itask = global_warp_id; itask < ntasks; itask += nwarp_global ) { + + const auto* task = device_tasks + task_idx[itask]; + const auto* __restrict__ points_x = task->points_x; + const auto* __restrict__ points_y = task->points_y; + const auto* __restrict__ points_z = task->points_z; + const uint32_t npts = task->npts; + const size_t shoff = task_shell_offs[itask] * npts; + + auto* __restrict__ basis_eval = task->bf + shoff; + auto* __restrict__ basis_x_eval = task->dbfx + shoff; + auto* __restrict__ basis_y_eval = task->dbfy + shoff; + auto* __restrict__ basis_z_eval = task->dbfz + shoff; + auto* __restrict__ 
basis_xx_eval = task->d2bfxx + shoff; + auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; + auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; + auto* __restrict__ basis_yy_eval = task->d2bfyy + shoff; + auto* __restrict__ basis_yz_eval = task->d2bfyz + shoff; + auto* __restrict__ basis_zz_eval = task->d2bfzz + shoff; + auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; + auto* __restrict__ basis_lapl_x_eval = task->d3bflapl_x + shoff; + auto* __restrict__ basis_lapl_y_eval = task->d3bflapl_y + shoff; + auto* __restrict__ basis_lapl_z_eval = task->d3bflapl_z + shoff; + + // Loop over points in task + // Assign each point to separate thread within the warp + #pragma unroll 1 + for( int ipt = threadIdx.x % cuda::warp_size; ipt < npts; ipt += cuda::warp_size ) { + //const double3 point = points[ipt]; + double3 point; + point.x = points_x[ipt]; + point.y = points_y[ipt]; + point.z = points_z[ipt]; + + + const auto x = point.x - O.x; + const auto y = point.y - O.y; + const auto z = point.z - O.z; + const auto rsq = x*x + y*y + z*z; + + // Evaluate radial part of bfn + double radial_eval = 0.; + double radial_eval_alpha = 0.; + double radial_eval_alpha_squared = 0.; + double radial_eval_alpha_cubed = 0.; + + #pragma unroll 1 + for( uint32_t i = 0; i < nprim; ++i ) { + const auto a = my_alpha[i]; + const auto e = my_coeff[i] * std::exp( - a * rsq ); + + radial_eval += e; + radial_eval_alpha += a * e; + radial_eval_alpha_squared += a * a * e; + radial_eval_alpha_cubed += a * a * a * e; + } + + radial_eval_alpha *= -2; + radial_eval_alpha_squared *= 4; + radial_eval_alpha_cubed *= -8; + + // Common Subexpressions + const auto x0 = sqrt_3*y; + const auto x1 = x*x0; + const auto x2 = x0*z; + const auto x3 = 0.5*radial_eval; + const auto x4 = x*x; + const auto x5 = x4; + const auto x6 = y*y; + const auto x7 = x6; + const auto x8 = z*z; + const auto x9 = x8; + const auto x10 = -x5 - x7 + 2.0*x9; + const auto x11 = sqrt_3*z; + const auto x12 = x*x11; + const auto x13 = x5 - x7; + const auto x14 = radial_eval + radial_eval_alpha*x5; + const auto x15 = radial_eval_alpha*x1*z; + const auto x16 = 0.5*x; + const auto x17 = 2.0*radial_eval; + const auto x18 = -x17; + const auto x19 = radial_eval_alpha*x10; + const auto x20 = x18 + x19; + const auto x21 = radial_eval_alpha*x13; + const auto x22 = sqrt_3*x; + const auto x23 = radial_eval_alpha*x7; + const auto x24 = radial_eval + x23; + const auto x25 = 0.5*y; + const auto x26 = radial_eval_alpha*x9; + const auto x27 = radial_eval + x26; + const auto x28 = 0.5*z; + const auto x29 = 4.0*radial_eval; + const auto x30 = 3.0*radial_eval_alpha; + const auto x31 = radial_eval_alpha_squared*x5; + const auto x32 = x30 + x31; + const auto x33 = radial_eval_alpha + x31; + const auto x34 = x2*x33; + const auto x35 = 4.0*radial_eval_alpha; + const auto x36 = x35*x5; + const auto x37 = x17 + x36; + const auto x38 = 0.5*sqrt_3; + const auto x39 = x13*x33; + const auto x40 = radial_eval_alpha_squared*x7; + const auto x41 = radial_eval_alpha + x40; + const auto x42 = x12*x41; + const auto x43 = radial_eval_alpha_squared*x10; + const auto x44 = radial_eval_alpha_squared*x9; + const auto x45 = radial_eval_alpha + x44; + const auto x46 = x1*x45; + const auto x47 = 2.0*radial_eval_alpha; + const auto x48 = x43 + x47; + const auto x49 = radial_eval_alpha_squared*x13; + const auto x50 = x30 + x40; + const auto x51 = x35*x7; + const auto x52 = x17 + x51; + const auto x53 = x30 + x44; + const auto x54 = 8.0*radial_eval_alpha; + const auto x55 = x10*x45 + x54*x9; 
+ const auto x56 = x13*x45; + const auto x57 = x40 + x44; + const auto x58 = 7.0*radial_eval_alpha + x31 + x57; + const auto x59 = -x51; + const auto x60 = radial_eval_alpha_squared*x; + const auto x61 = radial_eval_alpha_cubed*(x*x*x); + const auto x62 = 3.0*x60 + x61; + const auto x63 = radial_eval_alpha_cubed*x7 + radial_eval_alpha_squared; + const auto x64 = radial_eval_alpha_cubed*x9 + radial_eval_alpha_squared; + const auto x65 = 2.0*radial_eval_alpha_squared; + const auto x66 = x*x62 + 3.0*x33 + x35 + x4*x63 + x4*x64 + x5*x65 + x57; + const auto x67 = 4.0*x60*x7; + const auto x68 = 2.0*x; + const auto x69 = 6.0*x*x33 + x*x35 + x41*x68 + x45*x68; + const auto x70 = x13*x63; + const auto x71 = x13*x64; + const auto x72 = radial_eval_alpha_squared*y; + const auto x73 = radial_eval_alpha_cubed*(y*y*y); + const auto x74 = 3.0*x72 + x73; + const auto x75 = radial_eval_alpha_cubed*x5 + radial_eval_alpha_squared; + const auto x76 = x31 + x35; + const auto x77 = 3.0*x41 + x44 + x6*x64 + x6*x75 + x65*x7 + x74*y + x76; + const auto x78 = x35*y; + const auto x79 = 4.0*x5*x72; + const auto x80 = 2.0*y; + const auto x81 = x33*x80; + const auto x82 = 6.0*x41*y; + const auto x83 = x45*x80; + const auto x84 = x13*x75; + const auto x85 = radial_eval_alpha_squared*z; + const auto x86 = radial_eval_alpha_cubed*(z*z*z); + const auto x87 = 3.0*x85 + x86; + const auto x88 = x40 + 3.0*x45 + x63*x8 + x65*x9 + x75*x8 + x76 + x87*z; + const auto x89 = 4.0*z; + const auto x90 = radial_eval_alpha_squared*x89; + const auto x91 = x5*x90; + const auto x92 = -x7*x90; + + + // Evaluate basis function + basis_eval[ipt + 0*npts] = radial_eval*x1; + basis_eval[ipt + 1*npts] = radial_eval*x2; + basis_eval[ipt + 2*npts] = x10*x3; + basis_eval[ipt + 3*npts] = radial_eval*x12; + basis_eval[ipt + 4*npts] = sqrt_3*x13*x3; + + + + // Evaluate first derivative of bfn wrt x + basis_x_eval[ipt + 0*npts] = x0*x14; + basis_x_eval[ipt + 1*npts] = x15; + basis_x_eval[ipt + 2*npts] = x16*x20; + basis_x_eval[ipt + 3*npts] = x11*x14; + basis_x_eval[ipt + 4*npts] = sqrt_3*x16*(x17 + x21); + + // Evaluate first derivative of bfn wrt y + basis_y_eval[ipt + 0*npts] = x22*x24; + basis_y_eval[ipt + 1*npts] = x11*x24; + basis_y_eval[ipt + 2*npts] = x20*x25; + basis_y_eval[ipt + 3*npts] = x15; + basis_y_eval[ipt + 4*npts] = 0.5*x0*(x18 + x21); + + // Evaluate first derivative of bfn wrt z + basis_z_eval[ipt + 0*npts] = x15; + basis_z_eval[ipt + 1*npts] = x0*x27; + basis_z_eval[ipt + 2*npts] = x28*(x19 + x29); + basis_z_eval[ipt + 3*npts] = x22*x27; + basis_z_eval[ipt + 4*npts] = 0.5*radial_eval_alpha*x11*x13; + + // Evaluate second derivative of bfn wrt xx + basis_xx_eval[ipt + 0*npts] = x1*x32; + basis_xx_eval[ipt + 1*npts] = x34; + basis_xx_eval[ipt + 2*npts] = 0.5*x10*x33 - 0.5*x37; + basis_xx_eval[ipt + 3*npts] = x12*x32; + basis_xx_eval[ipt + 4*npts] = x38*(x37 + x39); + + // Evaluate second derivative of bfn wrt xy + basis_xy_eval[ipt + 0*npts] = sqrt_3*(radial_eval_alpha_squared*x5*x7 + x14 + x23); + basis_xy_eval[ipt + 1*npts] = x42; + basis_xy_eval[ipt + 2*npts] = x16*y*(-x35 + x43); + basis_xy_eval[ipt + 3*npts] = x34; + basis_xy_eval[ipt + 4*npts] = radial_eval_alpha_squared*x0*x13*x16; + + // Evaluate second derivative of bfn wrt xz + basis_xz_eval[ipt + 0*npts] = x34; + basis_xz_eval[ipt + 1*npts] = x46; + basis_xz_eval[ipt + 2*npts] = x16*x48*z; + basis_xz_eval[ipt + 3*npts] = sqrt_3*(radial_eval_alpha_squared*x5*x9 + x14 + x26); + basis_xz_eval[ipt + 4*npts] = x11*x16*(x47 + x49); + + // Evaluate second derivative of bfn wrt yy 
+ basis_yy_eval[ipt + 0*npts] = x1*x50; + basis_yy_eval[ipt + 1*npts] = x2*x50; + basis_yy_eval[ipt + 2*npts] = 0.5*x10*x41 - 0.5*x52; + basis_yy_eval[ipt + 3*npts] = x42; + basis_yy_eval[ipt + 4*npts] = x38*(x13*x41 - x52); + + // Evaluate second derivative of bfn wrt yz + basis_yz_eval[ipt + 0*npts] = x42; + basis_yz_eval[ipt + 1*npts] = sqrt_3*(radial_eval_alpha_squared*x7*x9 + x24 + x26); + basis_yz_eval[ipt + 2*npts] = x25*x48*z; + basis_yz_eval[ipt + 3*npts] = x46; + basis_yz_eval[ipt + 4*npts] = x0*x28*(-x47 + x49); + + // Evaluate second derivative of bfn wrt zz + basis_zz_eval[ipt + 0*npts] = x46; + basis_zz_eval[ipt + 1*npts] = x2*x53; + basis_zz_eval[ipt + 2*npts] = 0.5*x29 + 0.5*x55; + basis_zz_eval[ipt + 3*npts] = x12*x53; + basis_zz_eval[ipt + 4*npts] = x38*x56; + + // Evaluate Laplacian of bfn + basis_lapl_eval[ipt + 0*npts] = x1*x58; + basis_lapl_eval[ipt + 1*npts] = x2*x58; + basis_lapl_eval[ipt + 2*npts] = 0.5*x10*x33 + 0.5*x10*x41 - 0.5*x36 + 0.5*x55 + 0.5*x59; + basis_lapl_eval[ipt + 3*npts] = x12*x58; + basis_lapl_eval[ipt + 4*npts] = x38*(x13*x41 + x36 + x39 + x56 + x59); + + // Evaluate Laplacian gradient of bfn (dx) + basis_lapl_x_eval[ipt + 0*npts] = x0*x66; + basis_lapl_x_eval[ipt + 1*npts] = x2*(x*x63 + x*x64 + 7.0*x60 + x61); + basis_lapl_x_eval[ipt + 2*npts] = 4.0*radial_eval_alpha_squared*x*x9 + 0.5*x*x10*x63 + 0.5*x*x10*x64 + 0.5*x10*x62 - 0.5*x67 - 0.5*x69; + basis_lapl_x_eval[ipt + 3*npts] = x11*x66; + basis_lapl_x_eval[ipt + 4*npts] = x38*(x*x70 + x*x71 + x13*x62 - x67 + x69); + // Evaluate Laplacian gradient of bfn (dy) + basis_lapl_y_eval[ipt + 0*npts] = x22*x77; + basis_lapl_y_eval[ipt + 1*npts] = x11*x77; + basis_lapl_y_eval[ipt + 2*npts] = 4.0*radial_eval_alpha_squared*x9*y + 0.5*x10*x64*y + 0.5*x10*x74 + 0.5*x10*x75*y - 0.5*x78 - 0.5*x79 - 0.5*x81 - 0.5*x82 - 0.5*x83; + basis_lapl_y_eval[ipt + 3*npts] = x12*(x64*y + 7.0*x72 + x73 + x75*y); + basis_lapl_y_eval[ipt + 4*npts] = x38*(x13*x74 + x71*y - x78 + x79 - x81 - x82 - x83 + x84*y); + // Evaluate Laplacian gradient of bfn (dz) + basis_lapl_z_eval[ipt + 0*npts] = x1*(x63*z + x75*z + 7.0*x85 + x86); + basis_lapl_z_eval[ipt + 1*npts] = x0*x88; + basis_lapl_z_eval[ipt + 2*npts] = 0.5*x10*x63*z + 0.5*x10*x75*z + 0.5*x10*x87 + 0.5*x33*x89 + 0.5*x41*x89 + 6.0*x45*z + 0.5*x54*z - 0.5*x91 + 0.5*x92; + basis_lapl_z_eval[ipt + 3*npts] = x22*x88; + basis_lapl_z_eval[ipt + 4*npts] = x38*(x13*x87 + x70*z + x84*z + x91 + x92); + + + + +#if 0 + // Evaluate the angular part of bfn + + + + double ang_eval_0; + double ang_eval_1; + double ang_eval_2; + double ang_eval_3; + + + ang_eval_0 = radial_eval*x1; + ang_eval_1 = radial_eval*x2; + ang_eval_2 = x10*x3; + ang_eval_3 = radial_eval*x12; + basis_eval[ipt + 0*npts] = ang_eval_0; + basis_eval[ipt + 1*npts] = ang_eval_1; + basis_eval[ipt + 2*npts] = ang_eval_2; + basis_eval[ipt + 3*npts] = ang_eval_3; + + ang_eval_0 = sqrt_3*x13*x3; + basis_eval[ipt + 4*npts] = ang_eval_0; + + + double dang_eval_x_0, dang_eval_y_0, dang_eval_z_0; + double dang_eval_x_1, dang_eval_y_1, dang_eval_z_1; + double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; + double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; + + dang_eval_x_0 = x0*x14; + dang_eval_y_0 = x22*x24; + dang_eval_z_0 = x15; + dang_eval_x_1 = x15; + dang_eval_y_1 = x11*x24; + dang_eval_z_1 = x0*x27; + dang_eval_x_2 = x16*x20; + dang_eval_y_2 = x20*x25; + dang_eval_z_2 = x28*(x19 + x29); + dang_eval_x_3 = x11*x14; + dang_eval_y_3 = x15; + dang_eval_z_3 = x22*x27; + basis_x_eval[ipt + 0*npts] = dang_eval_x_0; + basis_y_eval[ipt + 
0*npts] = dang_eval_y_0; + basis_z_eval[ipt + 0*npts] = dang_eval_z_0; + basis_x_eval[ipt + 1*npts] = dang_eval_x_1; + basis_y_eval[ipt + 1*npts] = dang_eval_y_1; + basis_z_eval[ipt + 1*npts] = dang_eval_z_1; + basis_x_eval[ipt + 2*npts] = dang_eval_x_2; + basis_y_eval[ipt + 2*npts] = dang_eval_y_2; + basis_z_eval[ipt + 2*npts] = dang_eval_z_2; + basis_x_eval[ipt + 3*npts] = dang_eval_x_3; + basis_y_eval[ipt + 3*npts] = dang_eval_y_3; + basis_z_eval[ipt + 3*npts] = dang_eval_z_3; + + dang_eval_x_0 = sqrt_3*x16*(x17 + x21); + dang_eval_y_0 = 0.5*x0*(x18 + x21); + dang_eval_z_0 = 0.5*radial_eval_alpha*x11*x13; + basis_x_eval[ipt + 4*npts] = dang_eval_x_0; + basis_y_eval[ipt + 4*npts] = dang_eval_y_0; + basis_z_eval[ipt + 4*npts] = dang_eval_z_0; + +#endif + } // Loop over points within task + } // Loop over tasks + + } // Loop over shells +} // end kernel + +} // namespace GauXC diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_laplacian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_laplacian.hpp index a3c3358c..7c731972 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_laplacian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_laplacian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_spherical_laplacian_2( +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_spherical_laplacian_2( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; // Loop over points in task @@ -103,44 +106,102 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = sqrt_3*y; + const auto x1 = x*x0; + const auto x2 = x0*z; + const auto x3 = 0.5*radial_eval; + const auto x4 = x*x; + const auto x5 = y*y; + const auto x6 = z*z; + const auto x7 = -x4 - x5 + 2.0*x6; + const auto x8 = sqrt_3*z; + const auto x9 = x*x8; + const auto x10 = x4 - x5; + const auto x11 = radial_eval + radial_eval_alpha*x4; + const auto x12 = radial_eval_alpha*x1*z; + const auto x13 = 0.5*x; + const auto x14 = 2.0*radial_eval; + const auto x15 = -x14; + const auto x16 = radial_eval_alpha*x7; + const auto x17 = x15 + x16; + const auto x18 = radial_eval_alpha*x10; + const auto x19 = sqrt_3*x; + const auto x20 = radial_eval_alpha*x5; + const auto x21 = radial_eval + x20; + const auto x22 = 0.5*y; + const auto x23 = radial_eval_alpha*x6; + const auto x24 = radial_eval + x23; + const auto x25 = 0.5*z; + const auto x26 = 4.0*radial_eval; + const auto x27 = 3.0*radial_eval_alpha; + const auto x28 = radial_eval_alpha_squared*x4; + const auto x29 = x27 + x28; + const auto x30 = radial_eval_alpha + x28; + const auto x31 = x2*x30; + const auto x32 = 4.0*radial_eval_alpha; + const auto x33 = x32*x4; + const auto x34 = x14 + x33; + const auto x35 = 0.5*sqrt_3; + const auto x36 = x10*x30; + const auto x37 = radial_eval_alpha_squared*x5; + const auto x38 = radial_eval_alpha + x37; + const auto x39 = x38*x9; + const auto x40 = radial_eval_alpha_squared*x7; + const auto x41 = radial_eval_alpha_squared*x6; + const auto x42 = radial_eval_alpha + x41; + const auto x43 = x1*x42; + const auto x44 = 2.0*radial_eval_alpha; + const auto x45 = x40 + x44; + const auto x46 = radial_eval_alpha_squared*x10; + const auto x47 = x27 + x37; + const auto x48 = x32*x5; + const auto x49 = x14 + x48; + const auto x50 = x27 + x41; + const auto x51 = 8.0*radial_eval_alpha*x6 + x42*x7; + const auto x52 = x10*x42; + const auto x53 = 7.0*radial_eval_alpha + x28 + x37 + x41; + const auto x54 = -x48; + // Evaluate basis function - basis_eval[ipt + 0*npts] = sqrt_3*radial_eval*x*y; - basis_eval[ipt + 1*npts] = sqrt_3*radial_eval*y*z; - basis_eval[ipt + 2*npts] = radial_eval*(-x*x - y*y + 2*z*z)/2; - basis_eval[ipt + 3*npts] = sqrt_3*radial_eval*x*z; - basis_eval[ipt + 4*npts] = sqrt_3*radial_eval*(x*x - y*y)/2; + basis_eval[ipt + 0*npts] = radial_eval*x1; + basis_eval[ipt + 
1*npts] = radial_eval*x2; + basis_eval[ipt + 2*npts] = x3*x7; + basis_eval[ipt + 3*npts] = radial_eval*x9; + basis_eval[ipt + 4*npts] = sqrt_3*x10*x3; // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = sqrt_3*y*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 1*npts] = sqrt_3*radial_eval_alpha*x*y*z; - basis_x_eval[ipt + 2*npts] = x*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - basis_x_eval[ipt + 3*npts] = sqrt_3*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 4*npts] = sqrt_3*x*(2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; + basis_x_eval[ipt + 0*npts] = x0*x11; + basis_x_eval[ipt + 1*npts] = x12; + basis_x_eval[ipt + 2*npts] = x13*x17; + basis_x_eval[ipt + 3*npts] = x11*x8; + basis_x_eval[ipt + 4*npts] = sqrt_3*x13*(x14 + x18); // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = sqrt_3*x*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 1*npts] = sqrt_3*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 2*npts] = y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - basis_y_eval[ipt + 3*npts] = sqrt_3*radial_eval_alpha*x*y*z; - basis_y_eval[ipt + 4*npts] = sqrt_3*y*(-2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; + basis_y_eval[ipt + 0*npts] = x19*x21; + basis_y_eval[ipt + 1*npts] = x21*x8; + basis_y_eval[ipt + 2*npts] = x17*x22; + basis_y_eval[ipt + 3*npts] = x12; + basis_y_eval[ipt + 4*npts] = 0.5*x0*(x15 + x18); // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = sqrt_3*radial_eval_alpha*x*y*z; - basis_z_eval[ipt + 1*npts] = sqrt_3*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 2*npts] = z*(4*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - basis_z_eval[ipt + 3*npts] = sqrt_3*x*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 4*npts] = sqrt_3*radial_eval_alpha*z*(x*x - y*y)/2; + basis_z_eval[ipt + 0*npts] = x12; + basis_z_eval[ipt + 1*npts] = x0*x24; + basis_z_eval[ipt + 2*npts] = x25*(x16 + x26); + basis_z_eval[ipt + 3*npts] = x19*x24; + basis_z_eval[ipt + 4*npts] = 0.5*radial_eval_alpha*x10*x8; + // Evaluate Laplacian of bfn - basis_lapl_eval[ipt + 0*npts] = sqrt_3*x*y*(7*radial_eval_alpha + radial_eval_alpha_squared*x*x + radial_eval_alpha_squared*y*y + radial_eval_alpha_squared*z*z); - basis_lapl_eval[ipt + 1*npts] = sqrt_3*y*z*(7*radial_eval_alpha + radial_eval_alpha_squared*x*x + radial_eval_alpha_squared*y*y + radial_eval_alpha_squared*z*z); - basis_lapl_eval[ipt + 2*npts] = -7*radial_eval_alpha*x*x/2 - 7*radial_eval_alpha*y*y/2 + 7*radial_eval_alpha*z*z - radial_eval_alpha_squared*x*x*x*x/2 - radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*x*x*z*z/2 - radial_eval_alpha_squared*y*y*y*y/2 + radial_eval_alpha_squared*y*y*z*z/2 + radial_eval_alpha_squared*z*z*z*z; - basis_lapl_eval[ipt + 3*npts] = sqrt_3*x*z*(7*radial_eval_alpha + radial_eval_alpha_squared*x*x + radial_eval_alpha_squared*y*y + radial_eval_alpha_squared*z*z); - basis_lapl_eval[ipt + 4*npts] = sqrt_3*(7*radial_eval_alpha*x*x - 7*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*x*x + radial_eval_alpha_squared*x*x*z*z - radial_eval_alpha_squared*y*y*y*y - radial_eval_alpha_squared*y*y*z*z)/2; + basis_lapl_eval[ipt + 0*npts] = x1*x53; + basis_lapl_eval[ipt + 1*npts] = x2*x53; + basis_lapl_eval[ipt + 2*npts] = 0.5*x30*x7 - 0.5*x33 + 0.5*x38*x7 + 0.5*x51 + 0.5*x54; + basis_lapl_eval[ipt + 3*npts] = x53*x9; + basis_lapl_eval[ipt + 4*npts] = x35*(x10*x38 + x33 + x36 + x52 + x54); + @@ -156,16 +217,16 @@ __global__ 
__launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = sqrt_3*radial_eval*x*y; - ang_eval_1 = sqrt_3*radial_eval*y*z; - ang_eval_2 = radial_eval*(-x*x - y*y + 2*z*z)/2; - ang_eval_3 = sqrt_3*radial_eval*x*z; + ang_eval_0 = radial_eval*x1; + ang_eval_1 = radial_eval*x2; + ang_eval_2 = x3*x7; + ang_eval_3 = radial_eval*x9; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = sqrt_3*radial_eval*(x*x - y*y)/2; + ang_eval_0 = sqrt_3*x10*x3; basis_eval[ipt + 4*npts] = ang_eval_0; @@ -174,18 +235,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; - dang_eval_x_0 = sqrt_3*y*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = sqrt_3*x*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = sqrt_3*radial_eval_alpha*x*y*z; - dang_eval_x_1 = sqrt_3*radial_eval_alpha*x*y*z; - dang_eval_y_1 = sqrt_3*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_1 = sqrt_3*y*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = x*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - dang_eval_y_2 = y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - dang_eval_z_2 = z*(4*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - dang_eval_x_3 = sqrt_3*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_3 = sqrt_3*radial_eval_alpha*x*y*z; - dang_eval_z_3 = sqrt_3*x*(radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x0*x11; + dang_eval_y_0 = x19*x21; + dang_eval_z_0 = x12; + dang_eval_x_1 = x12; + dang_eval_y_1 = x21*x8; + dang_eval_z_1 = x0*x24; + dang_eval_x_2 = x13*x17; + dang_eval_y_2 = x17*x22; + dang_eval_z_2 = x25*(x16 + x26); + dang_eval_x_3 = x11*x8; + dang_eval_y_3 = x12; + dang_eval_z_3 = x19*x24; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; @@ -199,9 +260,9 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = sqrt_3*x*(2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; - dang_eval_y_0 = sqrt_3*y*(-2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; - dang_eval_z_0 = sqrt_3*radial_eval_alpha*z*(x*x - y*y)/2; + dang_eval_x_0 = sqrt_3*x13*(x14 + x18); + dang_eval_y_0 = 0.5*x0*(x15 + x18); + dang_eval_z_0 = 0.5*radial_eval_alpha*x10*x8; basis_x_eval[ipt + 4*npts] = dang_eval_x_0; basis_y_eval[ipt + 4*npts] = dang_eval_y_0; basis_z_eval[ipt + 4*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3.hpp index c5e586b4..8c189f2a 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. 
+ * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -64,7 +68,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_eval = task->bf + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -93,16 +96,29 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel } - + // Common Subexpressions + const auto x0 = 0.25*radial_eval; + const auto x1 = x0*y; + const auto x2 = x*x; + const auto x3 = 3.0*x2; + const auto x4 = y*y; + const auto x5 = -x4; + const auto x6 = radial_eval*z; + const auto x7 = z*z; + const auto x8 = -x2 - x4 + 4.0*x7; + const auto x9 = 0.5*x6; + const auto x10 = 3.0*x4; + const auto x11 = x*x0; + // Evaluate basis function - basis_eval[ipt + 0*npts] = sqrt_10*radial_eval*y*(3*x*x - y*y)/4; - basis_eval[ipt + 1*npts] = sqrt_15*radial_eval*x*y*z; - basis_eval[ipt + 2*npts] = sqrt_6*radial_eval*y*(-x*x - y*y + 4*z*z)/4; - basis_eval[ipt + 3*npts] = radial_eval*z*(-3*x*x - 3*y*y + 2*z*z)/2; - basis_eval[ipt + 4*npts] = sqrt_6*radial_eval*x*(-x*x - y*y + 4*z*z)/4; - basis_eval[ipt + 5*npts] = sqrt_15*radial_eval*z*(x*x - y*y)/2; - basis_eval[ipt + 6*npts] = sqrt_10*radial_eval*x*(x*x - 3*y*y)/4; + basis_eval[ipt + 0*npts] = sqrt_10*x1*(x3 + x5); + basis_eval[ipt + 1*npts] = sqrt_15*x*x6*y; + basis_eval[ipt + 2*npts] = sqrt_6*x1*x8; + basis_eval[ipt + 3*npts] = -x9*(x10 + x3 - 2.0*x7); + basis_eval[ipt + 4*npts] = sqrt_6*x11*x8; + basis_eval[ipt + 5*npts] = sqrt_15*x9*(x2 + x5); + basis_eval[ipt + 6*npts] = sqrt_10*x11*(-x10 + x2); @@ -111,6 +127,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel + + #if 0 // Evaluate the angular part of bfn @@ -122,18 +140,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = sqrt_10*radial_eval*y*(3*x*x - y*y)/4; - ang_eval_1 = sqrt_15*radial_eval*x*y*z; - ang_eval_2 = sqrt_6*radial_eval*y*(-x*x - y*y + 4*z*z)/4; - ang_eval_3 = radial_eval*z*(-3*x*x - 3*y*y + 2*z*z)/2; + ang_eval_0 = sqrt_10*x1*(x3 + x5); + ang_eval_1 = sqrt_15*x*x6*y; + ang_eval_2 = sqrt_6*x1*x8; + ang_eval_3 = -x9*(x10 + x3 - 2.0*x7); basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = sqrt_6*radial_eval*x*(-x*x - y*y + 4*z*z)/4; - ang_eval_1 = sqrt_15*radial_eval*z*(x*x - y*y)/2; - ang_eval_2 = sqrt_10*radial_eval*x*(x*x - 3*y*y)/4; + ang_eval_0 = sqrt_6*x11*x8; + ang_eval_1 = sqrt_15*x9*(x2 + x5); + ang_eval_2 = sqrt_10*x11*(-x10 + x2); basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; basis_eval[ipt + 6*npts] = ang_eval_2; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_gradient.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_gradient.hpp index fe03a72b..bfc1379f 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_gradient.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_gradient.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of 
California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_spherical_gradient_3( +__global__ __launch_bounds__(256,2) void collocation_device_shell_to_task_kernel_spherical_gradient_3( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[8][detail::shell_nprim_max + 1]; + __shared__ double coeff[8][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -67,7 +71,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -99,45 +102,86 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; - + // Common Subexpressions + const auto x0 = 0.25*sqrt_10; + const auto x1 = radial_eval*y; + const auto x2 = x*x; + const auto x3 = 3.0*x2; + const auto x4 = y*y; + const auto x5 = -x4; + const auto x6 = x3 + x5; + const auto x7 = sqrt_15*z; + const auto x8 = x7*y; + const auto x9 = radial_eval*x; + const auto x10 = 0.25*sqrt_6; + const auto x11 = z*z; + const auto x12 = -4.0*x11; + const auto x13 = x12 + x4; + const auto x14 = -x13 - x2; + const auto x15 = 0.5*z; + const auto x16 = 3.0*x4; + const auto x17 = -2.0*x11; + const auto x18 = -x16 - x17 - x3; + const auto x19 = 0.5*sqrt_15; + const auto x20 = x19*z; + const auto x21 = x2 + x5; + const auto x22 = -x16; + const auto x23 = x2 + x22; + const auto x24 = x*y; + const auto x25 = x0*x24; + const auto x26 = 6.0*radial_eval; + const auto x27 = 2.0*radial_eval; + const auto x28 = -x27; + const auto x29 = radial_eval_alpha*x14; + const auto x30 = x10*x24*(x28 + x29); + const auto x31 = -x26; + const auto x32 = radial_eval_alpha*x18 + x31; + const auto x33 = radial_eval_alpha*x21; + const auto x34 = radial_eval*(x22 + x3); + const auto x35 = radial_eval_alpha*x0*z; + const auto x36 = x10*z; + const auto x37 = 8.0*radial_eval + x29; + // Evaluate basis function - basis_eval[ipt + 0*npts] = sqrt_10*radial_eval*y*(3*x*x - y*y)/4; - basis_eval[ipt + 1*npts] = sqrt_15*radial_eval*x*y*z; - basis_eval[ipt + 2*npts] = sqrt_6*radial_eval*y*(-x*x - y*y + 4*z*z)/4; - basis_eval[ipt + 3*npts] = radial_eval*z*(-3*x*x - 3*y*y + 2*z*z)/2; - basis_eval[ipt + 4*npts] = sqrt_6*radial_eval*x*(-x*x - y*y + 4*z*z)/4; - basis_eval[ipt + 5*npts] = sqrt_15*radial_eval*z*(x*x - y*y)/2; - basis_eval[ipt + 6*npts] = sqrt_10*radial_eval*x*(x*x - 3*y*y)/4; + basis_eval[ipt + 0*npts] = x0*x1*x6; + basis_eval[ipt + 1*npts] = x8*x9; + basis_eval[ipt + 2*npts] = x1*x10*x14; + basis_eval[ipt + 3*npts] = radial_eval*x15*x18; + basis_eval[ipt + 4*npts] = x10*x14*x9; + basis_eval[ipt + 5*npts] = radial_eval*x20*x21; + basis_eval[ipt + 6*npts] = x0*x23*x9; // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = sqrt_10*x*y*(6*radial_eval + 
radial_eval_alpha*(3*x*x - y*y))/4; - basis_x_eval[ipt + 1*npts] = sqrt_15*y*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 2*npts] = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - basis_x_eval[ipt + 3*npts] = x*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; - basis_x_eval[ipt + 4*npts] = sqrt_6*(-radial_eval*(3*x*x + y*y - 4*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 4*z*z))/4; - basis_x_eval[ipt + 5*npts] = sqrt_15*x*z*(2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; - basis_x_eval[ipt + 6*npts] = sqrt_10*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y))/4; + basis_x_eval[ipt + 0*npts] = x25*(radial_eval_alpha*x6 + x26); + basis_x_eval[ipt + 1*npts] = x8*(radial_eval + radial_eval_alpha*x2); + basis_x_eval[ipt + 2*npts] = x30; + basis_x_eval[ipt + 3*npts] = x*x15*x32; + basis_x_eval[ipt + 4*npts] = -x10*(radial_eval*(x13 + x3) - radial_eval_alpha*x14*x2); + basis_x_eval[ipt + 5*npts] = x*x20*(x27 + x33); + basis_x_eval[ipt + 6*npts] = x0*(radial_eval_alpha*x2*x23 + x34); // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = sqrt_10*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y))/4; - basis_y_eval[ipt + 1*npts] = sqrt_15*x*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 2*npts] = sqrt_6*(-radial_eval*(x*x + 3*y*y - 4*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 4*z*z))/4; - basis_y_eval[ipt + 3*npts] = y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; - basis_y_eval[ipt + 4*npts] = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - basis_y_eval[ipt + 5*npts] = sqrt_15*y*z*(-2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; - basis_y_eval[ipt + 6*npts] = sqrt_10*x*y*(-6*radial_eval + radial_eval_alpha*(x*x - 3*y*y))/4; + basis_y_eval[ipt + 0*npts] = x0*(radial_eval_alpha*x4*x6 + x34); + basis_y_eval[ipt + 1*npts] = x*x7*(radial_eval + radial_eval_alpha*x4); + basis_y_eval[ipt + 2*npts] = -x10*(radial_eval*(x12 + x16 + x2) - radial_eval_alpha*x14*x4); + basis_y_eval[ipt + 3*npts] = x15*x32*y; + basis_y_eval[ipt + 4*npts] = x30; + basis_y_eval[ipt + 5*npts] = x20*y*(x28 + x33); + basis_y_eval[ipt + 6*npts] = x25*(radial_eval_alpha*x23 + x31); // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = sqrt_10*radial_eval_alpha*y*z*(3*x*x - y*y)/4; - basis_z_eval[ipt + 1*npts] = sqrt_15*x*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 2*npts] = sqrt_6*y*z*(8*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - basis_z_eval[ipt + 3*npts] = -3*radial_eval*(x*x + y*y - 2*z*z)/2 - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 2*z*z)/2; - basis_z_eval[ipt + 4*npts] = sqrt_6*x*z*(8*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - basis_z_eval[ipt + 5*npts] = sqrt_15*(radial_eval + radial_eval_alpha*z*z)*(x*x - y*y)/2; - basis_z_eval[ipt + 6*npts] = sqrt_10*radial_eval_alpha*x*z*(x*x - 3*y*y)/4; + basis_z_eval[ipt + 0*npts] = x35*x6*y; + basis_z_eval[ipt + 1*npts] = sqrt_15*x24*(radial_eval + radial_eval_alpha*x11); + basis_z_eval[ipt + 2*npts] = x36*x37*y; + basis_z_eval[ipt + 3*npts] = -1.5*radial_eval*(x17 + x2 + x4) + 0.5*radial_eval_alpha*x11*x18; + basis_z_eval[ipt + 4*npts] = x*x36*x37; + basis_z_eval[ipt + 5*npts] = x19*x21*(radial_eval + radial_eval_alpha*x11); + basis_z_eval[ipt + 6*npts] = x*x23*x35; + + @@ -154,18 +198,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = sqrt_10*radial_eval*y*(3*x*x - y*y)/4; - ang_eval_1 
= sqrt_15*radial_eval*x*y*z; - ang_eval_2 = sqrt_6*radial_eval*y*(-x*x - y*y + 4*z*z)/4; - ang_eval_3 = radial_eval*z*(-3*x*x - 3*y*y + 2*z*z)/2; + ang_eval_0 = x0*x1*x6; + ang_eval_1 = x8*x9; + ang_eval_2 = x1*x10*x14; + ang_eval_3 = radial_eval*x15*x18; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = sqrt_6*radial_eval*x*(-x*x - y*y + 4*z*z)/4; - ang_eval_1 = sqrt_15*radial_eval*z*(x*x - y*y)/2; - ang_eval_2 = sqrt_10*radial_eval*x*(x*x - 3*y*y)/4; + ang_eval_0 = x10*x14*x9; + ang_eval_1 = radial_eval*x20*x21; + ang_eval_2 = x0*x23*x9; basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; basis_eval[ipt + 6*npts] = ang_eval_2; @@ -176,18 +220,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; - dang_eval_x_0 = sqrt_10*x*y*(6*radial_eval + radial_eval_alpha*(3*x*x - y*y))/4; - dang_eval_y_0 = sqrt_10*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y))/4; - dang_eval_z_0 = sqrt_10*radial_eval_alpha*y*z*(3*x*x - y*y)/4; - dang_eval_x_1 = sqrt_15*y*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = sqrt_15*x*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_1 = sqrt_15*x*y*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - dang_eval_y_2 = sqrt_6*(-radial_eval*(x*x + 3*y*y - 4*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 4*z*z))/4; - dang_eval_z_2 = sqrt_6*y*z*(8*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - dang_eval_x_3 = x*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; - dang_eval_y_3 = y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; - dang_eval_z_3 = -3*radial_eval*(x*x + y*y - 2*z*z)/2 - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 2*z*z)/2; + dang_eval_x_0 = x25*(radial_eval_alpha*x6 + x26); + dang_eval_y_0 = x0*(radial_eval_alpha*x4*x6 + x34); + dang_eval_z_0 = x35*x6*y; + dang_eval_x_1 = x8*(radial_eval + radial_eval_alpha*x2); + dang_eval_y_1 = x*x7*(radial_eval + radial_eval_alpha*x4); + dang_eval_z_1 = sqrt_15*x24*(radial_eval + radial_eval_alpha*x11); + dang_eval_x_2 = x30; + dang_eval_y_2 = -x10*(radial_eval*(x12 + x16 + x2) - radial_eval_alpha*x14*x4); + dang_eval_z_2 = x36*x37*y; + dang_eval_x_3 = x*x15*x32; + dang_eval_y_3 = x15*x32*y; + dang_eval_z_3 = -1.5*radial_eval*(x17 + x2 + x4) + 0.5*radial_eval_alpha*x11*x18; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; @@ -201,15 +245,15 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = sqrt_6*(-radial_eval*(3*x*x + y*y - 4*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 4*z*z))/4; - dang_eval_y_0 = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - dang_eval_z_0 = sqrt_6*x*z*(8*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - dang_eval_x_1 = sqrt_15*x*z*(2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; - dang_eval_y_1 = sqrt_15*y*z*(-2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; - dang_eval_z_1 = sqrt_15*(radial_eval + radial_eval_alpha*z*z)*(x*x - y*y)/2; - dang_eval_x_2 = sqrt_10*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y))/4; - 
dang_eval_y_2 = sqrt_10*x*y*(-6*radial_eval + radial_eval_alpha*(x*x - 3*y*y))/4; - dang_eval_z_2 = sqrt_10*radial_eval_alpha*x*z*(x*x - 3*y*y)/4; + dang_eval_x_0 = -x10*(radial_eval*(x13 + x3) - radial_eval_alpha*x14*x2); + dang_eval_y_0 = x30; + dang_eval_z_0 = x*x36*x37; + dang_eval_x_1 = x*x20*(x27 + x33); + dang_eval_y_1 = x20*y*(x28 + x33); + dang_eval_z_1 = x19*x21*(radial_eval + radial_eval_alpha*x11); + dang_eval_x_2 = x0*(radial_eval_alpha*x2*x23 + x34); + dang_eval_y_2 = x25*(radial_eval_alpha*x23 + x31); + dang_eval_z_2 = x*x23*x35; basis_x_eval[ipt + 4*npts] = dang_eval_x_0; basis_y_eval[ipt + 4*npts] = dang_eval_y_0; basis_z_eval[ipt + 4*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_hessian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_hessian.hpp index 04ba8677..b85b6cb4 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_hessian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_hessian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_spherical_hessian_3( +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_spherical_hessian_3( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; @@ -108,99 +111,211 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = 0.25*sqrt_10; + const auto x1 = x0*y; + const auto x2 = x*x; + const auto x3 = 3.0*x2; + const auto x4 = y*y; + const auto x5 = -x4; + const auto x6 = x3 + x5; + const auto x7 = sqrt_15*z; + const auto x8 = x7*y; + const auto x9 = x*x8; + const auto x10 = 0.25*sqrt_6; + const auto x11 = x10*y; + const auto x12 = z*z; + const auto x13 = -4.0*x12; + const auto x14 = x13 + x4; + const auto x15 = -x14 - x2; + const auto x16 = 0.5*z; + const auto x17 = 3.0*x4; + const auto x18 = -2.0*x12; + const auto x19 = -x17 - x18 - x3; + const auto x20 = x*x10; + const auto x21 = 
0.5*sqrt_15; + const auto x22 = x21*z; + const auto x23 = x2 + x5; + const auto x24 = x*x0; + const auto x25 = -x17; + const auto x26 = x2 + x25; + const auto x27 = x*x1; + const auto x28 = 6.0*radial_eval; + const auto x29 = radial_eval + radial_eval_alpha*x2; + const auto x30 = x*x11; + const auto x31 = 2.0*radial_eval; + const auto x32 = -x31; + const auto x33 = radial_eval_alpha*x15; + const auto x34 = x30*(x32 + x33); + const auto x35 = x*x16; + const auto x36 = -x28; + const auto x37 = radial_eval_alpha*x19 + x36; + const auto x38 = -x14 - x3; + const auto x39 = x15*x2; + const auto x40 = x*x22; + const auto x41 = radial_eval_alpha*x23; + const auto x42 = x31 + x41; + const auto x43 = x25 + x3; + const auto x44 = radial_eval*x43; + const auto x45 = x2*x26; + const auto x46 = x4*x6; + const auto x47 = radial_eval_alpha*x4; + const auto x48 = radial_eval + x47; + const auto x49 = -x13 - x17 - x2; + const auto x50 = x15*x4; + const auto x51 = x32 + x41; + const auto x52 = radial_eval_alpha*z; + const auto x53 = sqrt_15*y; + const auto x54 = radial_eval_alpha*x12; + const auto x55 = 8.0*radial_eval; + const auto x56 = x33 + x55; + const auto x57 = -x18 - x2 - x4; + const auto x58 = x12*x19; + const auto x59 = x12*x23; + const auto x60 = radial_eval_alpha_squared*x2; + const auto x61 = radial_eval_alpha + x60; + const auto x62 = x6*x61; + const auto x63 = 12.0*radial_eval_alpha; + const auto x64 = x2*x63; + const auto x65 = x28 + x64; + const auto x66 = 3.0*radial_eval_alpha; + const auto x67 = 4.0*radial_eval_alpha; + const auto x68 = x2*x67; + const auto x69 = x31 + x68; + const auto x70 = x15*x61; + const auto x71 = 2.0*radial_eval_alpha; + const auto x72 = x38*x71 + x70; + const auto x73 = x23*x61; + const auto x74 = x43*x71; + const auto x75 = x26*x61 + x74; + const auto x76 = 6.0*radial_eval_alpha; + const auto x77 = radial_eval_alpha*x43; + const auto x78 = radial_eval_alpha_squared*x46 + x77; + const auto x79 = radial_eval_alpha*x49 + radial_eval_alpha_squared*x50; + const auto x80 = radial_eval_alpha*x38 + radial_eval_alpha_squared*x39; + const auto x81 = radial_eval_alpha_squared*x45 + x77; + const auto x82 = x27*z; + const auto x83 = x30*z*(radial_eval_alpha_squared*x15 + x76); + const auto x84 = radial_eval_alpha_squared*x58 - x12*x76 + x36 + x57*x66; + const auto x85 = x10*z; + const auto x86 = 8.0*radial_eval_alpha; + const auto x87 = x12*x71; + const auto x88 = radial_eval_alpha_squared*x59; + const auto x89 = x0*z; + const auto x90 = radial_eval_alpha_squared*x4; + const auto x91 = radial_eval_alpha + x90; + const auto x92 = x6*x91 + x74; + const auto x93 = x15*x91; + const auto x94 = x49*x71 + x93; + const auto x95 = x4*x63; + const auto x96 = x28 + x95; + const auto x97 = x4*x67; + const auto x98 = x31 + x97; + const auto x99 = radial_eval_alpha_squared*x12; + const auto x100 = radial_eval_alpha + x99; + const auto x101 = x100*x6; + const auto x102 = 16.0*radial_eval_alpha*x12 + x100*x15; + const auto x103 = x102 + x55; + const auto x104 = x100*x19 + x57*x76; + const auto x105 = x23*(x100 + x71); + const auto x106 = x100*x26; + const auto x107 = -x95; + const auto x108 = -x97; + // Evaluate basis function - basis_eval[ipt + 0*npts] = sqrt_10*radial_eval*y*(3*x*x - y*y)/4; - basis_eval[ipt + 1*npts] = sqrt_15*radial_eval*x*y*z; - basis_eval[ipt + 2*npts] = sqrt_6*radial_eval*y*(-x*x - y*y + 4*z*z)/4; - basis_eval[ipt + 3*npts] = radial_eval*z*(-3*x*x - 3*y*y + 2*z*z)/2; - basis_eval[ipt + 4*npts] = sqrt_6*radial_eval*x*(-x*x - y*y + 4*z*z)/4; - basis_eval[ipt + 5*npts] = 
sqrt_15*radial_eval*z*(x*x - y*y)/2; - basis_eval[ipt + 6*npts] = sqrt_10*radial_eval*x*(x*x - 3*y*y)/4; + basis_eval[ipt + 0*npts] = radial_eval*x1*x6; + basis_eval[ipt + 1*npts] = radial_eval*x9; + basis_eval[ipt + 2*npts] = radial_eval*x11*x15; + basis_eval[ipt + 3*npts] = radial_eval*x16*x19; + basis_eval[ipt + 4*npts] = radial_eval*x15*x20; + basis_eval[ipt + 5*npts] = radial_eval*x22*x23; + basis_eval[ipt + 6*npts] = radial_eval*x24*x26; // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = sqrt_10*x*y*(6*radial_eval + radial_eval_alpha*(3*x*x - y*y))/4; - basis_x_eval[ipt + 1*npts] = sqrt_15*y*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 2*npts] = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - basis_x_eval[ipt + 3*npts] = x*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; - basis_x_eval[ipt + 4*npts] = sqrt_6*(-radial_eval*(3*x*x + y*y - 4*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 4*z*z))/4; - basis_x_eval[ipt + 5*npts] = sqrt_15*x*z*(2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; - basis_x_eval[ipt + 6*npts] = sqrt_10*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y))/4; + basis_x_eval[ipt + 0*npts] = x27*(radial_eval_alpha*x6 + x28); + basis_x_eval[ipt + 1*npts] = x29*x8; + basis_x_eval[ipt + 2*npts] = x34; + basis_x_eval[ipt + 3*npts] = x35*x37; + basis_x_eval[ipt + 4*npts] = x10*(radial_eval*x38 + radial_eval_alpha*x39); + basis_x_eval[ipt + 5*npts] = x40*x42; + basis_x_eval[ipt + 6*npts] = x0*(radial_eval_alpha*x45 + x44); // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = sqrt_10*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y))/4; - basis_y_eval[ipt + 1*npts] = sqrt_15*x*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 2*npts] = sqrt_6*(-radial_eval*(x*x + 3*y*y - 4*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 4*z*z))/4; - basis_y_eval[ipt + 3*npts] = y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; - basis_y_eval[ipt + 4*npts] = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - basis_y_eval[ipt + 5*npts] = sqrt_15*y*z*(-2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; - basis_y_eval[ipt + 6*npts] = sqrt_10*x*y*(-6*radial_eval + radial_eval_alpha*(x*x - 3*y*y))/4; + basis_y_eval[ipt + 0*npts] = x0*(radial_eval_alpha*x46 + x44); + basis_y_eval[ipt + 1*npts] = x*x48*x7; + basis_y_eval[ipt + 2*npts] = x10*(radial_eval*x49 + radial_eval_alpha*x50); + basis_y_eval[ipt + 3*npts] = x16*x37*y; + basis_y_eval[ipt + 4*npts] = x34; + basis_y_eval[ipt + 5*npts] = x22*x51*y; + basis_y_eval[ipt + 6*npts] = x27*(radial_eval_alpha*x26 + x36); // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = sqrt_10*radial_eval_alpha*y*z*(3*x*x - y*y)/4; - basis_z_eval[ipt + 1*npts] = sqrt_15*x*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 2*npts] = sqrt_6*y*z*(8*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - basis_z_eval[ipt + 3*npts] = -3*radial_eval*(x*x + y*y - 2*z*z)/2 - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 2*z*z)/2; - basis_z_eval[ipt + 4*npts] = sqrt_6*x*z*(8*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - basis_z_eval[ipt + 5*npts] = sqrt_15*(radial_eval + radial_eval_alpha*z*z)*(x*x - y*y)/2; - basis_z_eval[ipt + 6*npts] = sqrt_10*radial_eval_alpha*x*z*(x*x - 3*y*y)/4; + basis_z_eval[ipt + 0*npts] = x1*x52*x6; + basis_z_eval[ipt + 1*npts] = x*x53*(radial_eval + x54); + basis_z_eval[ipt + 2*npts] = x11*x56*z; + basis_z_eval[ipt + 3*npts] = 
1.5*radial_eval*x57 + 0.5*radial_eval_alpha*x58; + basis_z_eval[ipt + 4*npts] = x20*x56*z; + basis_z_eval[ipt + 5*npts] = x21*(radial_eval*x23 + radial_eval_alpha*x59); + basis_z_eval[ipt + 6*npts] = x24*x26*x52; // Evaluate second derivative of bfn wrt xx - basis_xx_eval[ipt + 0*npts] = sqrt_10*y*(6*radial_eval + 12*radial_eval_alpha*x*x + (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(3*x*x - y*y))/4; - basis_xx_eval[ipt + 1*npts] = sqrt_15*x*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 2*npts] = sqrt_6*y*(-2*radial_eval - 4*radial_eval_alpha*x*x - (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x + y*y - 4*z*z))/4; - basis_xx_eval[ipt + 3*npts] = z*(-6*radial_eval - 12*radial_eval_alpha*x*x - (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(3*x*x + 3*y*y - 2*z*z))/2; - basis_xx_eval[ipt + 4*npts] = sqrt_6*x*(-6*radial_eval - 2*radial_eval_alpha*(3*x*x + y*y - 4*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x + y*y - 4*z*z))/4; - basis_xx_eval[ipt + 5*npts] = sqrt_15*z*(2*radial_eval + 4*radial_eval_alpha*x*x + (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x - y*y))/2; - basis_xx_eval[ipt + 6*npts] = sqrt_10*x*(6*radial_eval + 6*radial_eval_alpha*(x*x - y*y) + (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x - 3*y*y))/4; + basis_xx_eval[ipt + 0*npts] = x1*(x62 + x65); + basis_xx_eval[ipt + 1*npts] = x9*(x60 + x66); + basis_xx_eval[ipt + 2*npts] = x11*(x15*x61 - x69); + basis_xx_eval[ipt + 3*npts] = x16*(x19*x61 - x65); + basis_xx_eval[ipt + 4*npts] = x20*(x36 + x72); + basis_xx_eval[ipt + 5*npts] = x22*(x69 + x73); + basis_xx_eval[ipt + 6*npts] = x24*(x28 + x75); // Evaluate second derivative of bfn wrt xy - basis_xy_eval[ipt + 0*npts] = sqrt_10*x*(6*radial_eval + 3*radial_eval_alpha*x*x + 3*radial_eval_alpha*y*y + 3*radial_eval_alpha_squared*x*x*y*y - radial_eval_alpha_squared*y*y*y*y)/4; - basis_xy_eval[ipt + 1*npts] = sqrt_15*z*(radial_eval + radial_eval_alpha*x*x + radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y); - basis_xy_eval[ipt + 2*npts] = sqrt_6*x*(-2*radial_eval - 2*radial_eval_alpha*y*y - radial_eval_alpha*(x*x + 3*y*y - 4*z*z) - radial_eval_alpha_squared*y*y*(x*x + y*y - 4*z*z))/4; - basis_xy_eval[ipt + 3*npts] = x*y*z*(-12*radial_eval_alpha - radial_eval_alpha_squared*(3*x*x + 3*y*y - 2*z*z))/2; - basis_xy_eval[ipt + 4*npts] = sqrt_6*y*(-2*radial_eval - 2*radial_eval_alpha*x*x - radial_eval_alpha*(3*x*x + y*y - 4*z*z) - radial_eval_alpha_squared*x*x*(x*x + y*y - 4*z*z))/4; - basis_xy_eval[ipt + 5*npts] = sqrt_15*radial_eval_alpha_squared*x*y*z*(x*x - y*y)/2; - basis_xy_eval[ipt + 6*npts] = sqrt_10*y*(-6*radial_eval - 3*radial_eval_alpha*x*x - 3*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*x*x - 3*radial_eval_alpha_squared*x*x*y*y)/4; + basis_xy_eval[ipt + 0*npts] = x24*(x28 + x4*x76 + x78); + basis_xy_eval[ipt + 1*npts] = x7*(radial_eval_alpha_squared*x2*x4 + x29 + x47); + basis_xy_eval[ipt + 2*npts] = x20*(x32 - x4*x71 + x79); + basis_xy_eval[ipt + 3*npts] = x35*y*(radial_eval_alpha_squared*x19 - x63); + basis_xy_eval[ipt + 4*npts] = x11*(-x2*x71 + x32 + x80); + basis_xy_eval[ipt + 5*npts] = radial_eval_alpha_squared*x23*x40*y; + basis_xy_eval[ipt + 6*npts] = x1*(-x2*x76 + x36 + x81); // Evaluate second derivative of bfn wrt xz - basis_xz_eval[ipt + 0*npts] = sqrt_10*x*y*z*(6*radial_eval_alpha + radial_eval_alpha_squared*(3*x*x - y*y))/4; - basis_xz_eval[ipt + 1*npts] = sqrt_15*y*(radial_eval + radial_eval_alpha*x*x + radial_eval_alpha*z*z + 
radial_eval_alpha_squared*x*x*z*z); - basis_xz_eval[ipt + 2*npts] = sqrt_6*x*y*z*(6*radial_eval_alpha - radial_eval_alpha_squared*(x*x + y*y - 4*z*z))/4; - basis_xz_eval[ipt + 3*npts] = x*(-6*radial_eval - 3*radial_eval_alpha*x*x - 3*radial_eval_alpha*y*y - 3*radial_eval_alpha_squared*x*x*z*z - 3*radial_eval_alpha_squared*y*y*z*z + 2*radial_eval_alpha_squared*z*z*z*z)/2; - basis_xz_eval[ipt + 4*npts] = sqrt_6*z*(8*radial_eval + 8*radial_eval_alpha*x*x - radial_eval_alpha*(3*x*x + y*y - 4*z*z) - radial_eval_alpha_squared*x*x*(x*x + y*y - 4*z*z))/4; - basis_xz_eval[ipt + 5*npts] = sqrt_15*x*(2*radial_eval + 2*radial_eval_alpha*z*z + radial_eval_alpha*(x*x - y*y) + radial_eval_alpha_squared*z*z*(x*x - y*y))/2; - basis_xz_eval[ipt + 6*npts] = sqrt_10*z*(3*radial_eval_alpha*(x*x - y*y) + radial_eval_alpha_squared*x*x*(x*x - 3*y*y))/4; + basis_xz_eval[ipt + 0*npts] = x82*(radial_eval_alpha_squared*x6 + x76); + basis_xz_eval[ipt + 1*npts] = x53*(radial_eval_alpha_squared*x12*x2 + x29 + x54); + basis_xz_eval[ipt + 2*npts] = x83; + basis_xz_eval[ipt + 3*npts] = 0.5*x*x84; + basis_xz_eval[ipt + 4*npts] = x85*(x2*x86 + x55 + x80); + basis_xz_eval[ipt + 5*npts] = x*x21*(x42 + x87 + x88); + basis_xz_eval[ipt + 6*npts] = x81*x89; // Evaluate second derivative of bfn wrt yy - basis_yy_eval[ipt + 0*npts] = sqrt_10*y*(-6*radial_eval - 6*radial_eval_alpha*(-x*x + y*y) + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(3*x*x - y*y))/4; - basis_yy_eval[ipt + 1*npts] = sqrt_15*x*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 2*npts] = sqrt_6*y*(-6*radial_eval - 2*radial_eval_alpha*(x*x + 3*y*y - 4*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x + y*y - 4*z*z))/4; - basis_yy_eval[ipt + 3*npts] = z*(-6*radial_eval - 12*radial_eval_alpha*y*y - (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(3*x*x + 3*y*y - 2*z*z))/2; - basis_yy_eval[ipt + 4*npts] = sqrt_6*x*(-2*radial_eval - 4*radial_eval_alpha*y*y - (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x + y*y - 4*z*z))/4; - basis_yy_eval[ipt + 5*npts] = sqrt_15*z*(-2*radial_eval - 4*radial_eval_alpha*y*y + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x - y*y))/2; - basis_yy_eval[ipt + 6*npts] = sqrt_10*x*(-6*radial_eval - 12*radial_eval_alpha*y*y + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x - 3*y*y))/4; + basis_yy_eval[ipt + 0*npts] = x1*(x36 + x92); + basis_yy_eval[ipt + 1*npts] = x9*(x66 + x90); + basis_yy_eval[ipt + 2*npts] = x11*(x36 + x94); + basis_yy_eval[ipt + 3*npts] = x16*(x19*x91 - x96); + basis_yy_eval[ipt + 4*npts] = x20*(x15*x91 - x98); + basis_yy_eval[ipt + 5*npts] = x22*(x23*x91 - x98); + basis_yy_eval[ipt + 6*npts] = x24*(x26*x91 - x96); // Evaluate second derivative of bfn wrt yz - basis_yz_eval[ipt + 0*npts] = sqrt_10*z*(-3*radial_eval_alpha*(-x*x + y*y) + radial_eval_alpha_squared*y*y*(3*x*x - y*y))/4; - basis_yz_eval[ipt + 1*npts] = sqrt_15*x*(radial_eval + radial_eval_alpha*y*y + radial_eval_alpha*z*z + radial_eval_alpha_squared*y*y*z*z); - basis_yz_eval[ipt + 2*npts] = sqrt_6*z*(8*radial_eval + 8*radial_eval_alpha*y*y - radial_eval_alpha*(x*x + 3*y*y - 4*z*z) - radial_eval_alpha_squared*y*y*(x*x + y*y - 4*z*z))/4; - basis_yz_eval[ipt + 3*npts] = y*(-6*radial_eval - 3*radial_eval_alpha*x*x - 3*radial_eval_alpha*y*y - 3*radial_eval_alpha_squared*x*x*z*z - 3*radial_eval_alpha_squared*y*y*z*z + 2*radial_eval_alpha_squared*z*z*z*z)/2; - basis_yz_eval[ipt + 4*npts] = sqrt_6*x*y*z*(6*radial_eval_alpha - radial_eval_alpha_squared*(x*x + y*y - 4*z*z))/4; - 
basis_yz_eval[ipt + 5*npts] = sqrt_15*y*(-2*radial_eval - 2*radial_eval_alpha*z*z + radial_eval_alpha*(x*x - y*y) + radial_eval_alpha_squared*z*z*(x*x - y*y))/2; - basis_yz_eval[ipt + 6*npts] = sqrt_10*x*y*z*(-6*radial_eval_alpha + radial_eval_alpha_squared*(x*x - 3*y*y))/4; + basis_yz_eval[ipt + 0*npts] = x78*x89; + basis_yz_eval[ipt + 1*npts] = sqrt_15*x*(radial_eval_alpha_squared*x12*x4 + x48 + x54); + basis_yz_eval[ipt + 2*npts] = x85*(x4*x86 + x55 + x79); + basis_yz_eval[ipt + 3*npts] = 0.5*x84*y; + basis_yz_eval[ipt + 4*npts] = x83; + basis_yz_eval[ipt + 5*npts] = x21*y*(x51 - x87 + x88); + basis_yz_eval[ipt + 6*npts] = x82*(radial_eval_alpha_squared*x26 - x76); // Evaluate second derivative of bfn wrt zz - basis_zz_eval[ipt + 0*npts] = sqrt_10*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z)*(3*x*x - y*y)/4; - basis_zz_eval[ipt + 1*npts] = sqrt_15*x*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 2*npts] = sqrt_6*y*(8*radial_eval + 16*radial_eval_alpha*z*z - (radial_eval_alpha + radial_eval_alpha_squared*z*z)*(x*x + y*y - 4*z*z))/4; - basis_zz_eval[ipt + 3*npts] = z*(12*radial_eval - 6*radial_eval_alpha*(x*x + y*y - 2*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*z*z)*(3*x*x + 3*y*y - 2*z*z))/2; - basis_zz_eval[ipt + 4*npts] = sqrt_6*x*(8*radial_eval + 16*radial_eval_alpha*z*z - (radial_eval_alpha + radial_eval_alpha_squared*z*z)*(x*x + y*y - 4*z*z))/4; - basis_zz_eval[ipt + 5*npts] = sqrt_15*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z)*(x*x - y*y)/2; - basis_zz_eval[ipt + 6*npts] = sqrt_10*x*(radial_eval_alpha + radial_eval_alpha_squared*z*z)*(x*x - 3*y*y)/4; + basis_zz_eval[ipt + 0*npts] = x1*x101; + basis_zz_eval[ipt + 1*npts] = x9*(x66 + x99); + basis_zz_eval[ipt + 2*npts] = x103*x11; + basis_zz_eval[ipt + 3*npts] = x16*(12.0*radial_eval + x104); + basis_zz_eval[ipt + 4*npts] = x103*x20; + basis_zz_eval[ipt + 5*npts] = x105*x22; + basis_zz_eval[ipt + 6*npts] = x106*x24; + + @@ -216,18 +331,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = sqrt_10*radial_eval*y*(3*x*x - y*y)/4; - ang_eval_1 = sqrt_15*radial_eval*x*y*z; - ang_eval_2 = sqrt_6*radial_eval*y*(-x*x - y*y + 4*z*z)/4; - ang_eval_3 = radial_eval*z*(-3*x*x - 3*y*y + 2*z*z)/2; + ang_eval_0 = radial_eval*x1*x6; + ang_eval_1 = radial_eval*x9; + ang_eval_2 = radial_eval*x11*x15; + ang_eval_3 = radial_eval*x16*x19; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = sqrt_6*radial_eval*x*(-x*x - y*y + 4*z*z)/4; - ang_eval_1 = sqrt_15*radial_eval*z*(x*x - y*y)/2; - ang_eval_2 = sqrt_10*radial_eval*x*(x*x - 3*y*y)/4; + ang_eval_0 = radial_eval*x15*x20; + ang_eval_1 = radial_eval*x22*x23; + ang_eval_2 = radial_eval*x24*x26; basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; basis_eval[ipt + 6*npts] = ang_eval_2; @@ -238,18 +353,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; - dang_eval_x_0 = sqrt_10*x*y*(6*radial_eval + radial_eval_alpha*(3*x*x - y*y))/4; - dang_eval_y_0 = sqrt_10*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y))/4; - dang_eval_z_0 = sqrt_10*radial_eval_alpha*y*z*(3*x*x - y*y)/4; - dang_eval_x_1 = sqrt_15*y*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = 
sqrt_15*x*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_1 = sqrt_15*x*y*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - dang_eval_y_2 = sqrt_6*(-radial_eval*(x*x + 3*y*y - 4*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 4*z*z))/4; - dang_eval_z_2 = sqrt_6*y*z*(8*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - dang_eval_x_3 = x*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; - dang_eval_y_3 = y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; - dang_eval_z_3 = -3*radial_eval*(x*x + y*y - 2*z*z)/2 - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 2*z*z)/2; + dang_eval_x_0 = x27*(radial_eval_alpha*x6 + x28); + dang_eval_y_0 = x0*(radial_eval_alpha*x46 + x44); + dang_eval_z_0 = x1*x52*x6; + dang_eval_x_1 = x29*x8; + dang_eval_y_1 = x*x48*x7; + dang_eval_z_1 = x*x53*(radial_eval + x54); + dang_eval_x_2 = x34; + dang_eval_y_2 = x10*(radial_eval*x49 + radial_eval_alpha*x50); + dang_eval_z_2 = x11*x56*z; + dang_eval_x_3 = x35*x37; + dang_eval_y_3 = x16*x37*y; + dang_eval_z_3 = 1.5*radial_eval*x57 + 0.5*radial_eval_alpha*x58; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; @@ -263,15 +378,15 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = sqrt_6*(-radial_eval*(3*x*x + y*y - 4*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 4*z*z))/4; - dang_eval_y_0 = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - dang_eval_z_0 = sqrt_6*x*z*(8*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - dang_eval_x_1 = sqrt_15*x*z*(2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; - dang_eval_y_1 = sqrt_15*y*z*(-2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; - dang_eval_z_1 = sqrt_15*(radial_eval + radial_eval_alpha*z*z)*(x*x - y*y)/2; - dang_eval_x_2 = sqrt_10*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y))/4; - dang_eval_y_2 = sqrt_10*x*y*(-6*radial_eval + radial_eval_alpha*(x*x - 3*y*y))/4; - dang_eval_z_2 = sqrt_10*radial_eval_alpha*x*z*(x*x - 3*y*y)/4; + dang_eval_x_0 = x10*(radial_eval*x38 + radial_eval_alpha*x39); + dang_eval_y_0 = x34; + dang_eval_z_0 = x20*x56*z; + dang_eval_x_1 = x40*x42; + dang_eval_y_1 = x22*x51*y; + dang_eval_z_1 = x21*(radial_eval*x23 + radial_eval_alpha*x59); + dang_eval_x_2 = x0*(radial_eval_alpha*x45 + x44); + dang_eval_y_2 = x27*(radial_eval_alpha*x26 + x36); + dang_eval_z_2 = x24*x26*x52; basis_x_eval[ipt + 4*npts] = dang_eval_x_0; basis_y_eval[ipt + 4*npts] = dang_eval_y_0; basis_z_eval[ipt + 4*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_lapgrad.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_lapgrad.hpp new file mode 100644 index 00000000..a58a8b4e --- /dev/null +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_lapgrad.hpp @@ -0,0 +1,514 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
+ *
+ * See LICENSE.txt for details
+ */
+#pragma once
+#include "collocation_device_constants.hpp"
+#include "device/xc_device_task.hpp"
+#include "device_specific/cuda_device_constants.hpp"
+#include "device/common/shell_to_task.hpp"
+#include 
+
+namespace GauXC {
+
+
+__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_spherical_lapgrad_3(
+  uint32_t nshell,
+  ShellToTaskDevice* __restrict__ shell_to_task,
+  XCDeviceTask* __restrict__ device_tasks
+) {
+
+
+  __shared__ double alpha[4][detail::shell_nprim_max + 1];
+  __shared__ double coeff[4][detail::shell_nprim_max + 1];
+  double* my_alpha = alpha[threadIdx.x/32];
+  double* my_coeff = coeff[threadIdx.x/32];
+
+  for( auto ish = blockIdx.z; ish < nshell; ish += gridDim.z ) {
+    const uint32_t ntasks = shell_to_task[ish].ntask;
+    const auto shell = shell_to_task[ish].shell_device;
+    const auto task_idx = shell_to_task[ish].task_idx_device;
+    const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device;
+
+
+    // Load Shell Data into registers / SM
+    const uint32_t nprim = shell->nprim();
+    const double3 O = *reinterpret_cast<const double3*>(shell->O_data());
+
+    const int global_warp_id = (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size;
+    const int nwarp_global = max((blockDim.x*gridDim.x) / cuda::warp_size,1);
+
+    // Read in coeffs/exps into SM on first warp
+    {
+      auto* coeff_gm = shell->coeff_data();
+      auto* alpha_gm = shell->alpha_data();
+      static_assert( detail::shell_nprim_max == cuda::warp_size );
+      const int warp_rank = threadIdx.x % cuda::warp_size;
+      my_alpha[warp_rank] = alpha_gm[warp_rank];
+      my_coeff[warp_rank] = coeff_gm[warp_rank];
+    }
+
+    // Loop over tasks assigned to shells
+    // Place each task on a different warp + schedule across blocks
+    for( int itask = global_warp_id; itask < ntasks; itask += nwarp_global ) {
+
+      const auto* task = device_tasks + task_idx[itask];
+      const auto* __restrict__ points_x = task->points_x;
+      const auto* __restrict__ points_y = task->points_y;
+      const auto* __restrict__ points_z = task->points_z;
+      const uint32_t npts = task->npts;
+      const size_t shoff = task_shell_offs[itask] * npts;
+
+      auto* __restrict__ basis_eval = task->bf + shoff;
+      auto* __restrict__ basis_x_eval = task->dbfx + shoff;
+      auto* __restrict__ basis_y_eval = task->dbfy + shoff;
+      auto* __restrict__ basis_z_eval = task->dbfz + shoff;
+      auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff;
+      auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff;
+      auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff;
+      auto* __restrict__ basis_yy_eval = task->d2bfyy + shoff;
+      auto* __restrict__ basis_yz_eval = task->d2bfyz + shoff;
+      auto* __restrict__ basis_zz_eval = task->d2bfzz + shoff;
+      auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff;
+      auto* __restrict__ basis_lapl_x_eval = task->d3bflapl_x + shoff;
+      auto* __restrict__ basis_lapl_y_eval = task->d3bflapl_y + shoff;
+      auto* __restrict__ basis_lapl_z_eval = task->d3bflapl_z + shoff;
+
+      // Loop over points in task
+      // Assign each point to separate thread within the warp
+      #pragma unroll 1
+      for( int ipt = threadIdx.x % cuda::warp_size; ipt < npts; ipt += cuda::warp_size ) {
+        //const double3 point = points[ipt];
+        double3 point;
+        point.x = points_x[ipt];
+        point.y = points_y[ipt];
+        point.z = points_z[ipt];
+
+
+        const auto x = point.x - O.x;
+        const auto y = point.y - O.y;
+        const auto z = point.z - O.z;
+        const auto rsq = x*x + y*y + z*z;
+
+        // Evaluate radial part of bfn
+        double radial_eval = 0.;
+        double radial_eval_alpha = 0.;
+ 
double radial_eval_alpha_squared = 0.; + double radial_eval_alpha_cubed = 0.; + + #pragma unroll 1 + for( uint32_t i = 0; i < nprim; ++i ) { + const auto a = my_alpha[i]; + const auto e = my_coeff[i] * std::exp( - a * rsq ); + + radial_eval += e; + radial_eval_alpha += a * e; + radial_eval_alpha_squared += a * a * e; + radial_eval_alpha_cubed += a * a * a * e; + } + + radial_eval_alpha *= -2; + radial_eval_alpha_squared *= 4; + radial_eval_alpha_cubed *= -8; + + // Common Subexpressions + const auto x0 = 0.25*sqrt_10; + const auto x1 = x0*y; + const auto x2 = x*x; + const auto x3 = x2; + const auto x4 = 3.0*x3; + const auto x5 = y*y; + const auto x6 = x5; + const auto x7 = -x6; + const auto x8 = x4 + x7; + const auto x9 = sqrt_15*z; + const auto x10 = x9*y; + const auto x11 = x*x10; + const auto x12 = 0.25*sqrt_6; + const auto x13 = x12*y; + const auto x14 = z*z; + const auto x15 = x14; + const auto x16 = -4.0*x15; + const auto x17 = x16 + x6; + const auto x18 = -x17 - x3; + const auto x19 = 0.5*z; + const auto x20 = 3.0*x6; + const auto x21 = -2.0*x15; + const auto x22 = -x20 - x21 - x4; + const auto x23 = x*x12; + const auto x24 = 0.5*sqrt_15; + const auto x25 = x24*z; + const auto x26 = x3 + x7; + const auto x27 = x*x0; + const auto x28 = -x20; + const auto x29 = x28 + x3; + const auto x30 = x*x1; + const auto x31 = 6.0*radial_eval; + const auto x32 = radial_eval + radial_eval_alpha*x3; + const auto x33 = x*x13; + const auto x34 = 2.0*radial_eval; + const auto x35 = -x34; + const auto x36 = radial_eval_alpha*x18; + const auto x37 = x33*(x35 + x36); + const auto x38 = x*x19; + const auto x39 = -x31; + const auto x40 = radial_eval_alpha*x22 + x39; + const auto x41 = -x17 - x4; + const auto x42 = x18*x3; + const auto x43 = x*x25; + const auto x44 = radial_eval_alpha*x26; + const auto x45 = x34 + x44; + const auto x46 = x28 + x4; + const auto x47 = radial_eval*x46; + const auto x48 = x29*x3; + const auto x49 = x6*x8; + const auto x50 = x*x9; + const auto x51 = radial_eval_alpha*x6; + const auto x52 = radial_eval + x51; + const auto x53 = -x16 - x20 - x3; + const auto x54 = x18*x6; + const auto x55 = x35 + x44; + const auto x56 = radial_eval_alpha*z; + const auto x57 = sqrt_15*y; + const auto x58 = x*x57; + const auto x59 = radial_eval_alpha*x15; + const auto x60 = 8.0*radial_eval; + const auto x61 = x36 + x60; + const auto x62 = -x21 - x3 - x6; + const auto x63 = x15*x22; + const auto x64 = x15*x26; + const auto x65 = radial_eval_alpha_squared*x3; + const auto x66 = radial_eval_alpha + x65; + const auto x67 = x66*x8; + const auto x68 = 12.0*radial_eval_alpha; + const auto x69 = x3*x68; + const auto x70 = x31 + x69; + const auto x71 = 3.0*radial_eval_alpha; + const auto x72 = 4.0*radial_eval_alpha; + const auto x73 = x3*x72; + const auto x74 = x34 + x73; + const auto x75 = x18*x66; + const auto x76 = 2.0*radial_eval_alpha; + const auto x77 = x41*x76 + x75; + const auto x78 = x26*x66; + const auto x79 = x46*x76; + const auto x80 = x29*x66 + x79; + const auto x81 = 6.0*radial_eval_alpha; + const auto x82 = radial_eval_alpha*x46; + const auto x83 = radial_eval_alpha_squared*x49 + x82; + const auto x84 = x3*x6; + const auto x85 = radial_eval_alpha*x53 + radial_eval_alpha_squared*x54; + const auto x86 = radial_eval_alpha*x41 + radial_eval_alpha_squared*x42; + const auto x87 = radial_eval_alpha_squared*x48 + x82; + const auto x88 = x30*z; + const auto x89 = x15*x3; + const auto x90 = x33*z*(radial_eval_alpha_squared*x18 + x81); + const auto x91 = radial_eval_alpha_squared*x63 - x15*x81 + x39 + 
x62*x71; + const auto x92 = x12*z; + const auto x93 = 8.0*radial_eval_alpha; + const auto x94 = x15*x76; + const auto x95 = radial_eval_alpha_squared*x64; + const auto x96 = x0*z; + const auto x97 = radial_eval_alpha_squared*x6; + const auto x98 = radial_eval_alpha + x97; + const auto x99 = x79 + x8*x98; + const auto x100 = x18*x98; + const auto x101 = x100 + x53*x76; + const auto x102 = x6*x68; + const auto x103 = x102 + x31; + const auto x104 = x6*x72; + const auto x105 = x104 + x34; + const auto x106 = x15*x6; + const auto x107 = radial_eval_alpha_squared*x15; + const auto x108 = radial_eval_alpha + x107; + const auto x109 = x108*x8; + const auto x110 = 16.0*radial_eval_alpha*x15; + const auto x111 = x108*x18 + x110; + const auto x112 = x111 + x60; + const auto x113 = x108*x22 + x62*x81; + const auto x114 = x108*x26; + const auto x115 = x114 + x26*x76; + const auto x116 = x108*x29; + const auto x117 = x107 + x97; + const auto x118 = -x73; + const auto x119 = -x102; + const auto x120 = -x69; + const auto x121 = x119 + x120; + const auto x122 = -x104; + const auto x123 = x122 + x26*x98 + x73 + x78; + const auto x124 = 3.0*radial_eval_alpha_squared; + const auto x125 = x*(radial_eval_alpha_cubed*(x*x) + x124); + const auto x126 = radial_eval_alpha_cubed*x6 + radial_eval_alpha_squared; + const auto x127 = x126*x8; + const auto x128 = radial_eval_alpha_cubed*x15 + radial_eval_alpha_squared; + const auto x129 = x128*x8; + const auto x130 = 2.0*x; + const auto x131 = radial_eval_alpha_squared*x130; + const auto x132 = 6.0*x; + const auto x133 = 24.0*radial_eval_alpha; + const auto x134 = x*x133 + 18.0*x*x66 + x108*x132 + x132*x98; + const auto x135 = 4.0*radial_eval_alpha_squared; + const auto x136 = x*x93; + const auto x137 = 16.0*radial_eval_alpha_squared; + const auto x138 = x132*x66; + const auto x139 = x130*x98; + const auto x140 = x108*x130; + const auto x141 = x126*x18; + const auto x142 = x128*x18; + const auto x143 = x125*x18; + const auto x144 = 12.0*radial_eval_alpha_squared; + const auto x145 = x110 - x135*x84; + const auto x146 = x126*x26; + const auto x147 = x128*x26; + const auto x148 = x46*x98; + const auto x149 = x46*x66; + const auto x150 = x126*x29; + const auto x151 = x128*x29; + const auto x152 = x144*x84; + const auto x153 = x108*x46 + x119 + x69; + const auto x154 = y*(radial_eval_alpha_cubed*(y*y) + x124); + const auto x155 = radial_eval_alpha_cubed*x3 + radial_eval_alpha_squared; + const auto x156 = x155*x8; + const auto x157 = x65 + x81; + const auto x158 = x154*x18; + const auto x159 = x155*x18; + const auto x160 = x133*y; + const auto x161 = 6.0*y; + const auto x162 = x161*x66; + const auto x163 = 18.0*x98*y; + const auto x164 = x108*x161; + const auto x165 = 2.0*y; + const auto x166 = radial_eval_alpha_squared*x165; + const auto x167 = -x108*x165 - x161*x98 - x165*x66 - x93*y; + const auto x168 = x155*x26; + const auto x169 = x155*x29; + const auto x170 = x144*z; + const auto x171 = 2.0*radial_eval_alpha_squared*z; + const auto x172 = x171*x46; + const auto x173 = z*(radial_eval_alpha_cubed*(z*z) + x124); + const auto x174 = x135*z; + const auto x175 = 8.0*z; + const auto x176 = 24.0*x108*z + x141*z + x159*z + x173*x18 + x175*x66 + x175*x98 + 32.0*x56; + + + // Evaluate basis function + basis_eval[ipt + 0*npts] = radial_eval*x1*x8; + basis_eval[ipt + 1*npts] = radial_eval*x11; + basis_eval[ipt + 2*npts] = radial_eval*x13*x18; + basis_eval[ipt + 3*npts] = radial_eval*x19*x22; + basis_eval[ipt + 4*npts] = radial_eval*x18*x23; + basis_eval[ipt + 5*npts] = 
radial_eval*x25*x26; + basis_eval[ipt + 6*npts] = radial_eval*x27*x29; + + + + // Evaluate first derivative of bfn wrt x + basis_x_eval[ipt + 0*npts] = x30*(radial_eval_alpha*x8 + x31); + basis_x_eval[ipt + 1*npts] = x10*x32; + basis_x_eval[ipt + 2*npts] = x37; + basis_x_eval[ipt + 3*npts] = x38*x40; + basis_x_eval[ipt + 4*npts] = x12*(radial_eval*x41 + radial_eval_alpha*x42); + basis_x_eval[ipt + 5*npts] = x43*x45; + basis_x_eval[ipt + 6*npts] = x0*(radial_eval_alpha*x48 + x47); + + // Evaluate first derivative of bfn wrt y + basis_y_eval[ipt + 0*npts] = x0*(radial_eval_alpha*x49 + x47); + basis_y_eval[ipt + 1*npts] = x50*x52; + basis_y_eval[ipt + 2*npts] = x12*(radial_eval*x53 + radial_eval_alpha*x54); + basis_y_eval[ipt + 3*npts] = x19*x40*y; + basis_y_eval[ipt + 4*npts] = x37; + basis_y_eval[ipt + 5*npts] = x25*x55*y; + basis_y_eval[ipt + 6*npts] = x30*(radial_eval_alpha*x29 + x39); + + // Evaluate first derivative of bfn wrt z + basis_z_eval[ipt + 0*npts] = x1*x56*x8; + basis_z_eval[ipt + 1*npts] = x58*(radial_eval + x59); + basis_z_eval[ipt + 2*npts] = x13*x61*z; + basis_z_eval[ipt + 3*npts] = 1.5*radial_eval*x62 + 0.5*radial_eval_alpha*x63; + basis_z_eval[ipt + 4*npts] = x23*x61*z; + basis_z_eval[ipt + 5*npts] = x24*(radial_eval*x26 + radial_eval_alpha*x64); + basis_z_eval[ipt + 6*npts] = x27*x29*x56; + + // Evaluate second derivative of bfn wrt xx + basis_xx_eval[ipt + 0*npts] = x1*(x67 + x70); + basis_xx_eval[ipt + 1*npts] = x11*(x65 + x71); + basis_xx_eval[ipt + 2*npts] = x13*(x18*x66 - x74); + basis_xx_eval[ipt + 3*npts] = x19*(x22*x66 - x70); + basis_xx_eval[ipt + 4*npts] = x23*(x39 + x77); + basis_xx_eval[ipt + 5*npts] = x25*(x74 + x78); + basis_xx_eval[ipt + 6*npts] = x27*(x31 + x80); + + // Evaluate second derivative of bfn wrt xy + basis_xy_eval[ipt + 0*npts] = x27*(x31 + x6*x81 + x83); + basis_xy_eval[ipt + 1*npts] = x9*(radial_eval_alpha_squared*x84 + x32 + x51); + basis_xy_eval[ipt + 2*npts] = x23*(x35 - x6*x76 + x85); + basis_xy_eval[ipt + 3*npts] = x38*y*(radial_eval_alpha_squared*x22 - x68); + basis_xy_eval[ipt + 4*npts] = x13*(-x3*x76 + x35 + x86); + basis_xy_eval[ipt + 5*npts] = radial_eval_alpha_squared*x26*x43*y; + basis_xy_eval[ipt + 6*npts] = x1*(-x3*x81 + x39 + x87); + + // Evaluate second derivative of bfn wrt xz + basis_xz_eval[ipt + 0*npts] = x88*(radial_eval_alpha_squared*x8 + x81); + basis_xz_eval[ipt + 1*npts] = x57*(radial_eval_alpha_squared*x89 + x32 + x59); + basis_xz_eval[ipt + 2*npts] = x90; + basis_xz_eval[ipt + 3*npts] = 0.5*x*x91; + basis_xz_eval[ipt + 4*npts] = x92*(x3*x93 + x60 + x86); + basis_xz_eval[ipt + 5*npts] = x*x24*(x45 + x94 + x95); + basis_xz_eval[ipt + 6*npts] = x87*x96; + + // Evaluate second derivative of bfn wrt yy + basis_yy_eval[ipt + 0*npts] = x1*(x39 + x99); + basis_yy_eval[ipt + 1*npts] = x11*(x71 + x97); + basis_yy_eval[ipt + 2*npts] = x13*(x101 + x39); + basis_yy_eval[ipt + 3*npts] = x19*(-x103 + x22*x98); + basis_yy_eval[ipt + 4*npts] = x23*(-x105 + x18*x98); + basis_yy_eval[ipt + 5*npts] = x25*(-x105 + x26*x98); + basis_yy_eval[ipt + 6*npts] = x27*(-x103 + x29*x98); + + // Evaluate second derivative of bfn wrt yz + basis_yz_eval[ipt + 0*npts] = x83*x96; + basis_yz_eval[ipt + 1*npts] = sqrt_15*x*(radial_eval_alpha_squared*x106 + x52 + x59); + basis_yz_eval[ipt + 2*npts] = x92*(x6*x93 + x60 + x85); + basis_yz_eval[ipt + 3*npts] = 0.5*x91*y; + basis_yz_eval[ipt + 4*npts] = x90; + basis_yz_eval[ipt + 5*npts] = x24*y*(x55 - x94 + x95); + basis_yz_eval[ipt + 6*npts] = x88*(radial_eval_alpha_squared*x29 - x81); + + // Evaluate 
second derivative of bfn wrt zz + basis_zz_eval[ipt + 0*npts] = x1*x109; + basis_zz_eval[ipt + 1*npts] = x11*(x107 + x71); + basis_zz_eval[ipt + 2*npts] = x112*x13; + basis_zz_eval[ipt + 3*npts] = x19*(12.0*radial_eval + x113); + basis_zz_eval[ipt + 4*npts] = x112*x23; + basis_zz_eval[ipt + 5*npts] = x115*x25; + basis_zz_eval[ipt + 6*npts] = x116*x27; + + // Evaluate Laplacian of bfn + basis_lapl_eval[ipt + 0*npts] = x1*(x109 + x67 + x69 + x99); + basis_lapl_eval[ipt + 1*npts] = x11*(9.0*radial_eval_alpha + x117 + x65); + basis_lapl_eval[ipt + 2*npts] = x13*(x101 + x111 + x118 + x75); + basis_lapl_eval[ipt + 3*npts] = x19*(x113 + x121 + x22*x66 + x22*x98); + basis_lapl_eval[ipt + 4*npts] = x23*(x100 + x111 + x122 + x77); + basis_lapl_eval[ipt + 5*npts] = x25*(x115 + x123); + basis_lapl_eval[ipt + 6*npts] = x27*(x116 + x119 + x29*x98 + x80); + + // Evaluate Laplacian gradient of bfn (dx) + basis_lapl_x_eval[ipt + 0*npts] = x1*(x*x127 + x*x129 + x125*x8 + x131*x46 + x134); + basis_lapl_x_eval[ipt + 1*npts] = x10*(x*x125 + x117 + x126*x2 + x128*x2 + x135*x3 + 3.0*x66 + x81); + basis_lapl_x_eval[ipt + 2*npts] = x13*(x*x137*x15 + x*x141 + x*x142 + x131*x53 - x136 - x138 - x139 - x140 + x143); + basis_lapl_x_eval[ipt + 3*npts] = x19*(6.0*radial_eval_alpha_squared*x*x62 + x*x126*x22 + x*x128*x22 - x*x144*x6 + x125*x22 - x134); + basis_lapl_x_eval[ipt + 4*npts] = x12*(x*x143 + x108*x41 + x120 + x122 + x137*x89 + x141*x2 + x142*x2 + x145 + 3.0*x41*x66 + x41*x98); + basis_lapl_x_eval[ipt + 5*npts] = x25*(-x*x135*x6 + x*x146 + x*x147 + x125*x26 + x131*x26 + x136 + x138 + x139 + x140); + basis_lapl_x_eval[ipt + 6*npts] = x0*(x*x125*x29 + x148 + 3.0*x149 + x150*x2 + x151*x2 - x152 + x153); + // Evaluate Laplacian gradient of bfn (dy) + basis_lapl_y_eval[ipt + 0*npts] = x0*(x129*x5 + 3.0*x148 + x149 + x152 + x153 + x154*x8*y + x156*x5); + basis_lapl_y_eval[ipt + 1*npts] = x50*(x107 + x128*x5 + x135*x6 + x154*y + x155*x5 + x157 + 3.0*x98); + basis_lapl_y_eval[ipt + 2*npts] = x12*(x106*x137 + x108*x53 + x118 + x119 + x142*x5 + x145 + x158*y + x159*x5 + x53*x66 + 3.0*x53*x98); + basis_lapl_y_eval[ipt + 3*npts] = -x19*(-6.0*radial_eval_alpha_squared*x62*y - x128*x22*y + x144*x3*y - x154*x22 - x155*x22*y + x160 + x162 + x163 + x164); + basis_lapl_y_eval[ipt + 4*npts] = x23*(x137*x15*y + x142*y + x158 + x159*y + x166*x41 + x167); + basis_lapl_y_eval[ipt + 5*npts] = x25*(x135*x3*y + x147*y + x154*x26 + x166*x26 + x167 + x168*y); + basis_lapl_y_eval[ipt + 6*npts] = x27*(x151*y + x154*x29 - x160 - x162 - x163 - x164 + x166*x46 + x169*y); + // Evaluate Laplacian gradient of bfn (dz) + basis_lapl_z_eval[ipt + 0*npts] = x1*(x127*z + x156*z + x170*x3 + x172 + x173*x8); + basis_lapl_z_eval[ipt + 1*npts] = x58*(3.0*x108 + x126*x14 + x135*x15 + x14*x155 + x157 + x173*z + x97); + basis_lapl_z_eval[ipt + 2*npts] = x13*(x171*x53 - x174*x3 + x176); + basis_lapl_z_eval[ipt + 3*npts] = -0.5*x106*x144 + 4.5*x108*x62 + 0.5*x121 + 0.5*x126*x14*x22 + 0.5*x133*x15 + 0.5*x14*x155*x22 - 0.5*x144*x89 + 0.5*x173*x22*z + 1.5*x62*x66 + 1.5*x62*x98; + basis_lapl_z_eval[ipt + 4*npts] = x23*(x171*x41 - x174*x6 + x176); + basis_lapl_z_eval[ipt + 5*npts] = x24*(-x106*x135 + 3.0*x114 + x123 + x135*x89 + x14*x146 + x14*x168 + x173*x26*z); + basis_lapl_z_eval[ipt + 6*npts] = x27*(x150*z + x169*z - x170*x6 + x172 + x173*x29); + + + + +#if 0 + // Evaluate the angular part of bfn + + + + double ang_eval_0; + double ang_eval_1; + double ang_eval_2; + double ang_eval_3; + + + ang_eval_0 = radial_eval*x1*x8; + ang_eval_1 = radial_eval*x11; + 
ang_eval_2 = radial_eval*x13*x18; + ang_eval_3 = radial_eval*x19*x22; + basis_eval[ipt + 0*npts] = ang_eval_0; + basis_eval[ipt + 1*npts] = ang_eval_1; + basis_eval[ipt + 2*npts] = ang_eval_2; + basis_eval[ipt + 3*npts] = ang_eval_3; + + ang_eval_0 = radial_eval*x18*x23; + ang_eval_1 = radial_eval*x25*x26; + ang_eval_2 = radial_eval*x27*x29; + basis_eval[ipt + 4*npts] = ang_eval_0; + basis_eval[ipt + 5*npts] = ang_eval_1; + basis_eval[ipt + 6*npts] = ang_eval_2; + + + double dang_eval_x_0, dang_eval_y_0, dang_eval_z_0; + double dang_eval_x_1, dang_eval_y_1, dang_eval_z_1; + double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; + double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; + + dang_eval_x_0 = x30*(radial_eval_alpha*x8 + x31); + dang_eval_y_0 = x0*(radial_eval_alpha*x49 + x47); + dang_eval_z_0 = x1*x56*x8; + dang_eval_x_1 = x10*x32; + dang_eval_y_1 = x50*x52; + dang_eval_z_1 = x58*(radial_eval + x59); + dang_eval_x_2 = x37; + dang_eval_y_2 = x12*(radial_eval*x53 + radial_eval_alpha*x54); + dang_eval_z_2 = x13*x61*z; + dang_eval_x_3 = x38*x40; + dang_eval_y_3 = x19*x40*y; + dang_eval_z_3 = 1.5*radial_eval*x62 + 0.5*radial_eval_alpha*x63; + basis_x_eval[ipt + 0*npts] = dang_eval_x_0; + basis_y_eval[ipt + 0*npts] = dang_eval_y_0; + basis_z_eval[ipt + 0*npts] = dang_eval_z_0; + basis_x_eval[ipt + 1*npts] = dang_eval_x_1; + basis_y_eval[ipt + 1*npts] = dang_eval_y_1; + basis_z_eval[ipt + 1*npts] = dang_eval_z_1; + basis_x_eval[ipt + 2*npts] = dang_eval_x_2; + basis_y_eval[ipt + 2*npts] = dang_eval_y_2; + basis_z_eval[ipt + 2*npts] = dang_eval_z_2; + basis_x_eval[ipt + 3*npts] = dang_eval_x_3; + basis_y_eval[ipt + 3*npts] = dang_eval_y_3; + basis_z_eval[ipt + 3*npts] = dang_eval_z_3; + + dang_eval_x_0 = x12*(radial_eval*x41 + radial_eval_alpha*x42); + dang_eval_y_0 = x37; + dang_eval_z_0 = x23*x61*z; + dang_eval_x_1 = x43*x45; + dang_eval_y_1 = x25*x55*y; + dang_eval_z_1 = x24*(radial_eval*x26 + radial_eval_alpha*x64); + dang_eval_x_2 = x0*(radial_eval_alpha*x48 + x47); + dang_eval_y_2 = x30*(radial_eval_alpha*x29 + x39); + dang_eval_z_2 = x27*x29*x56; + basis_x_eval[ipt + 4*npts] = dang_eval_x_0; + basis_y_eval[ipt + 4*npts] = dang_eval_y_0; + basis_z_eval[ipt + 4*npts] = dang_eval_z_0; + basis_x_eval[ipt + 5*npts] = dang_eval_x_1; + basis_y_eval[ipt + 5*npts] = dang_eval_y_1; + basis_z_eval[ipt + 5*npts] = dang_eval_z_1; + basis_x_eval[ipt + 6*npts] = dang_eval_x_2; + basis_y_eval[ipt + 6*npts] = dang_eval_y_2; + basis_z_eval[ipt + 6*npts] = dang_eval_z_2; + +#endif + } // Loop over points within task + } // Loop over tasks + + } // Loop over shells +} // end kernel + +} // namespace GauXC diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_laplacian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_laplacian.hpp index 18dff71c..d5f8f3af 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_laplacian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_laplacian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). 
+ * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_spherical_laplacian_3( +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_spherical_laplacian_3( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; // Loop over points in task @@ -103,54 +106,166 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = 0.25*sqrt_10; + const auto x1 = x0*y; + const auto x2 = x*x; + const auto x3 = 3.0*x2; + const auto x4 = y*y; + const auto x5 = -x4; + const auto x6 = x3 + x5; + const auto x7 = sqrt_15*z; + const auto x8 = x7*y; + const auto x9 = x*x8; + const auto x10 = 0.25*sqrt_6; + const auto x11 = x10*y; + const auto x12 = z*z; + const auto x13 = -4.0*x12; + const auto x14 = x13 + x4; + const auto x15 = -x14 - x2; + const auto x16 = 0.5*z; + const auto x17 = 3.0*x4; + const auto x18 = -2.0*x12; + const auto x19 = -x17 - x18 - x3; + const auto x20 = x*x10; + const auto x21 = 0.5*sqrt_15; + const auto x22 = x21*z; + const auto x23 = x2 + x5; + const auto x24 = x*x0; + const auto x25 = -x17; + const auto x26 = x2 + x25; + const auto x27 = x*x1; + const auto x28 = 6.0*radial_eval; + const auto x29 = radial_eval + radial_eval_alpha*x2; + const auto x30 = x*x11; + const auto x31 = 2.0*radial_eval; + const auto x32 = -x31; + const auto x33 = radial_eval_alpha*x15; + const auto x34 = x30*(x32 + x33); + const auto x35 = x*x16; + const auto x36 = -x28; + const auto x37 = radial_eval_alpha*x19 + x36; + const auto x38 = -x14 - x3; + const auto x39 = x15*x2; + const auto x40 = x*x22; + const auto x41 = radial_eval_alpha*x23; + const auto x42 = x31 + x41; + const auto x43 = x25 + x3; + const auto x44 = radial_eval*x43; + const auto x45 = x2*x26; + const auto x46 = x4*x6; + const auto x47 = radial_eval_alpha*x4; + const auto x48 = radial_eval + x47; + const auto x49 = -x13 - x17 - x2; + const auto x50 = x15*x4; + const auto x51 = x32 + x41; + const auto x52 = radial_eval_alpha*z; + const auto x53 = sqrt_15*y; + const auto x54 = radial_eval_alpha*x12; + const auto x55 = 8.0*radial_eval; + const auto x56 = x33 + x55; + const auto x57 = -x18 - x2 - x4; + const auto x58 = x12*x19; + const auto x59 = x12*x23; + const auto x60 = radial_eval_alpha_squared*x2; + const auto x61 = radial_eval_alpha + x60; + const auto x62 = x6*x61; + const auto x63 = 12.0*radial_eval_alpha; + const auto x64 = x2*x63; + const auto x65 = x28 + x64; + const auto x66 = 3.0*radial_eval_alpha; + const auto x67 = 4.0*radial_eval_alpha; + const auto x68 = x2*x67; + const auto x69 = x31 + x68; + const auto x70 = x15*x61; + const 
auto x71 = 2.0*radial_eval_alpha; + const auto x72 = x38*x71 + x70; + const auto x73 = x23*x61; + const auto x74 = x43*x71; + const auto x75 = x26*x61 + x74; + const auto x76 = 6.0*radial_eval_alpha; + const auto x77 = radial_eval_alpha*x43; + const auto x78 = radial_eval_alpha_squared*x46 + x77; + const auto x79 = radial_eval_alpha*x49 + radial_eval_alpha_squared*x50; + const auto x80 = radial_eval_alpha*x38 + radial_eval_alpha_squared*x39; + const auto x81 = radial_eval_alpha_squared*x45 + x77; + const auto x82 = x27*z; + const auto x83 = x30*z*(radial_eval_alpha_squared*x15 + x76); + const auto x84 = radial_eval_alpha_squared*x58 - x12*x76 + x36 + x57*x66; + const auto x85 = x10*z; + const auto x86 = 8.0*radial_eval_alpha; + const auto x87 = x12*x71; + const auto x88 = radial_eval_alpha_squared*x59; + const auto x89 = x0*z; + const auto x90 = radial_eval_alpha_squared*x4; + const auto x91 = radial_eval_alpha + x90; + const auto x92 = x6*x91 + x74; + const auto x93 = x15*x91; + const auto x94 = x49*x71 + x93; + const auto x95 = x4*x63; + const auto x96 = x28 + x95; + const auto x97 = x4*x67; + const auto x98 = x31 + x97; + const auto x99 = radial_eval_alpha_squared*x12; + const auto x100 = radial_eval_alpha + x99; + const auto x101 = x100*x6; + const auto x102 = 16.0*radial_eval_alpha*x12 + x100*x15; + const auto x103 = x102 + x55; + const auto x104 = x100*x19 + x57*x76; + const auto x105 = x23*(x100 + x71); + const auto x106 = x100*x26; + const auto x107 = -x95; + const auto x108 = -x97; + // Evaluate basis function - basis_eval[ipt + 0*npts] = sqrt_10*radial_eval*y*(3*x*x - y*y)/4; - basis_eval[ipt + 1*npts] = sqrt_15*radial_eval*x*y*z; - basis_eval[ipt + 2*npts] = sqrt_6*radial_eval*y*(-x*x - y*y + 4*z*z)/4; - basis_eval[ipt + 3*npts] = radial_eval*z*(-3*x*x - 3*y*y + 2*z*z)/2; - basis_eval[ipt + 4*npts] = sqrt_6*radial_eval*x*(-x*x - y*y + 4*z*z)/4; - basis_eval[ipt + 5*npts] = sqrt_15*radial_eval*z*(x*x - y*y)/2; - basis_eval[ipt + 6*npts] = sqrt_10*radial_eval*x*(x*x - 3*y*y)/4; + basis_eval[ipt + 0*npts] = radial_eval*x1*x6; + basis_eval[ipt + 1*npts] = radial_eval*x9; + basis_eval[ipt + 2*npts] = radial_eval*x11*x15; + basis_eval[ipt + 3*npts] = radial_eval*x16*x19; + basis_eval[ipt + 4*npts] = radial_eval*x15*x20; + basis_eval[ipt + 5*npts] = radial_eval*x22*x23; + basis_eval[ipt + 6*npts] = radial_eval*x24*x26; // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = sqrt_10*x*y*(6*radial_eval + radial_eval_alpha*(3*x*x - y*y))/4; - basis_x_eval[ipt + 1*npts] = sqrt_15*y*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 2*npts] = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - basis_x_eval[ipt + 3*npts] = x*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; - basis_x_eval[ipt + 4*npts] = sqrt_6*(-radial_eval*(3*x*x + y*y - 4*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 4*z*z))/4; - basis_x_eval[ipt + 5*npts] = sqrt_15*x*z*(2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; - basis_x_eval[ipt + 6*npts] = sqrt_10*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y))/4; + basis_x_eval[ipt + 0*npts] = x27*(radial_eval_alpha*x6 + x28); + basis_x_eval[ipt + 1*npts] = x29*x8; + basis_x_eval[ipt + 2*npts] = x34; + basis_x_eval[ipt + 3*npts] = x35*x37; + basis_x_eval[ipt + 4*npts] = x10*(radial_eval*x38 + radial_eval_alpha*x39); + basis_x_eval[ipt + 5*npts] = x40*x42; + basis_x_eval[ipt + 6*npts] = x0*(radial_eval_alpha*x45 + x44); // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = 
sqrt_10*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y))/4; - basis_y_eval[ipt + 1*npts] = sqrt_15*x*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 2*npts] = sqrt_6*(-radial_eval*(x*x + 3*y*y - 4*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 4*z*z))/4; - basis_y_eval[ipt + 3*npts] = y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; - basis_y_eval[ipt + 4*npts] = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - basis_y_eval[ipt + 5*npts] = sqrt_15*y*z*(-2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; - basis_y_eval[ipt + 6*npts] = sqrt_10*x*y*(-6*radial_eval + radial_eval_alpha*(x*x - 3*y*y))/4; + basis_y_eval[ipt + 0*npts] = x0*(radial_eval_alpha*x46 + x44); + basis_y_eval[ipt + 1*npts] = x*x48*x7; + basis_y_eval[ipt + 2*npts] = x10*(radial_eval*x49 + radial_eval_alpha*x50); + basis_y_eval[ipt + 3*npts] = x16*x37*y; + basis_y_eval[ipt + 4*npts] = x34; + basis_y_eval[ipt + 5*npts] = x22*x51*y; + basis_y_eval[ipt + 6*npts] = x27*(radial_eval_alpha*x26 + x36); // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = sqrt_10*radial_eval_alpha*y*z*(3*x*x - y*y)/4; - basis_z_eval[ipt + 1*npts] = sqrt_15*x*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 2*npts] = sqrt_6*y*z*(8*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - basis_z_eval[ipt + 3*npts] = -3*radial_eval*(x*x + y*y - 2*z*z)/2 - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 2*z*z)/2; - basis_z_eval[ipt + 4*npts] = sqrt_6*x*z*(8*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - basis_z_eval[ipt + 5*npts] = sqrt_15*(radial_eval + radial_eval_alpha*z*z)*(x*x - y*y)/2; - basis_z_eval[ipt + 6*npts] = sqrt_10*radial_eval_alpha*x*z*(x*x - 3*y*y)/4; + basis_z_eval[ipt + 0*npts] = x1*x52*x6; + basis_z_eval[ipt + 1*npts] = x*x53*(radial_eval + x54); + basis_z_eval[ipt + 2*npts] = x11*x56*z; + basis_z_eval[ipt + 3*npts] = 1.5*radial_eval*x57 + 0.5*radial_eval_alpha*x58; + basis_z_eval[ipt + 4*npts] = x20*x56*z; + basis_z_eval[ipt + 5*npts] = x21*(radial_eval*x23 + radial_eval_alpha*x59); + basis_z_eval[ipt + 6*npts] = x24*x26*x52; + // Evaluate Laplacian of bfn - basis_lapl_eval[ipt + 0*npts] = sqrt_10*y*(27*radial_eval_alpha*x*x - 9*radial_eval_alpha*y*y + 3*radial_eval_alpha_squared*x*x*x*x + 2*radial_eval_alpha_squared*x*x*y*y + 3*radial_eval_alpha_squared*x*x*z*z - radial_eval_alpha_squared*y*y*y*y - radial_eval_alpha_squared*y*y*z*z)/4; - basis_lapl_eval[ipt + 1*npts] = sqrt_15*x*y*z*(9*radial_eval_alpha + radial_eval_alpha_squared*x*x + radial_eval_alpha_squared*y*y + radial_eval_alpha_squared*z*z); - basis_lapl_eval[ipt + 2*npts] = sqrt_6*y*(-9*radial_eval_alpha*x*x - 9*radial_eval_alpha*y*y + 36*radial_eval_alpha*z*z - radial_eval_alpha_squared*x*x*x*x - 2*radial_eval_alpha_squared*x*x*y*y + 3*radial_eval_alpha_squared*x*x*z*z - radial_eval_alpha_squared*y*y*y*y + 3*radial_eval_alpha_squared*y*y*z*z + 4*radial_eval_alpha_squared*z*z*z*z)/4; - basis_lapl_eval[ipt + 3*npts] = z*(-27*radial_eval_alpha*x*x - 27*radial_eval_alpha*y*y + 18*radial_eval_alpha*z*z - 3*radial_eval_alpha_squared*x*x*x*x - 6*radial_eval_alpha_squared*x*x*y*y - radial_eval_alpha_squared*x*x*z*z - 3*radial_eval_alpha_squared*y*y*y*y - radial_eval_alpha_squared*y*y*z*z + 2*radial_eval_alpha_squared*z*z*z*z)/2; - basis_lapl_eval[ipt + 4*npts] = sqrt_6*x*(-9*radial_eval_alpha*x*x - 9*radial_eval_alpha*y*y + 36*radial_eval_alpha*z*z - radial_eval_alpha_squared*x*x*x*x - 2*radial_eval_alpha_squared*x*x*y*y + 3*radial_eval_alpha_squared*x*x*z*z - 
radial_eval_alpha_squared*y*y*y*y + 3*radial_eval_alpha_squared*y*y*z*z + 4*radial_eval_alpha_squared*z*z*z*z)/4; - basis_lapl_eval[ipt + 5*npts] = sqrt_15*z*(9*radial_eval_alpha*x*x - 9*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*x*x + radial_eval_alpha_squared*x*x*z*z - radial_eval_alpha_squared*y*y*y*y - radial_eval_alpha_squared*y*y*z*z)/2; - basis_lapl_eval[ipt + 6*npts] = sqrt_10*x*(9*radial_eval_alpha*x*x - 27*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*x*x - 2*radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*x*x*z*z - 3*radial_eval_alpha_squared*y*y*y*y - 3*radial_eval_alpha_squared*y*y*z*z)/4; + basis_lapl_eval[ipt + 0*npts] = x1*(x101 + x62 + x64 + x92); + basis_lapl_eval[ipt + 1*npts] = x9*(9.0*radial_eval_alpha + x60 + x90 + x99); + basis_lapl_eval[ipt + 2*npts] = x11*(x102 - x68 + x70 + x94); + basis_lapl_eval[ipt + 3*npts] = x16*(x104 + x107 + x19*x61 + x19*x91 - x64); + basis_lapl_eval[ipt + 4*npts] = x20*(x102 + x108 + x72 + x93); + basis_lapl_eval[ipt + 5*npts] = x22*(x105 + x108 + x23*x91 + x68 + x73); + basis_lapl_eval[ipt + 6*npts] = x24*(x106 + x107 + x26*x91 + x75); + @@ -166,18 +281,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = sqrt_10*radial_eval*y*(3*x*x - y*y)/4; - ang_eval_1 = sqrt_15*radial_eval*x*y*z; - ang_eval_2 = sqrt_6*radial_eval*y*(-x*x - y*y + 4*z*z)/4; - ang_eval_3 = radial_eval*z*(-3*x*x - 3*y*y + 2*z*z)/2; + ang_eval_0 = radial_eval*x1*x6; + ang_eval_1 = radial_eval*x9; + ang_eval_2 = radial_eval*x11*x15; + ang_eval_3 = radial_eval*x16*x19; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = sqrt_6*radial_eval*x*(-x*x - y*y + 4*z*z)/4; - ang_eval_1 = sqrt_15*radial_eval*z*(x*x - y*y)/2; - ang_eval_2 = sqrt_10*radial_eval*x*(x*x - 3*y*y)/4; + ang_eval_0 = radial_eval*x15*x20; + ang_eval_1 = radial_eval*x22*x23; + ang_eval_2 = radial_eval*x24*x26; basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; basis_eval[ipt + 6*npts] = ang_eval_2; @@ -188,18 +303,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; - dang_eval_x_0 = sqrt_10*x*y*(6*radial_eval + radial_eval_alpha*(3*x*x - y*y))/4; - dang_eval_y_0 = sqrt_10*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y))/4; - dang_eval_z_0 = sqrt_10*radial_eval_alpha*y*z*(3*x*x - y*y)/4; - dang_eval_x_1 = sqrt_15*y*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = sqrt_15*x*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_1 = sqrt_15*x*y*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - dang_eval_y_2 = sqrt_6*(-radial_eval*(x*x + 3*y*y - 4*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 4*z*z))/4; - dang_eval_z_2 = sqrt_6*y*z*(8*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - dang_eval_x_3 = x*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; - dang_eval_y_3 = y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; - dang_eval_z_3 = -3*radial_eval*(x*x + y*y - 2*z*z)/2 - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 2*z*z)/2; + dang_eval_x_0 = x27*(radial_eval_alpha*x6 + x28); + dang_eval_y_0 = x0*(radial_eval_alpha*x46 + x44); + dang_eval_z_0 = x1*x52*x6; + dang_eval_x_1 = x29*x8; + 
dang_eval_y_1 = x*x48*x7; + dang_eval_z_1 = x*x53*(radial_eval + x54); + dang_eval_x_2 = x34; + dang_eval_y_2 = x10*(radial_eval*x49 + radial_eval_alpha*x50); + dang_eval_z_2 = x11*x56*z; + dang_eval_x_3 = x35*x37; + dang_eval_y_3 = x16*x37*y; + dang_eval_z_3 = 1.5*radial_eval*x57 + 0.5*radial_eval_alpha*x58; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; @@ -213,15 +328,15 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = sqrt_6*(-radial_eval*(3*x*x + y*y - 4*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 4*z*z))/4; - dang_eval_y_0 = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - dang_eval_z_0 = sqrt_6*x*z*(8*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - dang_eval_x_1 = sqrt_15*x*z*(2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; - dang_eval_y_1 = sqrt_15*y*z*(-2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; - dang_eval_z_1 = sqrt_15*(radial_eval + radial_eval_alpha*z*z)*(x*x - y*y)/2; - dang_eval_x_2 = sqrt_10*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y))/4; - dang_eval_y_2 = sqrt_10*x*y*(-6*radial_eval + radial_eval_alpha*(x*x - 3*y*y))/4; - dang_eval_z_2 = sqrt_10*radial_eval_alpha*x*z*(x*x - 3*y*y)/4; + dang_eval_x_0 = x10*(radial_eval*x38 + radial_eval_alpha*x39); + dang_eval_y_0 = x34; + dang_eval_z_0 = x20*x56*z; + dang_eval_x_1 = x40*x42; + dang_eval_y_1 = x22*x51*y; + dang_eval_z_1 = x21*(radial_eval*x23 + radial_eval_alpha*x59); + dang_eval_x_2 = x0*(radial_eval_alpha*x45 + x44); + dang_eval_y_2 = x27*(radial_eval_alpha*x26 + x36); + dang_eval_z_2 = x24*x26*x52; basis_x_eval[ipt + 4*npts] = dang_eval_x_0; basis_y_eval[ipt + 4*npts] = dang_eval_y_0; basis_z_eval[ipt + 4*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4.hpp index a7a11723..1f48ecb0 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -64,7 +68,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_eval = task->bf + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -93,18 +96,37 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel } - + // Common Subexpressions + const auto x0 = 0.5*radial_eval*x*y; + const auto x1 = x*x; + const auto x2 = y*y; + const auto x3 = -x2; + const auto x4 = 0.25*radial_eval; + const auto x5 = x4*z; + const auto x6 = x5*y; + const auto x7 = 3.0*x1; + const auto x8 = z*z; + const auto x9 = 3.0*x2; + const auto x10 = -x7 + 4.0*x8 - x9; + const auto x11 = 0.125*radial_eval; + const auto x12 = x*x*x*x; + const auto x13 = y*y*y*y; + const auto x14 = 6.0*x1*x2; + const auto x15 = x1*x8; + const auto x16 = x2*x8; + const auto x17 = x*x5; + // Evaluate basis function - basis_eval[ipt + 0*npts] = sqrt_35*radial_eval*x*y*(x*x - y*y)/2; - basis_eval[ipt + 1*npts] = sqrt_70*radial_eval*y*z*(3*x*x - y*y)/4; - basis_eval[ipt + 2*npts] = sqrt_5*radial_eval*x*y*(-x*x - y*y + 6*z*z)/2; - basis_eval[ipt + 3*npts] = sqrt_10*radial_eval*y*z*(-3*x*x - 3*y*y + 4*z*z)/4; - basis_eval[ipt + 4*npts] = radial_eval*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; - basis_eval[ipt + 5*npts] = sqrt_10*radial_eval*x*z*(-3*x*x - 3*y*y + 4*z*z)/4; - basis_eval[ipt + 6*npts] = sqrt_5*radial_eval*(-x*x*x*x + 6*x*x*z*z + y*y*y*y - 6*y*y*z*z)/4; - basis_eval[ipt + 7*npts] = sqrt_70*radial_eval*x*z*(x*x - 3*y*y)/4; - basis_eval[ipt + 8*npts] = sqrt_35*radial_eval*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + basis_eval[ipt + 0*npts] = sqrt_35*x0*(x1 + x3); + basis_eval[ipt + 1*npts] = sqrt_70*x6*(x3 + x7); + basis_eval[ipt + 2*npts] = -sqrt_5*x0*(x1 + x2 - 6.0*x8); + basis_eval[ipt + 3*npts] = sqrt_10*x10*x6; + basis_eval[ipt + 4*npts] = x11*(3.0*x12 + 3.0*x13 + x14 - 24.0*x15 - 24.0*x16 + 8.0*(z*z*z*z)); + basis_eval[ipt + 5*npts] = sqrt_10*x10*x17; + basis_eval[ipt + 6*npts] = -sqrt_5*x4*(x12 - x13 - 6.0*x15 + 6.0*x16); + basis_eval[ipt + 7*npts] = sqrt_70*x17*(x1 - x9); + basis_eval[ipt + 8*npts] = sqrt_35*x11*(x12 + x13 - x14); @@ -113,6 +135,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel + + #if 0 // Evaluate the angular part of bfn @@ -124,25 +148,25 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = sqrt_35*radial_eval*x*y*(x*x - y*y)/2; - ang_eval_1 = sqrt_70*radial_eval*y*z*(3*x*x - y*y)/4; - ang_eval_2 = sqrt_5*radial_eval*x*y*(-x*x - y*y + 6*z*z)/2; - ang_eval_3 = sqrt_10*radial_eval*y*z*(-3*x*x - 3*y*y + 4*z*z)/4; + ang_eval_0 = sqrt_35*x0*(x1 + x3); + ang_eval_1 = sqrt_70*x6*(x3 + x7); + ang_eval_2 = -sqrt_5*x0*(x1 + x2 - 6.0*x8); + ang_eval_3 = sqrt_10*x10*x6; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = radial_eval*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; - ang_eval_1 = sqrt_10*radial_eval*x*z*(-3*x*x - 3*y*y + 4*z*z)/4; - ang_eval_2 = sqrt_5*radial_eval*(-x*x*x*x + 6*x*x*z*z + y*y*y*y - 6*y*y*z*z)/4; - ang_eval_3 = sqrt_70*radial_eval*x*z*(x*x - 3*y*y)/4; + ang_eval_0 = x11*(3.0*x12 + 3.0*x13 + x14 - 24.0*x15 - 24.0*x16 + 8.0*(z*z*z*z)); + ang_eval_1 = sqrt_10*x10*x17; + ang_eval_2 = -sqrt_5*x4*(x12 - x13 - 6.0*x15 + 6.0*x16); + ang_eval_3 = 
sqrt_70*x17*(x1 - x9); basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; basis_eval[ipt + 6*npts] = ang_eval_2; basis_eval[ipt + 7*npts] = ang_eval_3; - ang_eval_0 = sqrt_35*radial_eval*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + ang_eval_0 = sqrt_35*x11*(x12 + x13 - x14); basis_eval[ipt + 8*npts] = ang_eval_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_gradient.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_gradient.hpp index 096c3db5..c826b10f 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_gradient.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_gradient.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_spherical_gradient_4( +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_spherical_gradient_4( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -67,7 +71,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -99,53 +102,113 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; - + // Common Subexpressions + const auto x0 = 0.5*y; + const auto x1 = sqrt_35*x0; + const auto x2 = radial_eval*x; + const auto x3 = x*x; + const auto x4 = y*y; + const auto x5 = -x4; + const auto x6 = x3 + x5; + const auto x7 = 0.25*z; + const auto x8 = sqrt_70*x7; + const auto x9 = radial_eval*y; + const auto x10 = 3.0*x3; + const auto x11 = x10 + x5; + const auto x12 = sqrt_5*x0; + const auto x13 = z*z; + const auto x14 = -6.0*x13; + const auto x15 = x14 + x4; + const auto x16 = -x15 - x3; + const auto x17 = sqrt_10*x7; + const auto x18 = -4.0*x13; + const auto x19 = 3.0*x4; + const auto x20 = x18 + x19; + const auto x21 = -x10 - x20; + const auto x22 = 0.125*radial_eval; + const auto x23 = x*x*x*x; + const auto x24 = y*y*y*y; + const auto x25 = 6.0*x3*x4; + const auto x26 = x13*x3; + const auto x27 = x13*x4; + const auto x28 = 3.0*x23 + 3.0*x24 + x25 - 24.0*x26 - 24.0*x27 + 8.0*(z*z*z*z); + const auto x29 = 0.25*sqrt_5; + const auto x30 = -x23 + x24 + 6.0*x26 - 6.0*x27; + const auto x31 = -x19; + const auto x32 = x3 + x31; + const auto x33 = x23 + x24 - x25; + const auto x34 = radial_eval*x11; 
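// [Editorial annotation -- not part of the generated patch; illustrative only.]
// The launch-bounds reduction from 512 to 128 threads per block is consistent with the
// shared-memory shrink above: 128 threads / 32 threads-per-warp = 4 warps, and each warp
// addresses its own row of alpha[4][...] / coeff[4][...] via threadIdx.x/32. A minimal
// sketch of that indexing arithmetic, assuming the warp size constant used elsewhere here:
//
//   constexpr int block_size      = 128;              // matches __launch_bounds__(128,2)
//   constexpr int nwarp_per_block = block_size / 32;  // = 4, hence alpha[4], coeff[4]
//   // inside the kernel: double* my_alpha = alpha[threadIdx.x / 32];  // one row per warp
//
// The x0, x1, ... temporaries introduced below appear to be generator-produced common
// subexpressions of the same l = 4 real solid-harmonic polynomials that the removed lines
// spelled out explicitly; only the algebraic form changes, not the quantities computed.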
+ const auto x35 = x*y; + const auto x36 = x35*x8; + const auto x37 = 6.0*radial_eval; + const auto x38 = -x37; + const auto x39 = x17*x35*(radial_eval_alpha*x21 + x38); + const auto x40 = 12.0*radial_eval; + const auto x41 = x*x*x; + const auto x42 = radial_eval_alpha*x; + const auto x43 = 4.0*radial_eval; + const auto x44 = 3.0*x; + const auto x45 = radial_eval*(x10 + x31); + const auto x46 = 0.125*sqrt_35; + const auto x47 = 0.5*x; + const auto x48 = radial_eval*x32; + const auto x49 = y*y*y; + const auto x50 = radial_eval_alpha*y; + const auto x51 = 3.0*y; + const auto x52 = 0.25*y; + const auto x53 = -radial_eval*(x10 - 12.0*x13 + x19) + radial_eval_alpha*x13*x21; + const auto x54 = 3.0*z; + const auto x55 = radial_eval_alpha*z; + const auto x56 = 0.25*x; + // Evaluate basis function - basis_eval[ipt + 0*npts] = sqrt_35*radial_eval*x*y*(x*x - y*y)/2; - basis_eval[ipt + 1*npts] = sqrt_70*radial_eval*y*z*(3*x*x - y*y)/4; - basis_eval[ipt + 2*npts] = sqrt_5*radial_eval*x*y*(-x*x - y*y + 6*z*z)/2; - basis_eval[ipt + 3*npts] = sqrt_10*radial_eval*y*z*(-3*x*x - 3*y*y + 4*z*z)/4; - basis_eval[ipt + 4*npts] = radial_eval*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; - basis_eval[ipt + 5*npts] = sqrt_10*radial_eval*x*z*(-3*x*x - 3*y*y + 4*z*z)/4; - basis_eval[ipt + 6*npts] = sqrt_5*radial_eval*(-x*x*x*x + 6*x*x*z*z + y*y*y*y - 6*y*y*z*z)/4; - basis_eval[ipt + 7*npts] = sqrt_70*radial_eval*x*z*(x*x - 3*y*y)/4; - basis_eval[ipt + 8*npts] = sqrt_35*radial_eval*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + basis_eval[ipt + 0*npts] = x1*x2*x6; + basis_eval[ipt + 1*npts] = x11*x8*x9; + basis_eval[ipt + 2*npts] = x12*x16*x2; + basis_eval[ipt + 3*npts] = x17*x21*x9; + basis_eval[ipt + 4*npts] = x22*x28; + basis_eval[ipt + 5*npts] = x17*x2*x21; + basis_eval[ipt + 6*npts] = radial_eval*x29*x30; + basis_eval[ipt + 7*npts] = x2*x32*x8; + basis_eval[ipt + 8*npts] = sqrt_35*x22*x33; // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = sqrt_35*y*(radial_eval*(3*x*x - y*y) + radial_eval_alpha*x*x*(x*x - y*y))/2; - basis_x_eval[ipt + 1*npts] = sqrt_70*x*y*z*(6*radial_eval + radial_eval_alpha*(3*x*x - y*y))/4; - basis_x_eval[ipt + 2*npts] = sqrt_5*y*(-radial_eval*(3*x*x + y*y - 6*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 6*z*z))/2; - basis_x_eval[ipt + 3*npts] = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; - basis_x_eval[ipt + 4*npts] = x*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; - basis_x_eval[ipt + 5*npts] = sqrt_10*z*(-radial_eval*(9*x*x + 3*y*y - 4*z*z) - radial_eval_alpha*x*x*(3*x*x + 3*y*y - 4*z*z))/4; - basis_x_eval[ipt + 6*npts] = sqrt_5*x*(-4*radial_eval*(x*x - 3*z*z) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - basis_x_eval[ipt + 7*npts] = sqrt_70*z*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y))/4; - basis_x_eval[ipt + 8*npts] = sqrt_35*x*(4*radial_eval*(x*x - 3*y*y) + radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + basis_x_eval[ipt + 0*npts] = x1*(radial_eval_alpha*x3*x6 + x34); + basis_x_eval[ipt + 1*npts] = x36*(radial_eval_alpha*x11 + x37); + basis_x_eval[ipt + 2*npts] = -x12*(radial_eval*(x10 + x15) - radial_eval_alpha*x16*x3); + basis_x_eval[ipt + 3*npts] = x39; + basis_x_eval[ipt + 4*npts] = 0.125*x28*x42 + 0.125*x40*(-4.0*x*x13 + x*x4 + x41); + basis_x_eval[ipt + 5*npts] = -x17*(radial_eval*(x20 + 9.0*x3) - radial_eval_alpha*x21*x3); + basis_x_eval[ipt + 6*npts] = x29*(x30*x42 + 
x43*(x13*x44 - x41)); + basis_x_eval[ipt + 7*npts] = x8*(radial_eval_alpha*x3*x32 + x45); + basis_x_eval[ipt + 8*npts] = x46*(x33*x42 - x43*(x4*x44 - x41)); // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = sqrt_35*x*(-radial_eval*(-x*x + 3*y*y) + radial_eval_alpha*y*y*(x*x - y*y))/2; - basis_y_eval[ipt + 1*npts] = sqrt_70*z*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y))/4; - basis_y_eval[ipt + 2*npts] = sqrt_5*x*(-radial_eval*(x*x + 3*y*y - 6*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 6*z*z))/2; - basis_y_eval[ipt + 3*npts] = sqrt_10*z*(-radial_eval*(3*x*x + 9*y*y - 4*z*z) - radial_eval_alpha*y*y*(3*x*x + 3*y*y - 4*z*z))/4; - basis_y_eval[ipt + 4*npts] = y*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; - basis_y_eval[ipt + 5*npts] = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; - basis_y_eval[ipt + 6*npts] = sqrt_5*y*(4*radial_eval*(y*y - 3*z*z) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - basis_y_eval[ipt + 7*npts] = sqrt_70*x*y*z*(-6*radial_eval + radial_eval_alpha*(x*x - 3*y*y))/4; - basis_y_eval[ipt + 8*npts] = sqrt_35*y*(-4*radial_eval*(3*x*x - y*y) + radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + basis_y_eval[ipt + 0*npts] = sqrt_35*x47*(radial_eval_alpha*x4*x6 + x48); + basis_y_eval[ipt + 1*npts] = x8*(radial_eval_alpha*x11*x4 + x45); + basis_y_eval[ipt + 2*npts] = -sqrt_5*x47*(radial_eval*(x14 + x19 + x3) - radial_eval_alpha*x16*x4); + basis_y_eval[ipt + 3*npts] = -x17*(radial_eval*(x10 + x18 + 9.0*x4) - radial_eval_alpha*x21*x4); + basis_y_eval[ipt + 4*npts] = 0.125*x28*x50 + 0.125*x40*(-4.0*x13*y + x3*y + x49); + basis_y_eval[ipt + 5*npts] = x39; + basis_y_eval[ipt + 6*npts] = x29*(x30*x50 - x43*(x13*x51 - x49)); + basis_y_eval[ipt + 7*npts] = x36*(radial_eval_alpha*x32 + x38); + basis_y_eval[ipt + 8*npts] = x46*(x33*x50 - x43*(x3*x51 - x49)); // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = sqrt_35*radial_eval_alpha*x*y*z*(x*x - y*y)/2; - basis_z_eval[ipt + 1*npts] = sqrt_70*y*(radial_eval + radial_eval_alpha*z*z)*(3*x*x - y*y)/4; - basis_z_eval[ipt + 2*npts] = sqrt_5*x*y*z*(12*radial_eval - radial_eval_alpha*(x*x + y*y - 6*z*z))/2; - basis_z_eval[ipt + 3*npts] = sqrt_10*y*(3*radial_eval*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; - basis_z_eval[ipt + 4*npts] = z*(-16*radial_eval*(3*x*x + 3*y*y - 2*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; - basis_z_eval[ipt + 5*npts] = sqrt_10*x*(3*radial_eval*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; - basis_z_eval[ipt + 6*npts] = sqrt_5*z*(12*radial_eval*(x*x - y*y) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - basis_z_eval[ipt + 7*npts] = sqrt_70*x*(radial_eval + radial_eval_alpha*z*z)*(x*x - 3*y*y)/4; - basis_z_eval[ipt + 8*npts] = sqrt_35*radial_eval_alpha*z*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + basis_z_eval[ipt + 0*npts] = x1*x42*x6*z; + basis_z_eval[ipt + 1*npts] = sqrt_70*x52*(radial_eval_alpha*x11*x13 + x34); + basis_z_eval[ipt + 2*npts] = x*x12*z*(radial_eval_alpha*x16 + x40); + basis_z_eval[ipt + 3*npts] = sqrt_10*x52*x53; + basis_z_eval[ipt + 4*npts] = -2.0*radial_eval*(x3*x54 + x4*x54 - 2.0*z*z*z) + 0.125*x28*x55; + basis_z_eval[ipt + 5*npts] = sqrt_10*x53*x56; + basis_z_eval[ipt + 6*npts] = x29*z*(radial_eval_alpha*x30 + x40*x6); + basis_z_eval[ipt + 7*npts] = 
sqrt_70*x56*(radial_eval_alpha*x13*x32 + x48); + basis_z_eval[ipt + 8*npts] = x33*x46*x55; + + @@ -162,25 +225,25 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = sqrt_35*radial_eval*x*y*(x*x - y*y)/2; - ang_eval_1 = sqrt_70*radial_eval*y*z*(3*x*x - y*y)/4; - ang_eval_2 = sqrt_5*radial_eval*x*y*(-x*x - y*y + 6*z*z)/2; - ang_eval_3 = sqrt_10*radial_eval*y*z*(-3*x*x - 3*y*y + 4*z*z)/4; + ang_eval_0 = x1*x2*x6; + ang_eval_1 = x11*x8*x9; + ang_eval_2 = x12*x16*x2; + ang_eval_3 = x17*x21*x9; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = radial_eval*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; - ang_eval_1 = sqrt_10*radial_eval*x*z*(-3*x*x - 3*y*y + 4*z*z)/4; - ang_eval_2 = sqrt_5*radial_eval*(-x*x*x*x + 6*x*x*z*z + y*y*y*y - 6*y*y*z*z)/4; - ang_eval_3 = sqrt_70*radial_eval*x*z*(x*x - 3*y*y)/4; + ang_eval_0 = x22*x28; + ang_eval_1 = x17*x2*x21; + ang_eval_2 = radial_eval*x29*x30; + ang_eval_3 = x2*x32*x8; basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; basis_eval[ipt + 6*npts] = ang_eval_2; basis_eval[ipt + 7*npts] = ang_eval_3; - ang_eval_0 = sqrt_35*radial_eval*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + ang_eval_0 = sqrt_35*x22*x33; basis_eval[ipt + 8*npts] = ang_eval_0; @@ -189,18 +252,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; - dang_eval_x_0 = sqrt_35*y*(radial_eval*(3*x*x - y*y) + radial_eval_alpha*x*x*(x*x - y*y))/2; - dang_eval_y_0 = sqrt_35*x*(-radial_eval*(-x*x + 3*y*y) + radial_eval_alpha*y*y*(x*x - y*y))/2; - dang_eval_z_0 = sqrt_35*radial_eval_alpha*x*y*z*(x*x - y*y)/2; - dang_eval_x_1 = sqrt_70*x*y*z*(6*radial_eval + radial_eval_alpha*(3*x*x - y*y))/4; - dang_eval_y_1 = sqrt_70*z*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y))/4; - dang_eval_z_1 = sqrt_70*y*(radial_eval + radial_eval_alpha*z*z)*(3*x*x - y*y)/4; - dang_eval_x_2 = sqrt_5*y*(-radial_eval*(3*x*x + y*y - 6*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 6*z*z))/2; - dang_eval_y_2 = sqrt_5*x*(-radial_eval*(x*x + 3*y*y - 6*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 6*z*z))/2; - dang_eval_z_2 = sqrt_5*x*y*z*(12*radial_eval - radial_eval_alpha*(x*x + y*y - 6*z*z))/2; - dang_eval_x_3 = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; - dang_eval_y_3 = sqrt_10*z*(-radial_eval*(3*x*x + 9*y*y - 4*z*z) - radial_eval_alpha*y*y*(3*x*x + 3*y*y - 4*z*z))/4; - dang_eval_z_3 = sqrt_10*y*(3*radial_eval*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; + dang_eval_x_0 = x1*(radial_eval_alpha*x3*x6 + x34); + dang_eval_y_0 = sqrt_35*x47*(radial_eval_alpha*x4*x6 + x48); + dang_eval_z_0 = x1*x42*x6*z; + dang_eval_x_1 = x36*(radial_eval_alpha*x11 + x37); + dang_eval_y_1 = x8*(radial_eval_alpha*x11*x4 + x45); + dang_eval_z_1 = sqrt_70*x52*(radial_eval_alpha*x11*x13 + x34); + dang_eval_x_2 = -x12*(radial_eval*(x10 + x15) - radial_eval_alpha*x16*x3); + dang_eval_y_2 = -sqrt_5*x47*(radial_eval*(x14 + x19 + x3) - radial_eval_alpha*x16*x4); + dang_eval_z_2 = x*x12*z*(radial_eval_alpha*x16 + x40); + dang_eval_x_3 = x39; + dang_eval_y_3 = -x17*(radial_eval*(x10 + x18 + 9.0*x4) - radial_eval_alpha*x21*x4); + dang_eval_z_3 = sqrt_10*x52*x53; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; 
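// [Editorial annotation -- not part of the generated patch; illustrative only.]
// Both the removed and the rewritten gradient expressions implement the same product rule
// for a contracted Gaussian basis function phi = R(r^2) * P(x,y,z), where
// R = sum_i c_i exp(-a_i r^2) is 'radial_eval' and the angular polynomial P carries the
// sqrt_5 / sqrt_10 / sqrt_35 / sqrt_70 normalization factors:
//
//   d(phi)/dx = radial_eval * dP/dx + radial_eval_alpha * x * P
//
// with radial_eval_alpha = -2 * sum_i a_i c_i exp(-a_i r^2); the -2 chain-rule factor is
// applied once after the primitive loop ("radial_eval_alpha *= -2"). For example, with
// P = sqrt_70*y*z*(3*x*x - y*y)/4 this gives
// sqrt_70*x*y*z*(6*radial_eval + radial_eval_alpha*(3*x*x - y*y))/4, which is exactly the
// removed basis_x_eval[ipt + 1*npts] expression; the new code factors the same result
// through the x* temporaries.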
basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; @@ -214,18 +277,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = x*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; - dang_eval_y_0 = y*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; - dang_eval_z_0 = z*(-16*radial_eval*(3*x*x + 3*y*y - 2*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; - dang_eval_x_1 = sqrt_10*z*(-radial_eval*(9*x*x + 3*y*y - 4*z*z) - radial_eval_alpha*x*x*(3*x*x + 3*y*y - 4*z*z))/4; - dang_eval_y_1 = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; - dang_eval_z_1 = sqrt_10*x*(3*radial_eval*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; - dang_eval_x_2 = sqrt_5*x*(-4*radial_eval*(x*x - 3*z*z) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - dang_eval_y_2 = sqrt_5*y*(4*radial_eval*(y*y - 3*z*z) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - dang_eval_z_2 = sqrt_5*z*(12*radial_eval*(x*x - y*y) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - dang_eval_x_3 = sqrt_70*z*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y))/4; - dang_eval_y_3 = sqrt_70*x*y*z*(-6*radial_eval + radial_eval_alpha*(x*x - 3*y*y))/4; - dang_eval_z_3 = sqrt_70*x*(radial_eval + radial_eval_alpha*z*z)*(x*x - 3*y*y)/4; + dang_eval_x_0 = 0.125*x28*x42 + 0.125*x40*(-4.0*x*x13 + x*x4 + x41); + dang_eval_y_0 = 0.125*x28*x50 + 0.125*x40*(-4.0*x13*y + x3*y + x49); + dang_eval_z_0 = -2.0*radial_eval*(x3*x54 + x4*x54 - 2.0*z*z*z) + 0.125*x28*x55; + dang_eval_x_1 = -x17*(radial_eval*(x20 + 9.0*x3) - radial_eval_alpha*x21*x3); + dang_eval_y_1 = x39; + dang_eval_z_1 = sqrt_10*x53*x56; + dang_eval_x_2 = x29*(x30*x42 + x43*(x13*x44 - x41)); + dang_eval_y_2 = x29*(x30*x50 - x43*(x13*x51 - x49)); + dang_eval_z_2 = x29*z*(radial_eval_alpha*x30 + x40*x6); + dang_eval_x_3 = x8*(radial_eval_alpha*x3*x32 + x45); + dang_eval_y_3 = x36*(radial_eval_alpha*x32 + x38); + dang_eval_z_3 = sqrt_70*x56*(radial_eval_alpha*x13*x32 + x48); basis_x_eval[ipt + 4*npts] = dang_eval_x_0; basis_y_eval[ipt + 4*npts] = dang_eval_y_0; basis_z_eval[ipt + 4*npts] = dang_eval_z_0; @@ -239,9 +302,9 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 7*npts] = dang_eval_y_3; basis_z_eval[ipt + 7*npts] = dang_eval_z_3; - dang_eval_x_0 = sqrt_35*x*(4*radial_eval*(x*x - 3*y*y) + radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; - dang_eval_y_0 = sqrt_35*y*(-4*radial_eval*(3*x*x - y*y) + radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; - dang_eval_z_0 = sqrt_35*radial_eval_alpha*z*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + dang_eval_x_0 = x46*(x33*x42 - x43*(x4*x44 - x41)); + dang_eval_y_0 = x46*(x33*x50 - x43*(x3*x51 - x49)); + dang_eval_z_0 = x33*x46*x55; basis_x_eval[ipt + 8*npts] = dang_eval_x_0; basis_y_eval[ipt + 8*npts] = dang_eval_y_0; basis_z_eval[ipt + 8*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_hessian.hpp 
b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_hessian.hpp index bb3845ed..38db396f 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_hessian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_hessian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_spherical_hessian_4( +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_spherical_hessian_4( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; @@ -108,119 +111,298 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = 0.5*sqrt_35; + const auto x1 = x0*y; + const auto x2 = x*x1; + const auto x3 = x*x; + const auto x4 = x3; + const auto x5 = y*y; + const auto x6 = x5; + const auto x7 = -x6; + const auto x8 = x4 + x7; + const auto x9 = 0.25*sqrt_70; + const auto x10 = x9*z; + const auto x11 = x10*y; + const auto x12 = 3.0*x4; + const auto x13 = x12 + x7; + const auto x14 = 0.5*sqrt_5; + const auto x15 = x14*y; + const auto x16 = x*x15; + const auto x17 = z*z; + const auto x18 = x17; + const auto x19 = -6.0*x18; + const auto x20 = x19 + x6; + const auto x21 = -x20 - x4; + const auto x22 = 0.25*sqrt_10; + const auto x23 = x22*z; + const auto x24 = x23*y; + const auto x25 = -4.0*x18; + const auto x26 = 3.0*x6; + const auto x27 = x25 + x26; + const auto x28 = -x12 - x27; + const auto x29 = 0.125*radial_eval; + const auto x30 = x*x*x*x; + const auto x31 = y*y*y*y; + const auto x32 = 6.0*x4*x6; + const auto x33 = x18*x4; + const auto x34 = x18*x6; + const auto x35 = 3.0*x30 + 3.0*x31 + x32 - 24.0*x33 - 24.0*x34 + 8.0*(z*z*z*z); + const auto x36 = x*x23; + const auto x37 = 0.25*sqrt_5; + const auto x38 = -x30 + x31 + 6.0*x33 - 6.0*x34; + const auto x39 = x*x10; + const auto x40 = -x26; + const auto x41 = x4 + x40; + const auto x42 = x30 + x31 - x32; + const auto x43 = radial_eval*x13; + const auto x44 = x4*x8; + const auto x45 = x*x11; + const auto x46 = 6.0*radial_eval; + const auto x47 = 
radial_eval_alpha*x13; + const auto x48 = x46 + x47; + const auto x49 = -x12 - x20; + const auto x50 = x21*x4; + const auto x51 = -x46; + const auto x52 = x*x24*(radial_eval_alpha*x28 + x51); + const auto x53 = 12.0*radial_eval; + const auto x54 = x*x*x; + const auto x55 = 4.0*x; + const auto x56 = x*x6 - x18*x55 + x54; + const auto x57 = radial_eval_alpha*x; + const auto x58 = 9.0*x4; + const auto x59 = -x27 - x58; + const auto x60 = x28*x4; + const auto x61 = 4.0*radial_eval; + const auto x62 = 3.0*x; + const auto x63 = x18*x62 - x54; + const auto x64 = x12 + x40; + const auto x65 = radial_eval*x64; + const auto x66 = x4*x41; + const auto x67 = radial_eval_alpha*x66 + x65; + const auto x68 = 0.125*sqrt_35; + const auto x69 = x54 - x6*x62; + const auto x70 = x*x0; + const auto x71 = radial_eval*x41; + const auto x72 = x6*x8; + const auto x73 = x13*x6; + const auto x74 = radial_eval_alpha*x73 + x65; + const auto x75 = x*x14; + const auto x76 = x19 + x26; + const auto x77 = -x4 - x76; + const auto x78 = x21*x6; + const auto x79 = 9.0*x6; + const auto x80 = x12 + x25; + const auto x81 = -x79 - x80; + const auto x82 = x28*x6; + const auto x83 = y*y*y; + const auto x84 = 4.0*y; + const auto x85 = -x18*x84 + x4*y + x83; + const auto x86 = radial_eval_alpha*y; + const auto x87 = 3.0*y; + const auto x88 = -x18*x87 + x83; + const auto x89 = radial_eval_alpha*x41; + const auto x90 = x51 + x89; + const auto x91 = -x4*x87 + x83; + const auto x92 = x1*z; + const auto x93 = x9*y; + const auto x94 = x13*x18; + const auto x95 = x22*y; + const auto x96 = -12.0*x18; + const auto x97 = x26 + x96; + const auto x98 = -x12 - x97; + const auto x99 = x18*x28; + const auto x100 = radial_eval*x98 + radial_eval_alpha*x99; + const auto x101 = 3.0*z; + const auto x102 = -x101*x4 - x101*x6 + 2.0*(z*z*z); + const auto x103 = radial_eval_alpha*z; + const auto x104 = x37*z; + const auto x105 = x53*x8; + const auto x106 = x18*x41; + const auto x107 = 2.0*radial_eval_alpha; + const auto x108 = x107*x13; + const auto x109 = radial_eval_alpha + radial_eval_alpha_squared*x4; + const auto x110 = x108 + x109*x8; + const auto x111 = x109*x13; + const auto x112 = 12.0*radial_eval_alpha; + const auto x113 = x112*x4; + const auto x114 = x113 + x46; + const auto x115 = x107*x49 + x109*x21; + const auto x116 = x109*x35 + x53*(x6 + x80) + 24.0*x56*x57; + const auto x117 = -18.0*radial_eval; + const auto x118 = x109*x28; + const auto x119 = x107*x59 + x118; + const auto x120 = -x4; + const auto x121 = 8.0*x57; + const auto x122 = x109*x38 + x121*x63 + x53*(x120 + x18); + const auto x123 = x107*x64; + const auto x124 = x109*x41 + x123; + const auto x125 = x105 + x109*x42 + x121*x69; + const auto x126 = radial_eval_alpha*x3; + const auto x127 = radial_eval_alpha*x5; + const auto x128 = 6.0*radial_eval_alpha; + const auto x129 = x128*x6; + const auto x130 = radial_eval_alpha*x64; + const auto x131 = 24.0*radial_eval; + const auto x132 = x*x131; + const auto x133 = x132*y; + const auto x134 = 12.0*x57; + const auto x135 = 12.0*x86; + const auto x136 = radial_eval_alpha_squared*x; + const auto x137 = x136*y; + const auto x138 = -x128*x4 + x51; + const auto x139 = radial_eval_alpha*x55; + const auto x140 = radial_eval_alpha*x84; + const auto x141 = x*x93; + const auto x142 = x128*x18; + const auto x143 = -x142; + const auto x144 = x*x95*(radial_eval_alpha*x98 + radial_eval_alpha_squared*x99 + x143 + x51); + const auto x145 = 96.0*radial_eval*z; + const auto x146 = 12.0*x103; + const auto x147 = radial_eval_alpha*x17; + const auto x148 = 
4.0*radial_eval_alpha; + const auto x149 = x147*x64; + const auto x150 = x68*z; + const auto x151 = x107*x41; + const auto x152 = radial_eval_alpha + radial_eval_alpha_squared*x6; + const auto x153 = x151 + x152*x8; + const auto x154 = x123 + x13*x152; + const auto x155 = x107*x77 + x152*x21; + const auto x156 = x152*x28; + const auto x157 = x107*x81 + x156; + const auto x158 = x152*x35 + x53*(x27 + x4) + 24.0*x85*x86; + const auto x159 = x112*x6; + const auto x160 = x159 + x46; + const auto x161 = 8.0*x86; + const auto x162 = x152*x38 + x161*x88 - x53*(x18 - x6); + const auto x163 = x152*x42 + x161*x91 + x53*(x120 + x6); + const auto x164 = radial_eval_alpha_squared*y; + const auto x165 = radial_eval_alpha + radial_eval_alpha_squared*x18; + const auto x166 = x165*x8; + const auto x167 = x108 + x13*x165; + const auto x168 = 24.0*radial_eval_alpha*x18 + x165*x21; + const auto x169 = x107*x98 + x165*x28; + const auto x170 = x131 + x169; + const auto x171 = -48.0*radial_eval*(-2.0*x18 + x4 + x6) + 32.0*x102*x103 + x165*x35; + const auto x172 = x105 + 24.0*x147*x8 + x165*x38; + const auto x173 = x151 + x165*x41; + const auto x174 = x165*x42; + const auto x175 = -x159; + // Evaluate basis function - basis_eval[ipt + 0*npts] = sqrt_35*radial_eval*x*y*(x*x - y*y)/2; - basis_eval[ipt + 1*npts] = sqrt_70*radial_eval*y*z*(3*x*x - y*y)/4; - basis_eval[ipt + 2*npts] = sqrt_5*radial_eval*x*y*(-x*x - y*y + 6*z*z)/2; - basis_eval[ipt + 3*npts] = sqrt_10*radial_eval*y*z*(-3*x*x - 3*y*y + 4*z*z)/4; - basis_eval[ipt + 4*npts] = radial_eval*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; - basis_eval[ipt + 5*npts] = sqrt_10*radial_eval*x*z*(-3*x*x - 3*y*y + 4*z*z)/4; - basis_eval[ipt + 6*npts] = sqrt_5*radial_eval*(-x*x*x*x + 6*x*x*z*z + y*y*y*y - 6*y*y*z*z)/4; - basis_eval[ipt + 7*npts] = sqrt_70*radial_eval*x*z*(x*x - 3*y*y)/4; - basis_eval[ipt + 8*npts] = sqrt_35*radial_eval*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + basis_eval[ipt + 0*npts] = radial_eval*x2*x8; + basis_eval[ipt + 1*npts] = radial_eval*x11*x13; + basis_eval[ipt + 2*npts] = radial_eval*x16*x21; + basis_eval[ipt + 3*npts] = radial_eval*x24*x28; + basis_eval[ipt + 4*npts] = x29*x35; + basis_eval[ipt + 5*npts] = radial_eval*x28*x36; + basis_eval[ipt + 6*npts] = radial_eval*x37*x38; + basis_eval[ipt + 7*npts] = radial_eval*x39*x41; + basis_eval[ipt + 8*npts] = sqrt_35*x29*x42; // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = sqrt_35*y*(radial_eval*(3*x*x - y*y) + radial_eval_alpha*x*x*(x*x - y*y))/2; - basis_x_eval[ipt + 1*npts] = sqrt_70*x*y*z*(6*radial_eval + radial_eval_alpha*(3*x*x - y*y))/4; - basis_x_eval[ipt + 2*npts] = sqrt_5*y*(-radial_eval*(3*x*x + y*y - 6*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 6*z*z))/2; - basis_x_eval[ipt + 3*npts] = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; - basis_x_eval[ipt + 4*npts] = x*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; - basis_x_eval[ipt + 5*npts] = sqrt_10*z*(-radial_eval*(9*x*x + 3*y*y - 4*z*z) - radial_eval_alpha*x*x*(3*x*x + 3*y*y - 4*z*z))/4; - basis_x_eval[ipt + 6*npts] = sqrt_5*x*(-4*radial_eval*(x*x - 3*z*z) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - basis_x_eval[ipt + 7*npts] = sqrt_70*z*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y))/4; - basis_x_eval[ipt + 8*npts] = sqrt_35*x*(4*radial_eval*(x*x - 3*y*y) + radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + 
basis_x_eval[ipt + 0*npts] = x1*(radial_eval_alpha*x44 + x43); + basis_x_eval[ipt + 1*npts] = x45*x48; + basis_x_eval[ipt + 2*npts] = x15*(radial_eval*x49 + radial_eval_alpha*x50); + basis_x_eval[ipt + 3*npts] = x52; + basis_x_eval[ipt + 4*npts] = 0.125*x35*x57 + 0.125*x53*x56; + basis_x_eval[ipt + 5*npts] = x23*(radial_eval*x59 + radial_eval_alpha*x60); + basis_x_eval[ipt + 6*npts] = x37*(x38*x57 + x61*x63); + basis_x_eval[ipt + 7*npts] = x10*x67; + basis_x_eval[ipt + 8*npts] = x68*(x42*x57 + x61*x69); // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = sqrt_35*x*(-radial_eval*(-x*x + 3*y*y) + radial_eval_alpha*y*y*(x*x - y*y))/2; - basis_y_eval[ipt + 1*npts] = sqrt_70*z*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y))/4; - basis_y_eval[ipt + 2*npts] = sqrt_5*x*(-radial_eval*(x*x + 3*y*y - 6*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 6*z*z))/2; - basis_y_eval[ipt + 3*npts] = sqrt_10*z*(-radial_eval*(3*x*x + 9*y*y - 4*z*z) - radial_eval_alpha*y*y*(3*x*x + 3*y*y - 4*z*z))/4; - basis_y_eval[ipt + 4*npts] = y*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; - basis_y_eval[ipt + 5*npts] = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; - basis_y_eval[ipt + 6*npts] = sqrt_5*y*(4*radial_eval*(y*y - 3*z*z) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - basis_y_eval[ipt + 7*npts] = sqrt_70*x*y*z*(-6*radial_eval + radial_eval_alpha*(x*x - 3*y*y))/4; - basis_y_eval[ipt + 8*npts] = sqrt_35*y*(-4*radial_eval*(3*x*x - y*y) + radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + basis_y_eval[ipt + 0*npts] = x70*(radial_eval_alpha*x72 + x71); + basis_y_eval[ipt + 1*npts] = x10*x74; + basis_y_eval[ipt + 2*npts] = x75*(radial_eval*x77 + radial_eval_alpha*x78); + basis_y_eval[ipt + 3*npts] = x23*(radial_eval*x81 + radial_eval_alpha*x82); + basis_y_eval[ipt + 4*npts] = 0.125*x35*x86 + 0.125*x53*x85; + basis_y_eval[ipt + 5*npts] = x52; + basis_y_eval[ipt + 6*npts] = x37*(x38*x86 + x61*x88); + basis_y_eval[ipt + 7*npts] = x45*x90; + basis_y_eval[ipt + 8*npts] = x68*(x42*x86 + x61*x91); // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = sqrt_35*radial_eval_alpha*x*y*z*(x*x - y*y)/2; - basis_z_eval[ipt + 1*npts] = sqrt_70*y*(radial_eval + radial_eval_alpha*z*z)*(3*x*x - y*y)/4; - basis_z_eval[ipt + 2*npts] = sqrt_5*x*y*z*(12*radial_eval - radial_eval_alpha*(x*x + y*y - 6*z*z))/2; - basis_z_eval[ipt + 3*npts] = sqrt_10*y*(3*radial_eval*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; - basis_z_eval[ipt + 4*npts] = z*(-16*radial_eval*(3*x*x + 3*y*y - 2*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; - basis_z_eval[ipt + 5*npts] = sqrt_10*x*(3*radial_eval*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; - basis_z_eval[ipt + 6*npts] = sqrt_5*z*(12*radial_eval*(x*x - y*y) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - basis_z_eval[ipt + 7*npts] = sqrt_70*x*(radial_eval + radial_eval_alpha*z*z)*(x*x - 3*y*y)/4; - basis_z_eval[ipt + 8*npts] = sqrt_35*radial_eval_alpha*z*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + basis_z_eval[ipt + 0*npts] = x57*x8*x92; + basis_z_eval[ipt + 1*npts] = x93*(radial_eval_alpha*x94 + x43); + basis_z_eval[ipt + 2*npts] = x16*z*(radial_eval_alpha*x21 + x53); + basis_z_eval[ipt + 3*npts] = x100*x95; + basis_z_eval[ipt + 4*npts] = 2.0*radial_eval*x102 + 0.125*x103*x35; + 
basis_z_eval[ipt + 5*npts] = x*x100*x22; + basis_z_eval[ipt + 6*npts] = x104*(radial_eval_alpha*x38 + x105); + basis_z_eval[ipt + 7*npts] = x*x9*(radial_eval_alpha*x106 + x71); + basis_z_eval[ipt + 8*npts] = x103*x42*x68; // Evaluate second derivative of bfn wrt xx - basis_xx_eval[ipt + 0*npts] = sqrt_35*x*y*(6*radial_eval + 2*radial_eval_alpha*(3*x*x - y*y) + (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x - y*y))/2; - basis_xx_eval[ipt + 1*npts] = sqrt_70*y*z*(6*radial_eval + 12*radial_eval_alpha*x*x + (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(3*x*x - y*y))/4; - basis_xx_eval[ipt + 2*npts] = sqrt_5*x*y*(-6*radial_eval - 2*radial_eval_alpha*(3*x*x + y*y - 6*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x + y*y - 6*z*z))/2; - basis_xx_eval[ipt + 3*npts] = sqrt_10*y*z*(-6*radial_eval - 12*radial_eval_alpha*x*x - (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(3*x*x + 3*y*y - 4*z*z))/4; - basis_xx_eval[ipt + 4*npts] = 3*radial_eval*(3*x*x + y*y - 4*z*z)/2 + 3*radial_eval_alpha*x*x*(x*x + y*y - 4*z*z) + (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; - basis_xx_eval[ipt + 5*npts] = sqrt_10*x*z*(-18*radial_eval - 2*radial_eval_alpha*(9*x*x + 3*y*y - 4*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(3*x*x + 3*y*y - 4*z*z))/4; - basis_xx_eval[ipt + 6*npts] = sqrt_5*(-12*radial_eval*(x*x - z*z) - 8*radial_eval_alpha*x*x*(x*x - 3*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - basis_xx_eval[ipt + 7*npts] = sqrt_70*x*z*(6*radial_eval + 6*radial_eval_alpha*(x*x - y*y) + (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x - 3*y*y))/4; - basis_xx_eval[ipt + 8*npts] = sqrt_35*(12*radial_eval*(x*x - y*y) + 8*radial_eval_alpha*x*x*(x*x - 3*y*y) + (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + basis_xx_eval[ipt + 0*npts] = x2*(x110 + x46); + basis_xx_eval[ipt + 1*npts] = x11*(x111 + x114); + basis_xx_eval[ipt + 2*npts] = x16*(x115 + x51); + basis_xx_eval[ipt + 3*npts] = x24*(x109*x28 - x114); + basis_xx_eval[ipt + 4*npts] = 0.125*x116; + basis_xx_eval[ipt + 5*npts] = x36*(x117 + x119); + basis_xx_eval[ipt + 6*npts] = x122*x37; + basis_xx_eval[ipt + 7*npts] = x39*(x124 + x46); + basis_xx_eval[ipt + 8*npts] = x125*x68; // Evaluate second derivative of bfn wrt xy - basis_xy_eval[ipt + 0*npts] = sqrt_35*(3*radial_eval*x*x - 3*radial_eval*y*y + radial_eval_alpha*x*x*x*x - radial_eval_alpha*y*y*y*y + radial_eval_alpha_squared*x*x*x*x*y*y - radial_eval_alpha_squared*x*x*y*y*y*y)/2; - basis_xy_eval[ipt + 1*npts] = sqrt_70*x*z*(6*radial_eval + 3*radial_eval_alpha*x*x + 3*radial_eval_alpha*y*y + 3*radial_eval_alpha_squared*x*x*y*y - radial_eval_alpha_squared*y*y*y*y)/4; - basis_xy_eval[ipt + 2*npts] = sqrt_5*(-3*radial_eval*(x*x + y*y - 2*z*z) - radial_eval_alpha*x*x*(x*x + 3*y*y - 6*z*z) - radial_eval_alpha*y*y*(3*x*x + y*y - 6*z*z) - radial_eval_alpha_squared*x*x*y*y*(x*x + y*y - 6*z*z))/2; - basis_xy_eval[ipt + 3*npts] = sqrt_10*x*z*(-6*radial_eval - 6*radial_eval_alpha*y*y - radial_eval_alpha*(3*x*x + 9*y*y - 4*z*z) - radial_eval_alpha_squared*y*y*(3*x*x + 3*y*y - 4*z*z))/4; - basis_xy_eval[ipt + 4*npts] = x*y*(24*radial_eval + 24*radial_eval_alpha*(x*x + y*y - 4*z*z) + radial_eval_alpha_squared*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; - basis_xy_eval[ipt + 5*npts] = sqrt_10*y*z*(-6*radial_eval - 6*radial_eval_alpha*x*x - 
radial_eval_alpha*(9*x*x + 3*y*y - 4*z*z) - radial_eval_alpha_squared*x*x*(3*x*x + 3*y*y - 4*z*z))/4; - basis_xy_eval[ipt + 6*npts] = sqrt_5*x*y*(-4*radial_eval_alpha*x*x + 4*radial_eval_alpha*y*y - radial_eval_alpha_squared*x*x*x*x + 6*radial_eval_alpha_squared*x*x*z*z + radial_eval_alpha_squared*y*y*y*y - 6*radial_eval_alpha_squared*y*y*z*z)/4; - basis_xy_eval[ipt + 7*npts] = sqrt_70*y*z*(-6*radial_eval - 3*radial_eval_alpha*x*x - 3*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*x*x - 3*radial_eval_alpha_squared*x*x*y*y)/4; - basis_xy_eval[ipt + 8*npts] = sqrt_35*x*y*(-24*radial_eval - 8*radial_eval_alpha*x*x - 8*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*x*x - 6*radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*y*y*y*y)/8; + basis_xy_eval[ipt + 0*npts] = x0*(radial_eval_alpha_squared*x4*x6*x8 + x126*x41 + x127*x13 + x65); + basis_xy_eval[ipt + 1*npts] = x39*(radial_eval_alpha_squared*x73 + x129 + x130 + x46); + basis_xy_eval[ipt + 2*npts] = x14*(-radial_eval*(x12 + x76) + radial_eval_alpha_squared*x21*x4*x6 + x126*x77 + x127*x49); + basis_xy_eval[ipt + 3*npts] = x36*(radial_eval_alpha*x81 + radial_eval_alpha_squared*x82 - x129 + x51); + basis_xy_eval[ipt + 4*npts] = 0.125*x133 + 0.125*x134*x85 + 0.125*x135*x56 + 0.125*x137*x35; + basis_xy_eval[ipt + 5*npts] = x24*(radial_eval_alpha*x59 + radial_eval_alpha_squared*x60 + x138); + basis_xy_eval[ipt + 6*npts] = x37*(x137*x38 + x139*x88 + x140*x63); + basis_xy_eval[ipt + 7*npts] = x11*(radial_eval_alpha_squared*x66 + x130 + x138); + basis_xy_eval[ipt + 8*npts] = x68*(-x133 + x137*x42 + x139*x91 + x140*x69); // Evaluate second derivative of bfn wrt xz - basis_xz_eval[ipt + 0*npts] = sqrt_35*y*z*(radial_eval_alpha*(3*x*x - y*y) + radial_eval_alpha_squared*x*x*(x*x - y*y))/2; - basis_xz_eval[ipt + 1*npts] = sqrt_70*x*y*(6*radial_eval + 6*radial_eval_alpha*z*z + radial_eval_alpha*(3*x*x - y*y) + radial_eval_alpha_squared*z*z*(3*x*x - y*y))/4; - basis_xz_eval[ipt + 2*npts] = sqrt_5*y*z*(12*radial_eval + 12*radial_eval_alpha*x*x - radial_eval_alpha*(3*x*x + y*y - 6*z*z) - radial_eval_alpha_squared*x*x*(x*x + y*y - 6*z*z))/2; - basis_xz_eval[ipt + 3*npts] = sqrt_10*x*y*(-6*radial_eval - 6*radial_eval_alpha*z*z + 3*radial_eval_alpha*(-x*x - y*y + 4*z*z) - radial_eval_alpha_squared*z*z*(3*x*x + 3*y*y - 4*z*z))/4; - basis_xz_eval[ipt + 4*npts] = x*z*(-96*radial_eval - 36*radial_eval_alpha*x*x - 36*radial_eval_alpha*y*y - 16*radial_eval_alpha*z*z + 3*radial_eval_alpha_squared*x*x*x*x + 6*radial_eval_alpha_squared*x*x*y*y - 24*radial_eval_alpha_squared*x*x*z*z + 3*radial_eval_alpha_squared*y*y*y*y - 24*radial_eval_alpha_squared*y*y*z*z + 8*radial_eval_alpha_squared*z*z*z*z)/8; - basis_xz_eval[ipt + 5*npts] = sqrt_10*(-3*radial_eval*(3*x*x + y*y - 4*z*z) + 3*radial_eval_alpha*x*x*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(9*x*x + 3*y*y - 4*z*z) - radial_eval_alpha_squared*x*x*z*z*(3*x*x + 3*y*y - 4*z*z))/4; - basis_xz_eval[ipt + 6*npts] = sqrt_5*x*z*(24*radial_eval + 12*radial_eval_alpha*(x*x - y*y) - 4*radial_eval_alpha*(x*x - 3*z*z) - radial_eval_alpha_squared*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - basis_xz_eval[ipt + 7*npts] = sqrt_70*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y) + 3*radial_eval_alpha*z*z*(x*x - y*y) + radial_eval_alpha_squared*x*x*z*z*(x*x - 3*y*y))/4; - basis_xz_eval[ipt + 8*npts] = sqrt_35*x*z*(4*radial_eval_alpha*(x*x - 3*y*y) + radial_eval_alpha_squared*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + basis_xz_eval[ipt + 0*npts] = x92*(radial_eval_alpha_squared*x44 + x47); + 
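// [Editorial annotation -- not part of the generated patch; illustrative only.]
// The Hessian terms follow the same pattern one order higher. For phi = R(r^2) * P(x,y,z)
// with R = sum_i c_i exp(-a_i r^2):
//
//   d2(phi)/dx2 = radial_eval * d2P/dx2
//               + 2 * radial_eval_alpha * x * dP/dx
//               + (radial_eval_alpha + radial_eval_alpha_squared * x*x) * P
//
// where radial_eval_alpha         = -2 * sum_i a_i   * c_i * exp(-a_i r^2)
//   and radial_eval_alpha_squared =  4 * sum_i a_i^2 * c_i * exp(-a_i r^2),
// matching the post-loop scalings ("*= -2", "*= 4") and the recurring factor
// (radial_eval_alpha + radial_eval_alpha_squared*x*x) in the removed basis_xx_eval
// expressions above. The yy/zz and the mixed xy/xz/yz derivatives follow analogously
// from the same product rule.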
basis_xz_eval[ipt + 1*npts] = x141*(radial_eval_alpha_squared*x94 + x142 + x48); + basis_xz_eval[ipt + 2*npts] = x15*z*(radial_eval_alpha*x49 + radial_eval_alpha_squared*x50 + x113 + x53); + basis_xz_eval[ipt + 3*npts] = x144; + basis_xz_eval[ipt + 4*npts] = -0.125*x*x145 + 2.0*x102*x57 + 0.125*x136*x35*z + 0.125*x146*x56; + basis_xz_eval[ipt + 5*npts] = x22*(-radial_eval*(x58 + x97) + radial_eval_alpha_squared*x18*x28*x4 + x126*x98 + x147*x59); + basis_xz_eval[ipt + 6*npts] = x104*(x132 + x134*x8 + x136*x38 + x148*x63); + basis_xz_eval[ipt + 7*npts] = x9*(radial_eval_alpha_squared*x18*x4*x41 + x149 + x67); + basis_xz_eval[ipt + 8*npts] = x150*(x136*x42 + x148*x69); // Evaluate second derivative of bfn wrt yy - basis_yy_eval[ipt + 0*npts] = sqrt_35*x*y*(-6*radial_eval - 2*radial_eval_alpha*(-x*x + 3*y*y) + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x - y*y))/2; - basis_yy_eval[ipt + 1*npts] = sqrt_70*y*z*(-6*radial_eval - 6*radial_eval_alpha*(-x*x + y*y) + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(3*x*x - y*y))/4; - basis_yy_eval[ipt + 2*npts] = sqrt_5*x*y*(-6*radial_eval - 2*radial_eval_alpha*(x*x + 3*y*y - 6*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x + y*y - 6*z*z))/2; - basis_yy_eval[ipt + 3*npts] = sqrt_10*y*z*(-18*radial_eval - 2*radial_eval_alpha*(3*x*x + 9*y*y - 4*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(3*x*x + 3*y*y - 4*z*z))/4; - basis_yy_eval[ipt + 4*npts] = 3*radial_eval*(x*x + 3*y*y - 4*z*z)/2 + 3*radial_eval_alpha*y*y*(x*x + y*y - 4*z*z) + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; - basis_yy_eval[ipt + 5*npts] = sqrt_10*x*z*(-6*radial_eval - 12*radial_eval_alpha*y*y - (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(3*x*x + 3*y*y - 4*z*z))/4; - basis_yy_eval[ipt + 6*npts] = sqrt_5*(12*radial_eval*(y*y - z*z) + 8*radial_eval_alpha*y*y*(y*y - 3*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - basis_yy_eval[ipt + 7*npts] = sqrt_70*x*z*(-6*radial_eval - 12*radial_eval_alpha*y*y + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x - 3*y*y))/4; - basis_yy_eval[ipt + 8*npts] = sqrt_35*(-12*radial_eval*(x*x - y*y) - 8*radial_eval_alpha*y*y*(3*x*x - y*y) + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + basis_yy_eval[ipt + 0*npts] = x2*(x153 + x51); + basis_yy_eval[ipt + 1*npts] = x11*(x154 + x51); + basis_yy_eval[ipt + 2*npts] = x16*(x155 + x51); + basis_yy_eval[ipt + 3*npts] = x24*(x117 + x157); + basis_yy_eval[ipt + 4*npts] = 0.125*x158; + basis_yy_eval[ipt + 5*npts] = x36*(x152*x28 - x160); + basis_yy_eval[ipt + 6*npts] = x162*x37; + basis_yy_eval[ipt + 7*npts] = x39*(x152*x41 - x160); + basis_yy_eval[ipt + 8*npts] = x163*x68; // Evaluate second derivative of bfn wrt yz - basis_yz_eval[ipt + 0*npts] = sqrt_35*x*z*(-radial_eval_alpha*(-x*x + 3*y*y) + radial_eval_alpha_squared*y*y*(x*x - y*y))/2; - basis_yz_eval[ipt + 1*npts] = sqrt_70*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y) - 3*radial_eval_alpha*z*z*(-x*x + y*y) + radial_eval_alpha_squared*y*y*z*z*(3*x*x - y*y))/4; - basis_yz_eval[ipt + 2*npts] = sqrt_5*x*z*(12*radial_eval + 12*radial_eval_alpha*y*y - radial_eval_alpha*(x*x + 3*y*y - 6*z*z) - radial_eval_alpha_squared*y*y*(x*x + y*y - 6*z*z))/2; - basis_yz_eval[ipt + 3*npts] = sqrt_10*(-3*radial_eval*(x*x + 3*y*y - 4*z*z) + 3*radial_eval_alpha*y*y*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x 
+ 9*y*y - 4*z*z) - radial_eval_alpha_squared*y*y*z*z*(3*x*x + 3*y*y - 4*z*z))/4; - basis_yz_eval[ipt + 4*npts] = y*z*(-96*radial_eval - 36*radial_eval_alpha*x*x - 36*radial_eval_alpha*y*y - 16*radial_eval_alpha*z*z + 3*radial_eval_alpha_squared*x*x*x*x + 6*radial_eval_alpha_squared*x*x*y*y - 24*radial_eval_alpha_squared*x*x*z*z + 3*radial_eval_alpha_squared*y*y*y*y - 24*radial_eval_alpha_squared*y*y*z*z + 8*radial_eval_alpha_squared*z*z*z*z)/8; - basis_yz_eval[ipt + 5*npts] = sqrt_10*x*y*(-6*radial_eval - 6*radial_eval_alpha*z*z + 3*radial_eval_alpha*(-x*x - y*y + 4*z*z) - radial_eval_alpha_squared*z*z*(3*x*x + 3*y*y - 4*z*z))/4; - basis_yz_eval[ipt + 6*npts] = sqrt_5*y*z*(-24*radial_eval + 12*radial_eval_alpha*(x*x - y*y) + 4*radial_eval_alpha*(y*y - 3*z*z) - radial_eval_alpha_squared*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - basis_yz_eval[ipt + 7*npts] = sqrt_70*x*y*(-6*radial_eval - 6*radial_eval_alpha*z*z + radial_eval_alpha*(x*x - 3*y*y) + radial_eval_alpha_squared*z*z*(x*x - 3*y*y))/4; - basis_yz_eval[ipt + 8*npts] = sqrt_35*y*z*(-4*radial_eval_alpha*(3*x*x - y*y) + radial_eval_alpha_squared*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + basis_yz_eval[ipt + 0*npts] = x70*z*(radial_eval_alpha_squared*x72 + x89); + basis_yz_eval[ipt + 1*npts] = x9*(radial_eval_alpha_squared*x13*x18*x6 + x149 + x74); + basis_yz_eval[ipt + 2*npts] = x75*z*(radial_eval_alpha*x77 + radial_eval_alpha_squared*x78 + x159 + x53); + basis_yz_eval[ipt + 3*npts] = x22*(-radial_eval*(x12 + x79 + x96) + radial_eval_alpha_squared*x18*x28*x6 + x127*x98 + x147*x81); + basis_yz_eval[ipt + 4*npts] = 2.0*x102*x86 - 0.125*x145*y + 0.125*x146*x85 + 0.125*x164*x35*z; + basis_yz_eval[ipt + 5*npts] = x144; + basis_yz_eval[ipt + 6*npts] = x104*(-x131*y + x135*x8 + x148*x88 + x164*x38); + basis_yz_eval[ipt + 7*npts] = x141*(radial_eval_alpha_squared*x106 + x143 + x90); + basis_yz_eval[ipt + 8*npts] = x150*(x148*x91 + x164*x42); // Evaluate second derivative of bfn wrt zz - basis_zz_eval[ipt + 0*npts] = sqrt_35*x*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z)*(x*x - y*y)/2; - basis_zz_eval[ipt + 1*npts] = sqrt_70*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z)*(3*x*x - y*y)/4; - basis_zz_eval[ipt + 2*npts] = sqrt_5*x*y*(12*radial_eval + 24*radial_eval_alpha*z*z - (radial_eval_alpha + radial_eval_alpha_squared*z*z)*(x*x + y*y - 6*z*z))/2; - basis_zz_eval[ipt + 3*npts] = sqrt_10*y*z*(24*radial_eval + 6*radial_eval_alpha*(-x*x - y*y + 4*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*z*z)*(3*x*x + 3*y*y - 4*z*z))/4; - basis_zz_eval[ipt + 4*npts] = -6*radial_eval*(x*x + y*y - 2*z*z) - 4*radial_eval_alpha*z*z*(3*x*x + 3*y*y - 2*z*z) + (radial_eval_alpha + radial_eval_alpha_squared*z*z)*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; - basis_zz_eval[ipt + 5*npts] = sqrt_10*x*z*(24*radial_eval + 6*radial_eval_alpha*(-x*x - y*y + 4*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*z*z)*(3*x*x + 3*y*y - 4*z*z))/4; - basis_zz_eval[ipt + 6*npts] = sqrt_5*(12*radial_eval*(x*x - y*y) + 24*radial_eval_alpha*z*z*(x*x - y*y) - (radial_eval_alpha + radial_eval_alpha_squared*z*z)*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - basis_zz_eval[ipt + 7*npts] = sqrt_70*x*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z)*(x*x - 3*y*y)/4; - basis_zz_eval[ipt + 8*npts] = sqrt_35*(radial_eval_alpha + radial_eval_alpha_squared*z*z)*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + basis_zz_eval[ipt + 0*npts] = x166*x2; + basis_zz_eval[ipt + 1*npts] = x11*x167; + basis_zz_eval[ipt + 2*npts] = x16*(x168 + 
x53); + basis_zz_eval[ipt + 3*npts] = x170*x24; + basis_zz_eval[ipt + 4*npts] = 0.125*x171; + basis_zz_eval[ipt + 5*npts] = x170*x36; + basis_zz_eval[ipt + 6*npts] = x172*x37; + basis_zz_eval[ipt + 7*npts] = x173*x39; + basis_zz_eval[ipt + 8*npts] = x174*x68; + + @@ -236,25 +418,25 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = sqrt_35*radial_eval*x*y*(x*x - y*y)/2; - ang_eval_1 = sqrt_70*radial_eval*y*z*(3*x*x - y*y)/4; - ang_eval_2 = sqrt_5*radial_eval*x*y*(-x*x - y*y + 6*z*z)/2; - ang_eval_3 = sqrt_10*radial_eval*y*z*(-3*x*x - 3*y*y + 4*z*z)/4; + ang_eval_0 = radial_eval*x2*x8; + ang_eval_1 = radial_eval*x11*x13; + ang_eval_2 = radial_eval*x16*x21; + ang_eval_3 = radial_eval*x24*x28; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = radial_eval*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; - ang_eval_1 = sqrt_10*radial_eval*x*z*(-3*x*x - 3*y*y + 4*z*z)/4; - ang_eval_2 = sqrt_5*radial_eval*(-x*x*x*x + 6*x*x*z*z + y*y*y*y - 6*y*y*z*z)/4; - ang_eval_3 = sqrt_70*radial_eval*x*z*(x*x - 3*y*y)/4; + ang_eval_0 = x29*x35; + ang_eval_1 = radial_eval*x28*x36; + ang_eval_2 = radial_eval*x37*x38; + ang_eval_3 = radial_eval*x39*x41; basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; basis_eval[ipt + 6*npts] = ang_eval_2; basis_eval[ipt + 7*npts] = ang_eval_3; - ang_eval_0 = sqrt_35*radial_eval*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + ang_eval_0 = sqrt_35*x29*x42; basis_eval[ipt + 8*npts] = ang_eval_0; @@ -263,18 +445,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; - dang_eval_x_0 = sqrt_35*y*(radial_eval*(3*x*x - y*y) + radial_eval_alpha*x*x*(x*x - y*y))/2; - dang_eval_y_0 = sqrt_35*x*(-radial_eval*(-x*x + 3*y*y) + radial_eval_alpha*y*y*(x*x - y*y))/2; - dang_eval_z_0 = sqrt_35*radial_eval_alpha*x*y*z*(x*x - y*y)/2; - dang_eval_x_1 = sqrt_70*x*y*z*(6*radial_eval + radial_eval_alpha*(3*x*x - y*y))/4; - dang_eval_y_1 = sqrt_70*z*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y))/4; - dang_eval_z_1 = sqrt_70*y*(radial_eval + radial_eval_alpha*z*z)*(3*x*x - y*y)/4; - dang_eval_x_2 = sqrt_5*y*(-radial_eval*(3*x*x + y*y - 6*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 6*z*z))/2; - dang_eval_y_2 = sqrt_5*x*(-radial_eval*(x*x + 3*y*y - 6*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 6*z*z))/2; - dang_eval_z_2 = sqrt_5*x*y*z*(12*radial_eval - radial_eval_alpha*(x*x + y*y - 6*z*z))/2; - dang_eval_x_3 = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; - dang_eval_y_3 = sqrt_10*z*(-radial_eval*(3*x*x + 9*y*y - 4*z*z) - radial_eval_alpha*y*y*(3*x*x + 3*y*y - 4*z*z))/4; - dang_eval_z_3 = sqrt_10*y*(3*radial_eval*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; + dang_eval_x_0 = x1*(radial_eval_alpha*x44 + x43); + dang_eval_y_0 = x70*(radial_eval_alpha*x72 + x71); + dang_eval_z_0 = x57*x8*x92; + dang_eval_x_1 = x45*x48; + dang_eval_y_1 = x10*x74; + dang_eval_z_1 = x93*(radial_eval_alpha*x94 + x43); + dang_eval_x_2 = x15*(radial_eval*x49 + radial_eval_alpha*x50); + dang_eval_y_2 = x75*(radial_eval*x77 + radial_eval_alpha*x78); + dang_eval_z_2 = x16*z*(radial_eval_alpha*x21 + x53); + dang_eval_x_3 = x52; + dang_eval_y_3 = x23*(radial_eval*x81 + radial_eval_alpha*x82); + 
dang_eval_z_3 = x100*x95; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; @@ -288,18 +470,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = x*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; - dang_eval_y_0 = y*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; - dang_eval_z_0 = z*(-16*radial_eval*(3*x*x + 3*y*y - 2*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; - dang_eval_x_1 = sqrt_10*z*(-radial_eval*(9*x*x + 3*y*y - 4*z*z) - radial_eval_alpha*x*x*(3*x*x + 3*y*y - 4*z*z))/4; - dang_eval_y_1 = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; - dang_eval_z_1 = sqrt_10*x*(3*radial_eval*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; - dang_eval_x_2 = sqrt_5*x*(-4*radial_eval*(x*x - 3*z*z) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - dang_eval_y_2 = sqrt_5*y*(4*radial_eval*(y*y - 3*z*z) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - dang_eval_z_2 = sqrt_5*z*(12*radial_eval*(x*x - y*y) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - dang_eval_x_3 = sqrt_70*z*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y))/4; - dang_eval_y_3 = sqrt_70*x*y*z*(-6*radial_eval + radial_eval_alpha*(x*x - 3*y*y))/4; - dang_eval_z_3 = sqrt_70*x*(radial_eval + radial_eval_alpha*z*z)*(x*x - 3*y*y)/4; + dang_eval_x_0 = 0.125*x35*x57 + 0.125*x53*x56; + dang_eval_y_0 = 0.125*x35*x86 + 0.125*x53*x85; + dang_eval_z_0 = 2.0*radial_eval*x102 + 0.125*x103*x35; + dang_eval_x_1 = x23*(radial_eval*x59 + radial_eval_alpha*x60); + dang_eval_y_1 = x52; + dang_eval_z_1 = x*x100*x22; + dang_eval_x_2 = x37*(x38*x57 + x61*x63); + dang_eval_y_2 = x37*(x38*x86 + x61*x88); + dang_eval_z_2 = x104*(radial_eval_alpha*x38 + x105); + dang_eval_x_3 = x10*x67; + dang_eval_y_3 = x45*x90; + dang_eval_z_3 = x*x9*(radial_eval_alpha*x106 + x71); basis_x_eval[ipt + 4*npts] = dang_eval_x_0; basis_y_eval[ipt + 4*npts] = dang_eval_y_0; basis_z_eval[ipt + 4*npts] = dang_eval_z_0; @@ -313,9 +495,9 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 7*npts] = dang_eval_y_3; basis_z_eval[ipt + 7*npts] = dang_eval_z_3; - dang_eval_x_0 = sqrt_35*x*(4*radial_eval*(x*x - 3*y*y) + radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; - dang_eval_y_0 = sqrt_35*y*(-4*radial_eval*(3*x*x - y*y) + radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; - dang_eval_z_0 = sqrt_35*radial_eval_alpha*z*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + dang_eval_x_0 = x68*(x42*x57 + x61*x69); + dang_eval_y_0 = x68*(x42*x86 + x61*x91); + dang_eval_z_0 = x103*x42*x68; basis_x_eval[ipt + 8*npts] = dang_eval_x_0; basis_y_eval[ipt + 8*npts] = dang_eval_y_0; basis_z_eval[ipt + 8*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_lapgrad.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_lapgrad.hpp new file mode 100644 index 00000000..b895836c --- /dev/null +++ 
b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_lapgrad.hpp @@ -0,0 +1,663 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#pragma once +#include "collocation_device_constants.hpp" +#include "device/xc_device_task.hpp" +#include "device_specific/cuda_device_constants.hpp" +#include "device/common/shell_to_task.hpp" +#include + +namespace GauXC { + + +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_spherical_lapgrad_4( + uint32_t nshell, + ShellToTaskDevice* __restrict__ shell_to_task, + XCDeviceTask* __restrict__ device_tasks +) { + + + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; + double* my_alpha = alpha[threadIdx.x/32]; + double* my_coeff = coeff[threadIdx.x/32]; + + for( auto ish = blockIdx.z; ish < nshell; ish += gridDim.z ) { + const uint32_t ntasks = shell_to_task[ish].ntask; + const auto shell = shell_to_task[ish].shell_device; + const auto task_idx = shell_to_task[ish].task_idx_device; + const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device; + + + // Load Shell Data into registers / SM + const uint32_t nprim = shell->nprim(); + const double3 O = *reinterpret_cast(shell->O_data()); + + const int global_warp_id = (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size; + const int nwarp_global = max((blockDim.x*gridDim.x) / cuda::warp_size,1); + + // Read in coeffs/exps into SM on first warp + { + auto* coeff_gm = shell->coeff_data(); + auto* alpha_gm = shell->alpha_data(); + static_assert( detail::shell_nprim_max == cuda::warp_size ); + const int warp_rank = threadIdx.x % cuda::warp_size; + my_alpha[warp_rank] = alpha_gm[warp_rank]; + my_coeff[warp_rank] = coeff_gm[warp_rank]; + } + + // Loop over tasks assigned to shells + // Place each task on a different warp + schedule across blocks + for( int itask = global_warp_id; itask < ntasks; itask += nwarp_global ) { + + const auto* task = device_tasks + task_idx[itask]; + const auto* __restrict__ points_x = task->points_x; + const auto* __restrict__ points_y = task->points_y; + const auto* __restrict__ points_z = task->points_z; + const uint32_t npts = task->npts; + const size_t shoff = task_shell_offs[itask] * npts; + + auto* __restrict__ basis_eval = task->bf + shoff; + auto* __restrict__ basis_x_eval = task->dbfx + shoff; + auto* __restrict__ basis_y_eval = task->dbfy + shoff; + auto* __restrict__ basis_z_eval = task->dbfz + shoff; + auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; + auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; + auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; + auto* __restrict__ basis_yy_eval = task->d2bfyy + shoff; + auto* __restrict__ basis_yz_eval = task->d2bfyz + shoff; + auto* __restrict__ basis_zz_eval = task->d2bfzz + shoff; + auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; + auto* __restrict__ basis_lapl_x_eval = task->d3bflapl_x + shoff; + auto* __restrict__ basis_lapl_y_eval = task->d3bflapl_y + shoff; + auto* __restrict__ basis_lapl_z_eval = task->d3bflapl_z + shoff; + + // Loop over points in task + // Assign each point to separate thread within the warp + #pragma unroll 1 + for( int ipt = 
threadIdx.x % cuda::warp_size; ipt < npts; ipt += cuda::warp_size ) { + //const double3 point = points[ipt]; + double3 point; + point.x = points_x[ipt]; + point.y = points_y[ipt]; + point.z = points_z[ipt]; + + + const auto x = point.x - O.x; + const auto y = point.y - O.y; + const auto z = point.z - O.z; + const auto rsq = x*x + y*y + z*z; + + // Evaluate radial part of bfn + double radial_eval = 0.; + double radial_eval_alpha = 0.; + double radial_eval_alpha_squared = 0.; + double radial_eval_alpha_cubed = 0.; + + #pragma unroll 1 + for( uint32_t i = 0; i < nprim; ++i ) { + const auto a = my_alpha[i]; + const auto e = my_coeff[i] * std::exp( - a * rsq ); + + radial_eval += e; + radial_eval_alpha += a * e; + radial_eval_alpha_squared += a * a * e; + radial_eval_alpha_cubed += a * a * a * e; + } + + radial_eval_alpha *= -2; + radial_eval_alpha_squared *= 4; + radial_eval_alpha_cubed *= -8; + + // Common Subexpressions + const auto x0 = 0.5*sqrt_35; + const auto x1 = x0*y; + const auto x2 = x*x1; + const auto x3 = x*x; + const auto x4 = x3; + const auto x5 = y*y; + const auto x6 = x5; + const auto x7 = -x6; + const auto x8 = x4 + x7; + const auto x9 = 0.25*sqrt_70; + const auto x10 = x9*z; + const auto x11 = x10*y; + const auto x12 = 3.0*x4; + const auto x13 = x12 + x7; + const auto x14 = 0.5*sqrt_5; + const auto x15 = x14*y; + const auto x16 = x*x15; + const auto x17 = z*z; + const auto x18 = x17; + const auto x19 = -6.0*x18; + const auto x20 = x19 + x6; + const auto x21 = -x20 - x4; + const auto x22 = 0.25*sqrt_10; + const auto x23 = x22*z; + const auto x24 = x23*y; + const auto x25 = -4.0*x18; + const auto x26 = 3.0*x6; + const auto x27 = x25 + x26; + const auto x28 = -x12 - x27; + const auto x29 = 0.125*radial_eval; + const auto x30 = x*x*x*x; + const auto x31 = y*y*y*y; + const auto x32 = x4*x6; + const auto x33 = 6.0*x32; + const auto x34 = x18*x4; + const auto x35 = x18*x6; + const auto x36 = 3.0*x30 + 3.0*x31 + x33 - 24.0*x34 - 24.0*x35 + 8.0*(z*z*z*z); + const auto x37 = x*x23; + const auto x38 = 0.25*sqrt_5; + const auto x39 = -x30 + x31 + 6.0*x34 - 6.0*x35; + const auto x40 = x*x10; + const auto x41 = -x26; + const auto x42 = x4 + x41; + const auto x43 = x30 + x31 - x33; + const auto x44 = radial_eval*x13; + const auto x45 = x4*x8; + const auto x46 = x*x11; + const auto x47 = 6.0*radial_eval; + const auto x48 = radial_eval_alpha*x13; + const auto x49 = x47 + x48; + const auto x50 = -x12 - x20; + const auto x51 = x21*x4; + const auto x52 = -x47; + const auto x53 = x*x24*(radial_eval_alpha*x28 + x52); + const auto x54 = 12.0*radial_eval; + const auto x55 = x*x*x; + const auto x56 = 4.0*x; + const auto x57 = x*x6 - x18*x56 + x55; + const auto x58 = radial_eval_alpha*x; + const auto x59 = 9.0*x4; + const auto x60 = -x27 - x59; + const auto x61 = x28*x4; + const auto x62 = 4.0*radial_eval; + const auto x63 = 3.0*x; + const auto x64 = x18*x63 - x55; + const auto x65 = x12 + x41; + const auto x66 = radial_eval*x65; + const auto x67 = x4*x42; + const auto x68 = radial_eval_alpha*x67 + x66; + const auto x69 = 0.125*sqrt_35; + const auto x70 = x55 - x6*x63; + const auto x71 = x*x0; + const auto x72 = radial_eval*x42; + const auto x73 = x6*x8; + const auto x74 = x13*x6; + const auto x75 = radial_eval_alpha*x74 + x66; + const auto x76 = x*x14; + const auto x77 = x19 + x26; + const auto x78 = -x4 - x77; + const auto x79 = x21*x6; + const auto x80 = 9.0*x6; + const auto x81 = x12 + x25; + const auto x82 = -x80 - x81; + const auto x83 = x28*x6; + const auto x84 = y*y*y; + const auto x85 = 
4.0*y; + const auto x86 = -x18*x85 + x4*y + x84; + const auto x87 = radial_eval_alpha*y; + const auto x88 = 3.0*y; + const auto x89 = -x18*x88 + x84; + const auto x90 = radial_eval_alpha*x42; + const auto x91 = x52 + x90; + const auto x92 = -x4*x88 + x84; + const auto x93 = x1*z; + const auto x94 = x9*y; + const auto x95 = x13*x18; + const auto x96 = x22*y; + const auto x97 = -12.0*x18; + const auto x98 = x26 + x97; + const auto x99 = -x12 - x98; + const auto x100 = x18*x28; + const auto x101 = radial_eval*x99 + radial_eval_alpha*x100; + const auto x102 = z*z*z; + const auto x103 = 3.0*z; + const auto x104 = 2.0*x102 - x103*x4 - x103*x6; + const auto x105 = radial_eval_alpha*z; + const auto x106 = x*x22; + const auto x107 = x38*z; + const auto x108 = x54*x8; + const auto x109 = x*x9; + const auto x110 = x18*x42; + const auto x111 = 2.0*radial_eval_alpha; + const auto x112 = x111*x13; + const auto x113 = radial_eval_alpha + radial_eval_alpha_squared*x4; + const auto x114 = x113*x8; + const auto x115 = x112 + x114; + const auto x116 = x113*x13; + const auto x117 = 12.0*radial_eval_alpha; + const auto x118 = x117*x4; + const auto x119 = x118 + x47; + const auto x120 = x111*x50 + x113*x21; + const auto x121 = x6 + x81; + const auto x122 = x113*x36 + x121*x54 + 24.0*x57*x58; + const auto x123 = -18.0*radial_eval; + const auto x124 = x113*x28; + const auto x125 = x111*x60 + x124; + const auto x126 = -x4; + const auto x127 = x126 + x18; + const auto x128 = 8.0*x58; + const auto x129 = x113*x39 + x127*x54 + x128*x64; + const auto x130 = x111*x65; + const auto x131 = x113*x42 + x130; + const auto x132 = x108 + x113*x43 + x128*x70; + const auto x133 = radial_eval_alpha*x3; + const auto x134 = radial_eval_alpha*x5; + const auto x135 = 6.0*radial_eval_alpha; + const auto x136 = x135*x6; + const auto x137 = radial_eval_alpha*x65; + const auto x138 = -x12 - x77; + const auto x139 = 24.0*radial_eval; + const auto x140 = x*x139; + const auto x141 = x140*y; + const auto x142 = 12.0*x58; + const auto x143 = 12.0*x87; + const auto x144 = radial_eval_alpha_squared*x; + const auto x145 = x144*y; + const auto x146 = -x135*x4 + x52; + const auto x147 = radial_eval_alpha*x56; + const auto x148 = radial_eval_alpha*x85; + const auto x149 = x*x94; + const auto x150 = x135*x18; + const auto x151 = -x150; + const auto x152 = x*x96*(radial_eval_alpha*x99 + radial_eval_alpha_squared*x100 + x151 + x52); + const auto x153 = 96.0*radial_eval*z; + const auto x154 = 12.0*x105; + const auto x155 = x144*z; + const auto x156 = -x59 - x98; + const auto x157 = radial_eval_alpha*x17; + const auto x158 = x142*x8; + const auto x159 = 4.0*radial_eval_alpha; + const auto x160 = x157*x65; + const auto x161 = x69*z; + const auto x162 = x111*x42; + const auto x163 = radial_eval_alpha + radial_eval_alpha_squared*x6; + const auto x164 = x163*x8; + const auto x165 = x162 + x164; + const auto x166 = x13*x163 + x130; + const auto x167 = x111*x78 + x163*x21; + const auto x168 = x163*x28; + const auto x169 = x111*x82 + x168; + const auto x170 = x27 + x4; + const auto x171 = x163*x36 + x170*x54 + 24.0*x86*x87; + const auto x172 = x117*x6; + const auto x173 = x172 + x47; + const auto x174 = -x18 + x6; + const auto x175 = 8.0*x87; + const auto x176 = x163*x39 + x174*x54 + x175*x89; + const auto x177 = x126 + x6; + const auto x178 = x163*x43 + x175*x92 + x177*x54; + const auto x179 = -x12 - x80 - x97; + const auto x180 = radial_eval_alpha_squared*y; + const auto x181 = x180*z; + const auto x182 = x143*x8; + const auto x183 = radial_eval_alpha + 
radial_eval_alpha_squared*x18; + const auto x184 = x183*x8; + const auto x185 = x13*x183; + const auto x186 = x112 + x185; + const auto x187 = 24.0*radial_eval_alpha*x18; + const auto x188 = x183*x21 + x187; + const auto x189 = x111*x99 + x183*x28; + const auto x190 = x139 + x189; + const auto x191 = 2.0*x18 - x4 - x6; + const auto x192 = 48.0*radial_eval*x191 + 32.0*x104*x105 + x183*x36; + const auto x193 = x108 + 24.0*x157*x8 + x183*x39; + const auto x194 = x183*x42; + const auto x195 = x162 + x194; + const auto x196 = x183*x43; + const auto x197 = x118 + x166; + const auto x198 = x116 + x197; + const auto x199 = -x118; + const auto x200 = -x172; + const auto x201 = x163*x42; + const auto x202 = x131 + x200; + const auto x203 = x201 + x202; + const auto x204 = radial_eval_alpha_cubed*x55 + radial_eval_alpha_squared*x63; + const auto x205 = radial_eval_alpha_cubed*x6 + radial_eval_alpha_squared; + const auto x206 = x205*x8; + const auto x207 = radial_eval_alpha_cubed*x18 + radial_eval_alpha_squared; + const auto x208 = x207*x8; + const auto x209 = 2.0*radial_eval_alpha_squared; + const auto x210 = x209*x3; + const auto x211 = 36.0*x58; + const auto x212 = 18.0*x*x113; + const auto x213 = 6.0*x; + const auto x214 = x163*x213; + const auto x215 = x183*x213; + const auto x216 = 2.0*x144; + const auto x217 = x13*x205; + const auto x218 = x13*x207; + const auto x219 = x205*x21; + const auto x220 = x207*x21; + const auto x221 = 24.0*radial_eval_alpha_squared; + const auto x222 = x111*x138 + x187; + const auto x223 = x205*x28; + const auto x224 = x207*x28; + const auto x225 = x204*x28; + const auto x226 = 48.0*x58; + const auto x227 = x226*x6; + const auto x228 = 24.0*x145; + const auto x229 = x205*x36; + const auto x230 = x207*x36; + const auto x231 = 36.0*radial_eval_alpha; + const auto x232 = x111*x156; + const auto x233 = 12.0*radial_eval_alpha_squared; + const auto x234 = x233*x32; + const auto x235 = -x234; + const auto x236 = x200 + x235; + const auto x237 = 8.0*x145; + const auto x238 = 24.0*x17; + const auto x239 = x205*x39; + const auto x240 = x207*x39; + const auto x241 = x163*x65; + const auto x242 = x113*x65; + const auto x243 = x205*x42; + const auto x244 = x207*x42; + const auto x245 = x118 + x130 + x183*x65; + const auto x246 = x205*x43; + const auto x247 = x207*x43; + const auto x248 = radial_eval_alpha_cubed*x84 + radial_eval_alpha_squared*x88; + const auto x249 = radial_eval_alpha_cubed*x4 + radial_eval_alpha_squared; + const auto x250 = x249*x8; + const auto x251 = x209*x5; + const auto x252 = x13*x249; + const auto x253 = x21*x249; + const auto x254 = x248*x28; + const auto x255 = x249*x28; + const auto x256 = x111*x179 + x199; + const auto x257 = 48.0*x87; + const auto x258 = x257*x4; + const auto x259 = 36.0*x87; + const auto x260 = x249*x36; + const auto x261 = 2.0*x180; + const auto x262 = 6.0*y; + const auto x263 = -x113*x262 - 18.0*x163*y - x183*x262 - x259; + const auto x264 = x249*x39; + const auto x265 = x249*x42; + const auto x266 = x249*x43; + const auto x267 = x209*z; + const auto x268 = radial_eval_alpha_cubed*x102 + radial_eval_alpha_squared*x103; + const auto x269 = x17*x209; + const auto x270 = x269*x65; + const auto x271 = x233*x34; + const auto x272 = 12.0*z; + const auto x273 = 36.0*z; + const auto x274 = 48.0*radial_eval_alpha*x18 + x113*x99 + x163*x99 + x17*x223 + x17*x255 + 3.0*x183*x99 + x268*x28*z; + const auto x275 = 192.0*x105; + const auto x276 = -x233*x35; + const auto x277 = 48.0*x105; + const auto x278 = 8.0*x181; + const auto x279 = 8.0*x155; + 
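The x0..x279 temporaries above are the SymPy common-subexpression output emitted by perform_cse_and_cleanup in the generator script later in this patch; the evaluations that follow are simply the product rule applied to phi = A(x,y,z)*R(r), where A is the angular polynomial and R the contracted radial part. A minimal host-side sketch of that structure, with hypothetical helper names, independent of (and not part of) the generated kernel:

#include <cmath>
#include <cstddef>

// Radial contractions accumulated exactly as in the kernel's primitive loop:
//   val      =      sum_i c_i exp(-a_i r^2)
//   alpha    = -2 * sum_i a_i   c_i exp(-a_i r^2)   (so dR/dx = x * alpha)
//   alpha_sq =  4 * sum_i a_i^2 c_i exp(-a_i r^2)
//   alpha_cb = -8 * sum_i a_i^3 c_i exp(-a_i r^2)
struct RadialFactors { double val, alpha, alpha_sq, alpha_cb; };

RadialFactors contract_radial(const double* alpha, const double* coeff,
                              std::size_t nprim, double rsq) {
  RadialFactors r{0.0, 0.0, 0.0, 0.0};
  for (std::size_t i = 0; i < nprim; ++i) {
    const double e = coeff[i] * std::exp(-alpha[i] * rsq);
    r.val      += e;
    r.alpha    += alpha[i] * e;
    r.alpha_sq += alpha[i] * alpha[i] * e;
    r.alpha_cb += alpha[i] * alpha[i] * alpha[i] * e;
  }
  r.alpha *= -2.0; r.alpha_sq *= 4.0; r.alpha_cb *= -8.0;
  return r;
}

// Product rule for one basis function phi = A * R:
//   dphi/dx = A_x * R + A * x * R_alpha
// which is the pattern behind terms such as x1*(radial_eval_alpha*x45 + x44)
// below, once the CSE temporaries are expanded.
double dphi_dx(double A, double A_x, double x, const RadialFactors& r) {
  return A_x * r.val + A * x * r.alpha;
}

The same pattern extends to the second and third derivatives used for the Laplacian gradient (e.g. d2phi/dx2 = A_xx*R + 2*A_x*x*R_alpha + A*(R_alpha + x*x*R_alpha_sq)), which is exactly the bf_xx / bf_xxx construction in the generator script changes shown further down in this patch.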
+ + // Evaluate basis function + basis_eval[ipt + 0*npts] = radial_eval*x2*x8; + basis_eval[ipt + 1*npts] = radial_eval*x11*x13; + basis_eval[ipt + 2*npts] = radial_eval*x16*x21; + basis_eval[ipt + 3*npts] = radial_eval*x24*x28; + basis_eval[ipt + 4*npts] = x29*x36; + basis_eval[ipt + 5*npts] = radial_eval*x28*x37; + basis_eval[ipt + 6*npts] = radial_eval*x38*x39; + basis_eval[ipt + 7*npts] = radial_eval*x40*x42; + basis_eval[ipt + 8*npts] = sqrt_35*x29*x43; + + + + // Evaluate first derivative of bfn wrt x + basis_x_eval[ipt + 0*npts] = x1*(radial_eval_alpha*x45 + x44); + basis_x_eval[ipt + 1*npts] = x46*x49; + basis_x_eval[ipt + 2*npts] = x15*(radial_eval*x50 + radial_eval_alpha*x51); + basis_x_eval[ipt + 3*npts] = x53; + basis_x_eval[ipt + 4*npts] = 0.125*x36*x58 + 0.125*x54*x57; + basis_x_eval[ipt + 5*npts] = x23*(radial_eval*x60 + radial_eval_alpha*x61); + basis_x_eval[ipt + 6*npts] = x38*(x39*x58 + x62*x64); + basis_x_eval[ipt + 7*npts] = x10*x68; + basis_x_eval[ipt + 8*npts] = x69*(x43*x58 + x62*x70); + + // Evaluate first derivative of bfn wrt y + basis_y_eval[ipt + 0*npts] = x71*(radial_eval_alpha*x73 + x72); + basis_y_eval[ipt + 1*npts] = x10*x75; + basis_y_eval[ipt + 2*npts] = x76*(radial_eval*x78 + radial_eval_alpha*x79); + basis_y_eval[ipt + 3*npts] = x23*(radial_eval*x82 + radial_eval_alpha*x83); + basis_y_eval[ipt + 4*npts] = 0.125*x36*x87 + 0.125*x54*x86; + basis_y_eval[ipt + 5*npts] = x53; + basis_y_eval[ipt + 6*npts] = x38*(x39*x87 + x62*x89); + basis_y_eval[ipt + 7*npts] = x46*x91; + basis_y_eval[ipt + 8*npts] = x69*(x43*x87 + x62*x92); + + // Evaluate first derivative of bfn wrt z + basis_z_eval[ipt + 0*npts] = x58*x8*x93; + basis_z_eval[ipt + 1*npts] = x94*(radial_eval_alpha*x95 + x44); + basis_z_eval[ipt + 2*npts] = x16*z*(radial_eval_alpha*x21 + x54); + basis_z_eval[ipt + 3*npts] = x101*x96; + basis_z_eval[ipt + 4*npts] = 2.0*radial_eval*x104 + 0.125*x105*x36; + basis_z_eval[ipt + 5*npts] = x101*x106; + basis_z_eval[ipt + 6*npts] = x107*(radial_eval_alpha*x39 + x108); + basis_z_eval[ipt + 7*npts] = x109*(radial_eval_alpha*x110 + x72); + basis_z_eval[ipt + 8*npts] = x105*x43*x69; + + // Evaluate second derivative of bfn wrt xx + basis_xx_eval[ipt + 0*npts] = x2*(x115 + x47); + basis_xx_eval[ipt + 1*npts] = x11*(x116 + x119); + basis_xx_eval[ipt + 2*npts] = x16*(x120 + x52); + basis_xx_eval[ipt + 3*npts] = x24*(x113*x28 - x119); + basis_xx_eval[ipt + 4*npts] = 0.125*x122; + basis_xx_eval[ipt + 5*npts] = x37*(x123 + x125); + basis_xx_eval[ipt + 6*npts] = x129*x38; + basis_xx_eval[ipt + 7*npts] = x40*(x131 + x47); + basis_xx_eval[ipt + 8*npts] = x132*x69; + + // Evaluate second derivative of bfn wrt xy + basis_xy_eval[ipt + 0*npts] = x0*(radial_eval_alpha_squared*x4*x6*x8 + x13*x134 + x133*x42 + x66); + basis_xy_eval[ipt + 1*npts] = x40*(radial_eval_alpha_squared*x74 + x136 + x137 + x47); + basis_xy_eval[ipt + 2*npts] = x14*(radial_eval*x138 + radial_eval_alpha_squared*x21*x4*x6 + x133*x78 + x134*x50); + basis_xy_eval[ipt + 3*npts] = x37*(radial_eval_alpha*x82 + radial_eval_alpha_squared*x83 - x136 + x52); + basis_xy_eval[ipt + 4*npts] = 0.125*x141 + 0.125*x142*x86 + 0.125*x143*x57 + 0.125*x145*x36; + basis_xy_eval[ipt + 5*npts] = x24*(radial_eval_alpha*x60 + radial_eval_alpha_squared*x61 + x146); + basis_xy_eval[ipt + 6*npts] = x38*(x145*x39 + x147*x89 + x148*x64); + basis_xy_eval[ipt + 7*npts] = x11*(radial_eval_alpha_squared*x67 + x137 + x146); + basis_xy_eval[ipt + 8*npts] = x69*(-x141 + x145*x43 + x147*x92 + x148*x70); + + // Evaluate second derivative of bfn wrt xz 
+ basis_xz_eval[ipt + 0*npts] = x93*(radial_eval_alpha_squared*x45 + x48); + basis_xz_eval[ipt + 1*npts] = x149*(radial_eval_alpha_squared*x95 + x150 + x49); + basis_xz_eval[ipt + 2*npts] = x15*z*(radial_eval_alpha*x50 + radial_eval_alpha_squared*x51 + x118 + x54); + basis_xz_eval[ipt + 3*npts] = x152; + basis_xz_eval[ipt + 4*npts] = -0.125*x*x153 + 2.0*x104*x58 + 0.125*x154*x57 + 0.125*x155*x36; + basis_xz_eval[ipt + 5*npts] = x22*(radial_eval*x156 + radial_eval_alpha_squared*x18*x28*x4 + x133*x99 + x157*x60); + basis_xz_eval[ipt + 6*npts] = x107*(x140 + x144*x39 + x158 + x159*x64); + basis_xz_eval[ipt + 7*npts] = x9*(radial_eval_alpha_squared*x18*x4*x42 + x160 + x68); + basis_xz_eval[ipt + 8*npts] = x161*(x144*x43 + x159*x70); + + // Evaluate second derivative of bfn wrt yy + basis_yy_eval[ipt + 0*npts] = x2*(x165 + x52); + basis_yy_eval[ipt + 1*npts] = x11*(x166 + x52); + basis_yy_eval[ipt + 2*npts] = x16*(x167 + x52); + basis_yy_eval[ipt + 3*npts] = x24*(x123 + x169); + basis_yy_eval[ipt + 4*npts] = 0.125*x171; + basis_yy_eval[ipt + 5*npts] = x37*(x163*x28 - x173); + basis_yy_eval[ipt + 6*npts] = x176*x38; + basis_yy_eval[ipt + 7*npts] = x40*(x163*x42 - x173); + basis_yy_eval[ipt + 8*npts] = x178*x69; + + // Evaluate second derivative of bfn wrt yz + basis_yz_eval[ipt + 0*npts] = x71*z*(radial_eval_alpha_squared*x73 + x90); + basis_yz_eval[ipt + 1*npts] = x9*(radial_eval_alpha_squared*x13*x18*x6 + x160 + x75); + basis_yz_eval[ipt + 2*npts] = x76*z*(radial_eval_alpha*x78 + radial_eval_alpha_squared*x79 + x172 + x54); + basis_yz_eval[ipt + 3*npts] = x22*(radial_eval*x179 + radial_eval_alpha_squared*x18*x28*x6 + x134*x99 + x157*x82); + basis_yz_eval[ipt + 4*npts] = 2.0*x104*x87 - 0.125*x153*y + 0.125*x154*x86 + 0.125*x181*x36; + basis_yz_eval[ipt + 5*npts] = x152; + basis_yz_eval[ipt + 6*npts] = x107*(-x139*y + x159*x89 + x180*x39 + x182); + basis_yz_eval[ipt + 7*npts] = x149*(radial_eval_alpha_squared*x110 + x151 + x91); + basis_yz_eval[ipt + 8*npts] = x161*(x159*x92 + x180*x43); + + // Evaluate second derivative of bfn wrt zz + basis_zz_eval[ipt + 0*npts] = x184*x2; + basis_zz_eval[ipt + 1*npts] = x11*x186; + basis_zz_eval[ipt + 2*npts] = x16*(x188 + x54); + basis_zz_eval[ipt + 3*npts] = x190*x24; + basis_zz_eval[ipt + 4*npts] = 0.125*x192; + basis_zz_eval[ipt + 5*npts] = x190*x37; + basis_zz_eval[ipt + 6*npts] = x193*x38; + basis_zz_eval[ipt + 7*npts] = x195*x40; + basis_zz_eval[ipt + 8*npts] = x196*x69; + + // Evaluate Laplacian of bfn + basis_lapl_eval[ipt + 0*npts] = x2*(x115 + x165 + x184); + basis_lapl_eval[ipt + 1*npts] = x11*(x186 + x198); + basis_lapl_eval[ipt + 2*npts] = x16*(x120 + x167 + x188); + basis_lapl_eval[ipt + 3*npts] = x24*(x124 + x169 + x189 + x199); + basis_lapl_eval[ipt + 4*npts] = 0.125*x122 + 0.125*x171 + 0.125*x192; + basis_lapl_eval[ipt + 5*npts] = x37*(x125 + x168 + x189 + x200); + basis_lapl_eval[ipt + 6*npts] = x38*(x129 + x176 + x193); + basis_lapl_eval[ipt + 7*npts] = x40*(x195 + x203); + basis_lapl_eval[ipt + 8*npts] = x69*(x132 + x178 + x196); + + // Evaluate Laplacian gradient of bfn (dx) + basis_lapl_x_eval[ipt + 0*npts] = x1*(x*x204*x8 + 3.0*x116 + x185 + x197 + x206*x3 + x208*x3 + x210*x42); + basis_lapl_x_eval[ipt + 1*npts] = x11*(x*x217 + x*x218 + x13*x204 + x13*x216 + x211 + x212 + x214 + x215 + x216*x65); + basis_lapl_x_eval[ipt + 2*npts] = x15*(x*x204*x21 + 3.0*x113*x50 + x163*x50 + x183*x50 + x199 + x210*x78 + x219*x3 + x220*x3 + x221*x34 + x222); + basis_lapl_x_eval[ipt + 3*npts] = x24*(x*x223 + x*x224 - x211 - x212 - x214 - x215 + x216*x82 
+ x216*x99 + x225); + basis_lapl_x_eval[ipt + 4*npts] = 0.125*x*x229 + 0.125*x*x230 + 4.0*x104*x155 + 4.5*x113*x57 + 0.125*x121*x211 + 0.125*x142*x170 + 1.5*x163*x57 - 24.0*x18*x58 + 1.5*x183*x57 + 0.125*x191*x226 + 0.125*x204*x36 + 0.125*x227 + 0.125*x228*x86; + basis_lapl_x_eval[ipt + 5*npts] = x23*(x*x225 + 3.0*x113*x60 + x163*x60 + x183*x60 + x210*x99 + x223*x3 + x224*x3 - x231*x4 + x232 + x236); + basis_lapl_x_eval[ipt + 6*npts] = x38*(x*x239 + x*x240 + 12.0*x113*x64 + x127*x211 + x142*x174 + x144*x238*x8 + x158 + 4.0*x163*x64 + x18*x226 + 4.0*x183*x64 + x204*x39 + x237*x89); + basis_lapl_x_eval[ipt + 7*npts] = x10*(x*x204*x42 + x209*x67 + x236 + x241 + 3.0*x242 + x243*x3 + x244*x3 + x245); + basis_lapl_x_eval[ipt + 8*npts] = x69*(x*x246 + x*x247 + 12.0*x113*x70 + x142*x177 + 4.0*x163*x70 + 4.0*x183*x70 + x204*x43 + x211*x8 - x227 + x237*x92); + // Evaluate Laplacian gradient of bfn (dy) + basis_lapl_y_eval[ipt + 0*npts] = x71*(x13*x251 + x194 + 3.0*x201 + x202 + x208*x5 + x248*x8*y + x250*x5); + basis_lapl_y_eval[ipt + 1*npts] = x10*(x13*x248*y + x200 + x209*x74 + x218*x5 + x234 + 3.0*x241 + x242 + x245 + x252*x5); + basis_lapl_y_eval[ipt + 2*npts] = x76*(x113*x78 + 3.0*x163*x78 + x183*x78 + x200 + x21*x248*y + x220*x5 + x221*x35 + x222 + x251*x50 + x253*x5); + basis_lapl_y_eval[ipt + 3*npts] = x23*(x113*x82 + 3.0*x163*x82 + x183*x82 + x224*x5 - x231*x6 + x235 + x251*x99 + x254*y + x255*x5 + x256); + basis_lapl_y_eval[ipt + 4*npts] = 4.0*x104*x181 + 1.5*x113*x86 + 0.125*x121*x143 + 4.5*x163*x86 + 0.125*x170*x259 - 24.0*x18*x87 + 1.5*x183*x86 + 0.125*x191*x257 + 0.125*x228*x57 + 0.125*x230*y + 0.125*x248*x36 + 0.125*x258 + 0.125*x260*y; + basis_lapl_y_eval[ipt + 5*npts] = x37*(x224*y + x254 + x255*y + x261*x60 + x261*x99 + x263); + basis_lapl_y_eval[ipt + 6*npts] = x38*(4.0*x113*x89 + x127*x143 + 12.0*x163*x89 + x174*x259 - x18*x257 + x180*x238*x8 + x182 + 4.0*x183*x89 + x237*x64 + x240*y + x248*x39 + x264*y); + basis_lapl_y_eval[ipt + 7*npts] = x40*(x244*y + x248*x42 + x261*x42 + x261*x65 + x263 + x265*y); + basis_lapl_y_eval[ipt + 8*npts] = x69*(4.0*x113*x92 + 12.0*x163*x92 + x177*x259 + x182 + 4.0*x183*x92 + x237*x70 + x247*y + x248*x43 - x258 + x266*y); + // Evaluate Laplacian gradient of bfn (dz) + basis_lapl_z_eval[ipt + 0*npts] = x2*(x13*x267 + x206*z + x250*z + x267*x42 + x268*x8); + basis_lapl_z_eval[ipt + 1*npts] = x94*(x13*x268*z + x17*x217 + x17*x252 + 3.0*x185 + x198 + x270 + x271); + basis_lapl_z_eval[ipt + 2*npts] = x16*(72.0*x105 + x113*x272 + x163*x272 + x183*x273 + x21*x268 + x219*z + x253*z + x267*x50 + x267*x78); + basis_lapl_z_eval[ipt + 3*npts] = x96*(x256 + x269*x82 - x271 + x274); + basis_lapl_z_eval[ipt + 4*npts] = 2.0*x104*x113 + 2.0*x104*x163 + 6.0*x104*x183 + 18.0*x105*x191 + 0.125*x121*x154 + 0.125*x154*x170 + 3.0*x155*x57 + 3.0*x181*x86 + 0.125*x229*z + 0.125*x260*z + 0.125*x268*x36 - 0.125*x275*x4 - 0.125*x275*x6; + basis_lapl_z_eval[ipt + 5*npts] = x106*(x200 + x232 + x269*x60 + x274 + x276); + basis_lapl_z_eval[ipt + 6*npts] = x38*(36.0*x105*x8 + x114*x272 + x127*x154 + x154*x174 + x164*x272 + x184*x273 + x239*z + x264*z + x268*x39 + x277*x4 - x277*x6 + x278*x89 + x279*x64); + basis_lapl_z_eval[ipt + 7*npts] = x109*(x17*x243 + x17*x265 + 3.0*x194 + x203 + x268*x42*z + x270 + x276); + basis_lapl_z_eval[ipt + 8*npts] = x69*(x154*x177 + x154*x8 + x246*z + x266*z + x268*x43 + x278*x92 + x279*x70); + + + + +#if 0 + // Evaluate the angular part of bfn + + + + double ang_eval_0; + double ang_eval_1; + double ang_eval_2; + double ang_eval_3; + + + ang_eval_0 = 
radial_eval*x2*x8; + ang_eval_1 = radial_eval*x11*x13; + ang_eval_2 = radial_eval*x16*x21; + ang_eval_3 = radial_eval*x24*x28; + basis_eval[ipt + 0*npts] = ang_eval_0; + basis_eval[ipt + 1*npts] = ang_eval_1; + basis_eval[ipt + 2*npts] = ang_eval_2; + basis_eval[ipt + 3*npts] = ang_eval_3; + + ang_eval_0 = x29*x36; + ang_eval_1 = radial_eval*x28*x37; + ang_eval_2 = radial_eval*x38*x39; + ang_eval_3 = radial_eval*x40*x42; + basis_eval[ipt + 4*npts] = ang_eval_0; + basis_eval[ipt + 5*npts] = ang_eval_1; + basis_eval[ipt + 6*npts] = ang_eval_2; + basis_eval[ipt + 7*npts] = ang_eval_3; + + ang_eval_0 = sqrt_35*x29*x43; + basis_eval[ipt + 8*npts] = ang_eval_0; + + + double dang_eval_x_0, dang_eval_y_0, dang_eval_z_0; + double dang_eval_x_1, dang_eval_y_1, dang_eval_z_1; + double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; + double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; + + dang_eval_x_0 = x1*(radial_eval_alpha*x45 + x44); + dang_eval_y_0 = x71*(radial_eval_alpha*x73 + x72); + dang_eval_z_0 = x58*x8*x93; + dang_eval_x_1 = x46*x49; + dang_eval_y_1 = x10*x75; + dang_eval_z_1 = x94*(radial_eval_alpha*x95 + x44); + dang_eval_x_2 = x15*(radial_eval*x50 + radial_eval_alpha*x51); + dang_eval_y_2 = x76*(radial_eval*x78 + radial_eval_alpha*x79); + dang_eval_z_2 = x16*z*(radial_eval_alpha*x21 + x54); + dang_eval_x_3 = x53; + dang_eval_y_3 = x23*(radial_eval*x82 + radial_eval_alpha*x83); + dang_eval_z_3 = x101*x96; + basis_x_eval[ipt + 0*npts] = dang_eval_x_0; + basis_y_eval[ipt + 0*npts] = dang_eval_y_0; + basis_z_eval[ipt + 0*npts] = dang_eval_z_0; + basis_x_eval[ipt + 1*npts] = dang_eval_x_1; + basis_y_eval[ipt + 1*npts] = dang_eval_y_1; + basis_z_eval[ipt + 1*npts] = dang_eval_z_1; + basis_x_eval[ipt + 2*npts] = dang_eval_x_2; + basis_y_eval[ipt + 2*npts] = dang_eval_y_2; + basis_z_eval[ipt + 2*npts] = dang_eval_z_2; + basis_x_eval[ipt + 3*npts] = dang_eval_x_3; + basis_y_eval[ipt + 3*npts] = dang_eval_y_3; + basis_z_eval[ipt + 3*npts] = dang_eval_z_3; + + dang_eval_x_0 = 0.125*x36*x58 + 0.125*x54*x57; + dang_eval_y_0 = 0.125*x36*x87 + 0.125*x54*x86; + dang_eval_z_0 = 2.0*radial_eval*x104 + 0.125*x105*x36; + dang_eval_x_1 = x23*(radial_eval*x60 + radial_eval_alpha*x61); + dang_eval_y_1 = x53; + dang_eval_z_1 = x101*x106; + dang_eval_x_2 = x38*(x39*x58 + x62*x64); + dang_eval_y_2 = x38*(x39*x87 + x62*x89); + dang_eval_z_2 = x107*(radial_eval_alpha*x39 + x108); + dang_eval_x_3 = x10*x68; + dang_eval_y_3 = x46*x91; + dang_eval_z_3 = x109*(radial_eval_alpha*x110 + x72); + basis_x_eval[ipt + 4*npts] = dang_eval_x_0; + basis_y_eval[ipt + 4*npts] = dang_eval_y_0; + basis_z_eval[ipt + 4*npts] = dang_eval_z_0; + basis_x_eval[ipt + 5*npts] = dang_eval_x_1; + basis_y_eval[ipt + 5*npts] = dang_eval_y_1; + basis_z_eval[ipt + 5*npts] = dang_eval_z_1; + basis_x_eval[ipt + 6*npts] = dang_eval_x_2; + basis_y_eval[ipt + 6*npts] = dang_eval_y_2; + basis_z_eval[ipt + 6*npts] = dang_eval_z_2; + basis_x_eval[ipt + 7*npts] = dang_eval_x_3; + basis_y_eval[ipt + 7*npts] = dang_eval_y_3; + basis_z_eval[ipt + 7*npts] = dang_eval_z_3; + + dang_eval_x_0 = x69*(x43*x58 + x62*x70); + dang_eval_y_0 = x69*(x43*x87 + x62*x92); + dang_eval_z_0 = x105*x43*x69; + basis_x_eval[ipt + 8*npts] = dang_eval_x_0; + basis_y_eval[ipt + 8*npts] = dang_eval_y_0; + basis_z_eval[ipt + 8*npts] = dang_eval_z_0; + +#endif + } // Loop over points within task + } // Loop over tasks + + } // Loop over shells +} // end kernel + +} // namespace GauXC diff --git 
a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_laplacian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_laplacian.hpp index 6f129915..f5b3c77c 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_laplacian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_laplacian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_spherical_laplacian_4( +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_spherical_laplacian_4( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; // Loop over points in task @@ -103,64 +106,243 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = 0.5*sqrt_35; + const auto x1 = x0*y; + const auto x2 = x*x1; + const auto x3 = x*x; + const auto x4 = x3; + const auto x5 = y*y; + const auto x6 = x5; + const auto x7 = -x6; + const auto x8 = x4 + x7; + const auto x9 = 0.25*sqrt_70; + const auto x10 = x9*z; + const auto x11 = x10*y; + const auto x12 = 3.0*x4; + const auto x13 = x12 + x7; + const auto x14 = 0.5*sqrt_5; + const auto x15 = x14*y; + const auto x16 = x*x15; + const auto x17 = z*z; + const auto x18 = x17; + const auto x19 = -6.0*x18; + const auto x20 = x19 + x6; + const auto x21 = -x20 - x4; + const auto x22 = 0.25*sqrt_10; + const auto x23 = x22*z; + const auto x24 = x23*y; + const auto x25 = -4.0*x18; + const auto x26 = 3.0*x6; + const auto x27 = x25 + x26; + const auto x28 = -x12 - x27; + const auto x29 = 0.125*radial_eval; + const auto x30 = x*x*x*x; + const auto x31 = y*y*y*y; + const auto x32 = 6.0*x4*x6; + const auto x33 = x18*x4; + const auto x34 = x18*x6; + const auto x35 = 3.0*x30 + 3.0*x31 + x32 - 24.0*x33 - 24.0*x34 + 8.0*(z*z*z*z); + const auto x36 = x*x23; + const auto x37 = 0.25*sqrt_5; + const auto x38 = -x30 + x31 + 6.0*x33 - 6.0*x34; + const auto x39 = x*x10; + const auto x40 = -x26; + const auto x41 = x4 + x40; + const auto x42 = x30 + x31 - x32; + const auto x43 = radial_eval*x13; + const auto x44 = x4*x8; + const auto x45 = x*x11; 
+ const auto x46 = 6.0*radial_eval; + const auto x47 = radial_eval_alpha*x13; + const auto x48 = x46 + x47; + const auto x49 = -x12 - x20; + const auto x50 = x21*x4; + const auto x51 = -x46; + const auto x52 = x*x24*(radial_eval_alpha*x28 + x51); + const auto x53 = 12.0*radial_eval; + const auto x54 = x*x*x; + const auto x55 = 4.0*x; + const auto x56 = x*x6 - x18*x55 + x54; + const auto x57 = radial_eval_alpha*x; + const auto x58 = 9.0*x4; + const auto x59 = -x27 - x58; + const auto x60 = x28*x4; + const auto x61 = 4.0*radial_eval; + const auto x62 = 3.0*x; + const auto x63 = x18*x62 - x54; + const auto x64 = x12 + x40; + const auto x65 = radial_eval*x64; + const auto x66 = x4*x41; + const auto x67 = radial_eval_alpha*x66 + x65; + const auto x68 = 0.125*sqrt_35; + const auto x69 = x54 - x6*x62; + const auto x70 = x*x0; + const auto x71 = radial_eval*x41; + const auto x72 = x6*x8; + const auto x73 = x13*x6; + const auto x74 = radial_eval_alpha*x73 + x65; + const auto x75 = x*x14; + const auto x76 = x19 + x26; + const auto x77 = -x4 - x76; + const auto x78 = x21*x6; + const auto x79 = 9.0*x6; + const auto x80 = x12 + x25; + const auto x81 = -x79 - x80; + const auto x82 = x28*x6; + const auto x83 = y*y*y; + const auto x84 = 4.0*y; + const auto x85 = -x18*x84 + x4*y + x83; + const auto x86 = radial_eval_alpha*y; + const auto x87 = 3.0*y; + const auto x88 = -x18*x87 + x83; + const auto x89 = radial_eval_alpha*x41; + const auto x90 = x51 + x89; + const auto x91 = -x4*x87 + x83; + const auto x92 = x1*z; + const auto x93 = x9*y; + const auto x94 = x13*x18; + const auto x95 = x22*y; + const auto x96 = -12.0*x18; + const auto x97 = x26 + x96; + const auto x98 = -x12 - x97; + const auto x99 = x18*x28; + const auto x100 = radial_eval*x98 + radial_eval_alpha*x99; + const auto x101 = 3.0*z; + const auto x102 = -x101*x4 - x101*x6 + 2.0*(z*z*z); + const auto x103 = radial_eval_alpha*z; + const auto x104 = x37*z; + const auto x105 = x53*x8; + const auto x106 = x18*x41; + const auto x107 = 2.0*radial_eval_alpha; + const auto x108 = x107*x13; + const auto x109 = radial_eval_alpha + radial_eval_alpha_squared*x4; + const auto x110 = x108 + x109*x8; + const auto x111 = x109*x13; + const auto x112 = 12.0*radial_eval_alpha; + const auto x113 = x112*x4; + const auto x114 = x113 + x46; + const auto x115 = x107*x49 + x109*x21; + const auto x116 = x109*x35 + x53*(x6 + x80) + 24.0*x56*x57; + const auto x117 = -18.0*radial_eval; + const auto x118 = x109*x28; + const auto x119 = x107*x59 + x118; + const auto x120 = -x4; + const auto x121 = 8.0*x57; + const auto x122 = x109*x38 + x121*x63 + x53*(x120 + x18); + const auto x123 = x107*x64; + const auto x124 = x109*x41 + x123; + const auto x125 = x105 + x109*x42 + x121*x69; + const auto x126 = radial_eval_alpha*x3; + const auto x127 = radial_eval_alpha*x5; + const auto x128 = 6.0*radial_eval_alpha; + const auto x129 = x128*x6; + const auto x130 = radial_eval_alpha*x64; + const auto x131 = 24.0*radial_eval; + const auto x132 = x*x131; + const auto x133 = x132*y; + const auto x134 = 12.0*x57; + const auto x135 = 12.0*x86; + const auto x136 = radial_eval_alpha_squared*x; + const auto x137 = x136*y; + const auto x138 = -x128*x4 + x51; + const auto x139 = radial_eval_alpha*x55; + const auto x140 = radial_eval_alpha*x84; + const auto x141 = x*x93; + const auto x142 = x128*x18; + const auto x143 = -x142; + const auto x144 = x*x95*(radial_eval_alpha*x98 + radial_eval_alpha_squared*x99 + x143 + x51); + const auto x145 = 96.0*radial_eval*z; + const auto x146 = 12.0*x103; + const auto 
x147 = radial_eval_alpha*x17; + const auto x148 = 4.0*radial_eval_alpha; + const auto x149 = x147*x64; + const auto x150 = x68*z; + const auto x151 = x107*x41; + const auto x152 = radial_eval_alpha + radial_eval_alpha_squared*x6; + const auto x153 = x151 + x152*x8; + const auto x154 = x123 + x13*x152; + const auto x155 = x107*x77 + x152*x21; + const auto x156 = x152*x28; + const auto x157 = x107*x81 + x156; + const auto x158 = x152*x35 + x53*(x27 + x4) + 24.0*x85*x86; + const auto x159 = x112*x6; + const auto x160 = x159 + x46; + const auto x161 = 8.0*x86; + const auto x162 = x152*x38 + x161*x88 - x53*(x18 - x6); + const auto x163 = x152*x42 + x161*x91 + x53*(x120 + x6); + const auto x164 = radial_eval_alpha_squared*y; + const auto x165 = radial_eval_alpha + radial_eval_alpha_squared*x18; + const auto x166 = x165*x8; + const auto x167 = x108 + x13*x165; + const auto x168 = 24.0*radial_eval_alpha*x18 + x165*x21; + const auto x169 = x107*x98 + x165*x28; + const auto x170 = x131 + x169; + const auto x171 = -48.0*radial_eval*(-2.0*x18 + x4 + x6) + 32.0*x102*x103 + x165*x35; + const auto x172 = x105 + 24.0*x147*x8 + x165*x38; + const auto x173 = x151 + x165*x41; + const auto x174 = x165*x42; + const auto x175 = -x159; + // Evaluate basis function - basis_eval[ipt + 0*npts] = sqrt_35*radial_eval*x*y*(x*x - y*y)/2; - basis_eval[ipt + 1*npts] = sqrt_70*radial_eval*y*z*(3*x*x - y*y)/4; - basis_eval[ipt + 2*npts] = sqrt_5*radial_eval*x*y*(-x*x - y*y + 6*z*z)/2; - basis_eval[ipt + 3*npts] = sqrt_10*radial_eval*y*z*(-3*x*x - 3*y*y + 4*z*z)/4; - basis_eval[ipt + 4*npts] = radial_eval*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; - basis_eval[ipt + 5*npts] = sqrt_10*radial_eval*x*z*(-3*x*x - 3*y*y + 4*z*z)/4; - basis_eval[ipt + 6*npts] = sqrt_5*radial_eval*(-x*x*x*x + 6*x*x*z*z + y*y*y*y - 6*y*y*z*z)/4; - basis_eval[ipt + 7*npts] = sqrt_70*radial_eval*x*z*(x*x - 3*y*y)/4; - basis_eval[ipt + 8*npts] = sqrt_35*radial_eval*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + basis_eval[ipt + 0*npts] = radial_eval*x2*x8; + basis_eval[ipt + 1*npts] = radial_eval*x11*x13; + basis_eval[ipt + 2*npts] = radial_eval*x16*x21; + basis_eval[ipt + 3*npts] = radial_eval*x24*x28; + basis_eval[ipt + 4*npts] = x29*x35; + basis_eval[ipt + 5*npts] = radial_eval*x28*x36; + basis_eval[ipt + 6*npts] = radial_eval*x37*x38; + basis_eval[ipt + 7*npts] = radial_eval*x39*x41; + basis_eval[ipt + 8*npts] = sqrt_35*x29*x42; // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = sqrt_35*y*(radial_eval*(3*x*x - y*y) + radial_eval_alpha*x*x*(x*x - y*y))/2; - basis_x_eval[ipt + 1*npts] = sqrt_70*x*y*z*(6*radial_eval + radial_eval_alpha*(3*x*x - y*y))/4; - basis_x_eval[ipt + 2*npts] = sqrt_5*y*(-radial_eval*(3*x*x + y*y - 6*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 6*z*z))/2; - basis_x_eval[ipt + 3*npts] = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; - basis_x_eval[ipt + 4*npts] = x*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; - basis_x_eval[ipt + 5*npts] = sqrt_10*z*(-radial_eval*(9*x*x + 3*y*y - 4*z*z) - radial_eval_alpha*x*x*(3*x*x + 3*y*y - 4*z*z))/4; - basis_x_eval[ipt + 6*npts] = sqrt_5*x*(-4*radial_eval*(x*x - 3*z*z) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - basis_x_eval[ipt + 7*npts] = sqrt_70*z*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y))/4; - basis_x_eval[ipt + 8*npts] = sqrt_35*x*(4*radial_eval*(x*x - 3*y*y) + 
radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + basis_x_eval[ipt + 0*npts] = x1*(radial_eval_alpha*x44 + x43); + basis_x_eval[ipt + 1*npts] = x45*x48; + basis_x_eval[ipt + 2*npts] = x15*(radial_eval*x49 + radial_eval_alpha*x50); + basis_x_eval[ipt + 3*npts] = x52; + basis_x_eval[ipt + 4*npts] = 0.125*x35*x57 + 0.125*x53*x56; + basis_x_eval[ipt + 5*npts] = x23*(radial_eval*x59 + radial_eval_alpha*x60); + basis_x_eval[ipt + 6*npts] = x37*(x38*x57 + x61*x63); + basis_x_eval[ipt + 7*npts] = x10*x67; + basis_x_eval[ipt + 8*npts] = x68*(x42*x57 + x61*x69); // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = sqrt_35*x*(-radial_eval*(-x*x + 3*y*y) + radial_eval_alpha*y*y*(x*x - y*y))/2; - basis_y_eval[ipt + 1*npts] = sqrt_70*z*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y))/4; - basis_y_eval[ipt + 2*npts] = sqrt_5*x*(-radial_eval*(x*x + 3*y*y - 6*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 6*z*z))/2; - basis_y_eval[ipt + 3*npts] = sqrt_10*z*(-radial_eval*(3*x*x + 9*y*y - 4*z*z) - radial_eval_alpha*y*y*(3*x*x + 3*y*y - 4*z*z))/4; - basis_y_eval[ipt + 4*npts] = y*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; - basis_y_eval[ipt + 5*npts] = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; - basis_y_eval[ipt + 6*npts] = sqrt_5*y*(4*radial_eval*(y*y - 3*z*z) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - basis_y_eval[ipt + 7*npts] = sqrt_70*x*y*z*(-6*radial_eval + radial_eval_alpha*(x*x - 3*y*y))/4; - basis_y_eval[ipt + 8*npts] = sqrt_35*y*(-4*radial_eval*(3*x*x - y*y) + radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + basis_y_eval[ipt + 0*npts] = x70*(radial_eval_alpha*x72 + x71); + basis_y_eval[ipt + 1*npts] = x10*x74; + basis_y_eval[ipt + 2*npts] = x75*(radial_eval*x77 + radial_eval_alpha*x78); + basis_y_eval[ipt + 3*npts] = x23*(radial_eval*x81 + radial_eval_alpha*x82); + basis_y_eval[ipt + 4*npts] = 0.125*x35*x86 + 0.125*x53*x85; + basis_y_eval[ipt + 5*npts] = x52; + basis_y_eval[ipt + 6*npts] = x37*(x38*x86 + x61*x88); + basis_y_eval[ipt + 7*npts] = x45*x90; + basis_y_eval[ipt + 8*npts] = x68*(x42*x86 + x61*x91); // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = sqrt_35*radial_eval_alpha*x*y*z*(x*x - y*y)/2; - basis_z_eval[ipt + 1*npts] = sqrt_70*y*(radial_eval + radial_eval_alpha*z*z)*(3*x*x - y*y)/4; - basis_z_eval[ipt + 2*npts] = sqrt_5*x*y*z*(12*radial_eval - radial_eval_alpha*(x*x + y*y - 6*z*z))/2; - basis_z_eval[ipt + 3*npts] = sqrt_10*y*(3*radial_eval*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; - basis_z_eval[ipt + 4*npts] = z*(-16*radial_eval*(3*x*x + 3*y*y - 2*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; - basis_z_eval[ipt + 5*npts] = sqrt_10*x*(3*radial_eval*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; - basis_z_eval[ipt + 6*npts] = sqrt_5*z*(12*radial_eval*(x*x - y*y) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - basis_z_eval[ipt + 7*npts] = sqrt_70*x*(radial_eval + radial_eval_alpha*z*z)*(x*x - 3*y*y)/4; - basis_z_eval[ipt + 8*npts] = sqrt_35*radial_eval_alpha*z*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + basis_z_eval[ipt + 0*npts] = x57*x8*x92; + basis_z_eval[ipt + 1*npts] = x93*(radial_eval_alpha*x94 + x43); + basis_z_eval[ipt + 2*npts] = x16*z*(radial_eval_alpha*x21 + x53); + basis_z_eval[ipt + 3*npts] = x100*x95; + basis_z_eval[ipt + 
4*npts] = 2.0*radial_eval*x102 + 0.125*x103*x35; + basis_z_eval[ipt + 5*npts] = x*x100*x22; + basis_z_eval[ipt + 6*npts] = x104*(radial_eval_alpha*x38 + x105); + basis_z_eval[ipt + 7*npts] = x*x9*(radial_eval_alpha*x106 + x71); + basis_z_eval[ipt + 8*npts] = x103*x42*x68; + // Evaluate Laplacian of bfn - basis_lapl_eval[ipt + 0*npts] = sqrt_35*x*y*(11*radial_eval_alpha*x*x - 11*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*x*x + radial_eval_alpha_squared*x*x*z*z - radial_eval_alpha_squared*y*y*y*y - radial_eval_alpha_squared*y*y*z*z)/2; - basis_lapl_eval[ipt + 1*npts] = sqrt_70*y*z*(33*radial_eval_alpha*x*x - 11*radial_eval_alpha*y*y + 3*radial_eval_alpha_squared*x*x*x*x + 2*radial_eval_alpha_squared*x*x*y*y + 3*radial_eval_alpha_squared*x*x*z*z - radial_eval_alpha_squared*y*y*y*y - radial_eval_alpha_squared*y*y*z*z)/4; - basis_lapl_eval[ipt + 2*npts] = sqrt_5*x*y*(-11*radial_eval_alpha*x*x - 11*radial_eval_alpha*y*y + 66*radial_eval_alpha*z*z - radial_eval_alpha_squared*x*x*x*x - 2*radial_eval_alpha_squared*x*x*y*y + 5*radial_eval_alpha_squared*x*x*z*z - radial_eval_alpha_squared*y*y*y*y + 5*radial_eval_alpha_squared*y*y*z*z + 6*radial_eval_alpha_squared*z*z*z*z)/2; - basis_lapl_eval[ipt + 3*npts] = sqrt_10*y*z*(-33*radial_eval_alpha*x*x - 33*radial_eval_alpha*y*y + 44*radial_eval_alpha*z*z - 3*radial_eval_alpha_squared*x*x*x*x - 6*radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*x*x*z*z - 3*radial_eval_alpha_squared*y*y*y*y + radial_eval_alpha_squared*y*y*z*z + 4*radial_eval_alpha_squared*z*z*z*z)/4; - basis_lapl_eval[ipt + 4*npts] = 33*radial_eval_alpha*x*x*x*x/8 + 33*radial_eval_alpha*x*x*y*y/4 - 33*radial_eval_alpha*x*x*z*z + 33*radial_eval_alpha*y*y*y*y/8 - 33*radial_eval_alpha*y*y*z*z + 11*radial_eval_alpha*z*z*z*z + 3*radial_eval_alpha_squared*x*x*x*x*x*x/8 + 9*radial_eval_alpha_squared*x*x*x*x*y*y/8 - 21*radial_eval_alpha_squared*x*x*x*x*z*z/8 + 9*radial_eval_alpha_squared*x*x*y*y*y*y/8 - 21*radial_eval_alpha_squared*x*x*y*y*z*z/4 - 2*radial_eval_alpha_squared*x*x*z*z*z*z + 3*radial_eval_alpha_squared*y*y*y*y*y*y/8 - 21*radial_eval_alpha_squared*y*y*y*y*z*z/8 - 2*radial_eval_alpha_squared*y*y*z*z*z*z + radial_eval_alpha_squared*z*z*z*z*z*z; - basis_lapl_eval[ipt + 5*npts] = sqrt_10*x*z*(-33*radial_eval_alpha*x*x - 33*radial_eval_alpha*y*y + 44*radial_eval_alpha*z*z - 3*radial_eval_alpha_squared*x*x*x*x - 6*radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*x*x*z*z - 3*radial_eval_alpha_squared*y*y*y*y + radial_eval_alpha_squared*y*y*z*z + 4*radial_eval_alpha_squared*z*z*z*z)/4; - basis_lapl_eval[ipt + 6*npts] = sqrt_5*(-11*radial_eval_alpha*x*x*x*x + 66*radial_eval_alpha*x*x*z*z + 11*radial_eval_alpha*y*y*y*y - 66*radial_eval_alpha*y*y*z*z - radial_eval_alpha_squared*x*x*x*x*x*x - radial_eval_alpha_squared*x*x*x*x*y*y + 5*radial_eval_alpha_squared*x*x*x*x*z*z + radial_eval_alpha_squared*x*x*y*y*y*y + 6*radial_eval_alpha_squared*x*x*z*z*z*z + radial_eval_alpha_squared*y*y*y*y*y*y - 5*radial_eval_alpha_squared*y*y*y*y*z*z - 6*radial_eval_alpha_squared*y*y*z*z*z*z)/4; - basis_lapl_eval[ipt + 7*npts] = sqrt_70*x*z*(11*radial_eval_alpha*x*x - 33*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*x*x - 2*radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*x*x*z*z - 3*radial_eval_alpha_squared*y*y*y*y - 3*radial_eval_alpha_squared*y*y*z*z)/4; - basis_lapl_eval[ipt + 8*npts] = sqrt_35*(11*radial_eval_alpha*x*x*x*x - 66*radial_eval_alpha*x*x*y*y + 11*radial_eval_alpha*y*y*y*y + radial_eval_alpha_squared*x*x*x*x*x*x - 
5*radial_eval_alpha_squared*x*x*x*x*y*y + radial_eval_alpha_squared*x*x*x*x*z*z - 5*radial_eval_alpha_squared*x*x*y*y*y*y - 6*radial_eval_alpha_squared*x*x*y*y*z*z + radial_eval_alpha_squared*y*y*y*y*y*y + radial_eval_alpha_squared*y*y*y*y*z*z)/8; + basis_lapl_eval[ipt + 0*npts] = x2*(x110 + x153 + x166); + basis_lapl_eval[ipt + 1*npts] = x11*(x111 + x113 + x154 + x167); + basis_lapl_eval[ipt + 2*npts] = x16*(x115 + x155 + x168); + basis_lapl_eval[ipt + 3*npts] = x24*(-x113 + x118 + x157 + x169); + basis_lapl_eval[ipt + 4*npts] = 0.125*x116 + 0.125*x158 + 0.125*x171; + basis_lapl_eval[ipt + 5*npts] = x36*(x119 + x156 + x169 + x175); + basis_lapl_eval[ipt + 6*npts] = x37*(x122 + x162 + x172); + basis_lapl_eval[ipt + 7*npts] = x39*(x124 + x152*x41 + x173 + x175); + basis_lapl_eval[ipt + 8*npts] = x68*(x125 + x163 + x174); + @@ -176,25 +358,25 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = sqrt_35*radial_eval*x*y*(x*x - y*y)/2; - ang_eval_1 = sqrt_70*radial_eval*y*z*(3*x*x - y*y)/4; - ang_eval_2 = sqrt_5*radial_eval*x*y*(-x*x - y*y + 6*z*z)/2; - ang_eval_3 = sqrt_10*radial_eval*y*z*(-3*x*x - 3*y*y + 4*z*z)/4; + ang_eval_0 = radial_eval*x2*x8; + ang_eval_1 = radial_eval*x11*x13; + ang_eval_2 = radial_eval*x16*x21; + ang_eval_3 = radial_eval*x24*x28; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = radial_eval*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; - ang_eval_1 = sqrt_10*radial_eval*x*z*(-3*x*x - 3*y*y + 4*z*z)/4; - ang_eval_2 = sqrt_5*radial_eval*(-x*x*x*x + 6*x*x*z*z + y*y*y*y - 6*y*y*z*z)/4; - ang_eval_3 = sqrt_70*radial_eval*x*z*(x*x - 3*y*y)/4; + ang_eval_0 = x29*x35; + ang_eval_1 = radial_eval*x28*x36; + ang_eval_2 = radial_eval*x37*x38; + ang_eval_3 = radial_eval*x39*x41; basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; basis_eval[ipt + 6*npts] = ang_eval_2; basis_eval[ipt + 7*npts] = ang_eval_3; - ang_eval_0 = sqrt_35*radial_eval*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + ang_eval_0 = sqrt_35*x29*x42; basis_eval[ipt + 8*npts] = ang_eval_0; @@ -203,18 +385,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; - dang_eval_x_0 = sqrt_35*y*(radial_eval*(3*x*x - y*y) + radial_eval_alpha*x*x*(x*x - y*y))/2; - dang_eval_y_0 = sqrt_35*x*(-radial_eval*(-x*x + 3*y*y) + radial_eval_alpha*y*y*(x*x - y*y))/2; - dang_eval_z_0 = sqrt_35*radial_eval_alpha*x*y*z*(x*x - y*y)/2; - dang_eval_x_1 = sqrt_70*x*y*z*(6*radial_eval + radial_eval_alpha*(3*x*x - y*y))/4; - dang_eval_y_1 = sqrt_70*z*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y))/4; - dang_eval_z_1 = sqrt_70*y*(radial_eval + radial_eval_alpha*z*z)*(3*x*x - y*y)/4; - dang_eval_x_2 = sqrt_5*y*(-radial_eval*(3*x*x + y*y - 6*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 6*z*z))/2; - dang_eval_y_2 = sqrt_5*x*(-radial_eval*(x*x + 3*y*y - 6*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 6*z*z))/2; - dang_eval_z_2 = sqrt_5*x*y*z*(12*radial_eval - radial_eval_alpha*(x*x + y*y - 6*z*z))/2; - dang_eval_x_3 = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; - dang_eval_y_3 = sqrt_10*z*(-radial_eval*(3*x*x + 9*y*y - 4*z*z) - radial_eval_alpha*y*y*(3*x*x + 3*y*y - 4*z*z))/4; - dang_eval_z_3 = sqrt_10*y*(3*radial_eval*(-x*x - y*y + 4*z*z) - 
radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; + dang_eval_x_0 = x1*(radial_eval_alpha*x44 + x43); + dang_eval_y_0 = x70*(radial_eval_alpha*x72 + x71); + dang_eval_z_0 = x57*x8*x92; + dang_eval_x_1 = x45*x48; + dang_eval_y_1 = x10*x74; + dang_eval_z_1 = x93*(radial_eval_alpha*x94 + x43); + dang_eval_x_2 = x15*(radial_eval*x49 + radial_eval_alpha*x50); + dang_eval_y_2 = x75*(radial_eval*x77 + radial_eval_alpha*x78); + dang_eval_z_2 = x16*z*(radial_eval_alpha*x21 + x53); + dang_eval_x_3 = x52; + dang_eval_y_3 = x23*(radial_eval*x81 + radial_eval_alpha*x82); + dang_eval_z_3 = x100*x95; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; @@ -228,18 +410,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = x*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; - dang_eval_y_0 = y*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; - dang_eval_z_0 = z*(-16*radial_eval*(3*x*x + 3*y*y - 2*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; - dang_eval_x_1 = sqrt_10*z*(-radial_eval*(9*x*x + 3*y*y - 4*z*z) - radial_eval_alpha*x*x*(3*x*x + 3*y*y - 4*z*z))/4; - dang_eval_y_1 = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; - dang_eval_z_1 = sqrt_10*x*(3*radial_eval*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; - dang_eval_x_2 = sqrt_5*x*(-4*radial_eval*(x*x - 3*z*z) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - dang_eval_y_2 = sqrt_5*y*(4*radial_eval*(y*y - 3*z*z) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - dang_eval_z_2 = sqrt_5*z*(12*radial_eval*(x*x - y*y) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - dang_eval_x_3 = sqrt_70*z*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y))/4; - dang_eval_y_3 = sqrt_70*x*y*z*(-6*radial_eval + radial_eval_alpha*(x*x - 3*y*y))/4; - dang_eval_z_3 = sqrt_70*x*(radial_eval + radial_eval_alpha*z*z)*(x*x - 3*y*y)/4; + dang_eval_x_0 = 0.125*x35*x57 + 0.125*x53*x56; + dang_eval_y_0 = 0.125*x35*x86 + 0.125*x53*x85; + dang_eval_z_0 = 2.0*radial_eval*x102 + 0.125*x103*x35; + dang_eval_x_1 = x23*(radial_eval*x59 + radial_eval_alpha*x60); + dang_eval_y_1 = x52; + dang_eval_z_1 = x*x100*x22; + dang_eval_x_2 = x37*(x38*x57 + x61*x63); + dang_eval_y_2 = x37*(x38*x86 + x61*x88); + dang_eval_z_2 = x104*(radial_eval_alpha*x38 + x105); + dang_eval_x_3 = x10*x67; + dang_eval_y_3 = x45*x90; + dang_eval_z_3 = x*x9*(radial_eval_alpha*x106 + x71); basis_x_eval[ipt + 4*npts] = dang_eval_x_0; basis_y_eval[ipt + 4*npts] = dang_eval_y_0; basis_z_eval[ipt + 4*npts] = dang_eval_z_0; @@ -253,9 +435,9 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 7*npts] = dang_eval_y_3; basis_z_eval[ipt + 7*npts] = dang_eval_z_3; - dang_eval_x_0 = sqrt_35*x*(4*radial_eval*(x*x - 3*y*y) + radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; - dang_eval_y_0 = sqrt_35*y*(-4*radial_eval*(3*x*x - y*y) + radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; - dang_eval_z_0 = sqrt_35*radial_eval_alpha*z*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + dang_eval_x_0 = x68*(x42*x57 + x61*x69); + 
dang_eval_y_0 = x68*(x42*x86 + x61*x91); + dang_eval_z_0 = x103*x42*x68; basis_x_eval[ipt + 8*npts] = dang_eval_x_0; basis_y_eval[ipt + 8*npts] = dang_eval_y_0; basis_z_eval[ipt + 8*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/deprecated/gaueval_kernels_template.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/deprecated/gaueval_kernels_template.cu index 7f489cd1..d3380d60 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/deprecated/gaueval_kernels_template.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/deprecated/gaueval_kernels_template.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/scripts/generate_shell_to_task.py b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/scripts/generate_shell_to_task.py index 76a5d4b7..9bedd244 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/scripts/generate_shell_to_task.py +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/scripts/generate_shell_to_task.py @@ -3,6 +3,8 @@ from collocation_angular import generate_spherical_angular, generate_cartesian_angular, generate_cartesian_ls, generate_eval_lines import sympy import itertools +from sympy.printing import ccode +from sympy.codegen.rewriting import create_expand_pow_optimization from io import StringIO @@ -15,7 +17,7 @@ def generate_shell_to_task_lines( ang, deriv_order = 0 ): do_hess = bool(deriv_order > 1) [x,y,z,r] = sympy.symbols('x y z r', real=True) - [bf,bf_alpha,bf_alpha_sq] = sympy.symbols('radial_eval radial_eval_alpha radial_eval_alpha_squared',real=True) + [bf,bf_alpha,bf_alpha_sq,bf_alpha_cb] = sympy.symbols('radial_eval radial_eval_alpha radial_eval_alpha_squared radial_eval_alpha_cubed',real=True) bf_x = x * bf_alpha bf_y = y * bf_alpha bf_z = z * bf_alpha @@ -28,6 +30,17 @@ def generate_shell_to_task_lines( ang, deriv_order = 0 ): bf_xz = x*z*bf_alpha_sq bf_yz = y*z*bf_alpha_sq + bf_xxx = (x + x + x)*bf_alpha_sq + x*x*x*bf_alpha_cb + bf_xxy = (y + 0 + 0)*bf_alpha_sq + x*x*y*bf_alpha_cb + bf_xxz = (z + 0 + 0)*bf_alpha_sq + x*x*z*bf_alpha_cb + bf_yyx = (x + 0 + 0)*bf_alpha_sq + y*y*x*bf_alpha_cb + bf_yyy = (y + y + y)*bf_alpha_sq + y*y*y*bf_alpha_cb + bf_yyz = (z + 0 + 0)*bf_alpha_sq + y*y*z*bf_alpha_cb + bf_zzx = (x + 0 + 0)*bf_alpha_sq + z*z*x*bf_alpha_cb + bf_zzy = (y + 0 + 0)*bf_alpha_sq + z*z*y*bf_alpha_cb + bf_zzz = (z + z + z)*bf_alpha_sq + z*z*z*bf_alpha_cb + + bf_eval_strs = [] bf_x_eval_strs = [] bf_y_eval_strs = [] @@ -39,6 +52,9 @@ def generate_shell_to_task_lines( ang, deriv_order = 0 ): bf_yz_eval_strs = [] bf_zz_eval_strs = [] bf_lap_eval_strs = [] + bf_lap_x_eval_strs = [] + bf_lap_y_eval_strs = [] + bf_lap_z_eval_strs = [] for j in range(len(ang)): a = ang[j] a_x = sympy.diff( a, x ) @@ -52,25 +68,46 @@ def generate_shell_to_task_lines( ang, deriv_order = 0 ): a_yz = sympy.diff( a_y, z ) a_zz = sympy.diff( a_z, z ) - bf_eval = sympy.simplify( a * bf ) - bf_x_eval = sympy.simplify( a_x * bf + a * bf_x ) - bf_y_eval 
= sympy.simplify( a_y * bf + a * bf_y ) - bf_z_eval = sympy.simplify( a_z * bf + a * bf_z ) - - bf_xx_eval = sympy.simplify( a_xx * bf + 2 * a_x * bf_x + a * bf_xx ) - bf_yy_eval = sympy.simplify( a_yy * bf + 2 * a_y * bf_y + a * bf_yy ) - bf_zz_eval = sympy.simplify( a_zz * bf + 2 * a_z * bf_z + a * bf_zz ) + a_xxx = sympy.diff( a_xx, x ) + a_xxy = sympy.diff( a_xx, y ) + a_xxz = sympy.diff( a_xx, z ) + a_yyx = sympy.diff( a_yy, x ) + a_yyy = sympy.diff( a_yy, y ) + a_yyz = sympy.diff( a_yy, z ) + a_zzx = sympy.diff( a_zz, x ) + a_zzy = sympy.diff( a_zz, y ) + a_zzz = sympy.diff( a_zz, z ) + + bf_eval = a * bf + bf_x_eval = a_x * bf + a * bf_x + bf_y_eval = a_y * bf + a * bf_y + bf_z_eval = a_z * bf + a * bf_z + + bf_xx_eval = a_xx * bf + 2 * a_x * bf_x + a * bf_xx + bf_yy_eval = a_yy * bf + 2 * a_y * bf_y + a * bf_yy + bf_zz_eval = a_zz * bf + 2 * a_z * bf_z + a * bf_zz + + bf_lap_eval = bf_xx_eval + bf_yy_eval + bf_zz_eval + + bf_xy_eval = a_xy * bf + a_x * bf_y + a_y * bf_x + a * bf_xy + bf_xz_eval = a_xz * bf + a_x * bf_z + a_z * bf_x + a * bf_xz + bf_yz_eval = a_yz * bf + a_y * bf_z + a_z * bf_y + a * bf_yz + + bf_xxx_eval = a_xxx * bf + 3 * (a_xx * bf_x + a_x * bf_xx) + a * bf_xxx + bf_yyy_eval = a_yyy * bf + 3 * (a_yy * bf_y + a_y * bf_yy) + a * bf_yyy + bf_zzz_eval = a_zzz * bf + 3 * (a_zz * bf_z + a_z * bf_zz) + a * bf_zzz + + bf_xxy_eval = a_xxy * bf + 2*a_xy*bf_x + a_xx*bf_y + 2*bf_xy*a_x + bf_xx*a_y + a* bf_xxy + bf_xxz_eval = a_xxz * bf + 2*a_xz*bf_x + a_xx*bf_z + 2*bf_xz*a_x + bf_xx*a_z + a* bf_xxz + bf_yyx_eval = a_yyx * bf + 2*a_xy*bf_y + a_yy*bf_x + 2*bf_xy*a_y + bf_yy*a_x + a* bf_yyx + bf_yyz_eval = a_yyz * bf + 2*a_yz*bf_y + a_yy*bf_z + 2*bf_yz*a_y + bf_yy*a_z + a* bf_yyz + bf_zzx_eval = a_zzx * bf + 2*a_xz*bf_z + a_zz*bf_x + 2*bf_xz*a_z + bf_zz*a_x + a* bf_zzx + bf_zzy_eval = a_zzy * bf + 2*a_yz*bf_z + a_zz*bf_y + 2*bf_yz*a_z + bf_zz*a_y + a* bf_zzy + + bf_lap_x_eval = bf_xxx_eval + bf_yyx_eval + bf_zzx_eval + bf_lap_y_eval = bf_xxy_eval + bf_yyy_eval + bf_zzy_eval + bf_lap_z_eval = bf_xxz_eval + bf_yyz_eval + bf_zzz_eval - bf_lap_eval = sympy.simplify(bf_xx_eval + bf_yy_eval + bf_zz_eval) - - bf_xy_eval = sympy.simplify( a_xy * bf + a_x * bf_y + a_y * bf_x + a * bf_xy ) - bf_xz_eval = sympy.simplify( a_xz * bf + a_x * bf_z + a_z * bf_x + a * bf_xz ) - bf_yz_eval = sympy.simplify( a_yz * bf + a_y * bf_z + a_z * bf_y + a * bf_yz ) - - #bf_eval_str = 'ang_eval = {};'.format(bf_eval) - #bf_x_eval_str = 'dang_eval_x = {};'.format(bf_x_eval) - #bf_y_eval_str = 'dang_eval_y = {};'.format(bf_y_eval) - #bf_z_eval_str = 'dang_eval_z = {};'.format(bf_z_eval) bf_eval_str = '{}'.format(bf_eval ) bf_x_eval_str = '{}'.format(bf_x_eval) bf_y_eval_str = '{}'.format(bf_y_eval) @@ -85,26 +122,10 @@ def generate_shell_to_task_lines( ang, deriv_order = 0 ): bf_lap_eval_str = '{}'.format(bf_lap_eval) - for k in range(2,L+3): - for X in ('x','y','z'): - pow_str = X + '**' + str(k) - repl_str = '' - for K in range(k-1): repl_str = repl_str + X + '*' - repl_str = repl_str + X - - bf_eval_str = bf_eval_str.replace(pow_str,repl_str) - bf_x_eval_str = bf_x_eval_str.replace(pow_str,repl_str) - bf_y_eval_str = bf_y_eval_str.replace(pow_str,repl_str) - bf_z_eval_str = bf_z_eval_str.replace(pow_str,repl_str) - - bf_xx_eval_str = bf_xx_eval_str.replace(pow_str,repl_str) - bf_xy_eval_str = bf_xy_eval_str.replace(pow_str,repl_str) - bf_xz_eval_str = bf_xz_eval_str.replace(pow_str,repl_str) - bf_yy_eval_str = bf_yy_eval_str.replace(pow_str,repl_str) - bf_yz_eval_str = 
bf_yz_eval_str.replace(pow_str,repl_str) - bf_zz_eval_str = bf_zz_eval_str.replace(pow_str,repl_str) - - bf_lap_eval_str = bf_lap_eval_str.replace(pow_str,repl_str) + bf_lap_x_eval_str = '{}'.format(bf_lap_x_eval) + bf_lap_y_eval_str = '{}'.format(bf_lap_y_eval) + bf_lap_z_eval_str = '{}'.format(bf_lap_z_eval) + bf_eval_strs.append( bf_eval_str ) bf_x_eval_strs.append( bf_x_eval_str ) bf_y_eval_strs.append( bf_y_eval_str ) @@ -118,11 +139,16 @@ def generate_shell_to_task_lines( ang, deriv_order = 0 ): bf_zz_eval_strs.append( bf_zz_eval_str ) bf_lap_eval_strs.append( bf_lap_eval_str ) + bf_lap_x_eval_strs.append( bf_lap_x_eval_str ) + bf_lap_y_eval_strs.append( bf_lap_y_eval_str ) + bf_lap_z_eval_strs.append( bf_lap_z_eval_str ) if deriv_order == 0: return bf_eval_strs elif deriv_order == 1: return [bf_x_eval_strs, bf_y_eval_strs, bf_z_eval_strs] elif deriv_order == 2: return [bf_xx_eval_strs, bf_xy_eval_strs, bf_xz_eval_strs, bf_yy_eval_strs, bf_yz_eval_strs, bf_zz_eval_strs, bf_lap_eval_strs] + elif deriv_order == 3: + return [bf_lap_x_eval_strs, bf_lap_y_eval_strs, bf_lap_z_eval_strs] @@ -131,7 +157,7 @@ def get_constant_lines( lines ): constant_lines = [] # Sqrts - sqrt_regex = "sqrt\([0-9]+\)" + sqrt_regex = 'sqrt\([0-9]+\)' sqrt_finds = list(set(re.findall( sqrt_regex, "\n".join(lines) ))) # Replace locally @@ -146,7 +172,7 @@ def get_constant_lines( lines ): def sanitize_constants( lines ): # Sqrts - sqrt_regex = "sqrt\([0-9]+\)" + sqrt_regex = 'sqrt\([0-9]+\)' sqrt_finds = list(set(re.findall( sqrt_regex, "\n".join(lines) ))) for x in sqrt_finds: @@ -174,6 +200,9 @@ def sanitize_constants( lines ): cart_bfyz_lines = [] cart_bfzz_lines = [] cart_bflap_lines = [] +cart_bflap_x_lines = [] +cart_bflap_y_lines = [] +cart_bflap_z_lines = [] sph_bfxx_lines = [] sph_bfxy_lines = [] sph_bfxz_lines = [] @@ -181,6 +210,9 @@ def sanitize_constants( lines ): sph_bfyz_lines = [] sph_bfzz_lines = [] sph_bflap_lines = [] +sph_bflap_x_lines = [] +sph_bflap_y_lines = [] +sph_bflap_z_lines = [] for L in range( L_max + 1 ): print("Workding on L = ", L) @@ -219,6 +251,16 @@ def sanitize_constants( lines ): sph_bfzz_lines.append(bfzz) sph_bflap_lines.append(bflap) + [bflap_x, bflap_y, bflap_z] = generate_shell_to_task_lines(cart_ang,3) + cart_bflap_x_lines.append(bflap_x) + cart_bflap_y_lines.append(bflap_y) + cart_bflap_z_lines.append(bflap_z) + + [bflap_x, bflap_y, bflap_z] = generate_shell_to_task_lines(sph_ang,3) + sph_bflap_x_lines.append(bflap_x) + sph_bflap_y_lines.append(bflap_y) + sph_bflap_z_lines.append(bflap_z) + constant_lines = [] for lines in itertools.chain( cart_bf_lines, sph_bf_lines ): @@ -227,60 +269,49 @@ def sanitize_constants( lines ): constant_lines.append(line) -# Sanitize wrt constants -for i,lines in enumerate(cart_bf_lines): - cart_bf_lines[i] = sanitize_constants( lines ) -for i,lines in enumerate(sph_bf_lines): - sph_bf_lines[i] = sanitize_constants( lines ) - -for i,lines in enumerate(cart_bfx_lines): - cart_bfx_lines[i] = sanitize_constants( lines ) -for i,lines in enumerate(cart_bfy_lines): - cart_bfy_lines[i] = sanitize_constants( lines ) -for i,lines in enumerate(cart_bfz_lines): - cart_bfz_lines[i] = sanitize_constants( lines ) - -for i,lines in enumerate(sph_bfx_lines): - sph_bfx_lines[i] = sanitize_constants( lines ) -for i,lines in enumerate(sph_bfy_lines): - sph_bfy_lines[i] = sanitize_constants( lines ) -for i,lines in enumerate(sph_bfz_lines): - sph_bfz_lines[i] = sanitize_constants( lines ) - -for i,lines in enumerate(cart_bfxx_lines): - cart_bfxx_lines[i] = 
sanitize_constants( lines ) -for i,lines in enumerate(cart_bfxy_lines): - cart_bfxy_lines[i] = sanitize_constants( lines ) -for i,lines in enumerate(cart_bfxz_lines): - cart_bfxz_lines[i] = sanitize_constants( lines ) -for i,lines in enumerate(cart_bfyy_lines): - cart_bfyy_lines[i] = sanitize_constants( lines ) -for i,lines in enumerate(cart_bfyz_lines): - cart_bfyz_lines[i] = sanitize_constants( lines ) -for i,lines in enumerate(cart_bfzz_lines): - cart_bfzz_lines[i] = sanitize_constants( lines ) -for i,lines in enumerate(cart_bflap_lines): - cart_bflap_lines[i] = sanitize_constants( lines ) - -for i,lines in enumerate(sph_bfxx_lines): - sph_bfxx_lines[i] = sanitize_constants( lines ) -for i,lines in enumerate(sph_bfxy_lines): - sph_bfxy_lines[i] = sanitize_constants( lines ) -for i,lines in enumerate(sph_bfxz_lines): - sph_bfxz_lines[i] = sanitize_constants( lines ) -for i,lines in enumerate(sph_bfyy_lines): - sph_bfyy_lines[i] = sanitize_constants( lines ) -for i,lines in enumerate(sph_bfyz_lines): - sph_bfyz_lines[i] = sanitize_constants( lines ) -for i,lines in enumerate(sph_bfzz_lines): - sph_bfzz_lines[i] = sanitize_constants( lines ) -for i,lines in enumerate(sph_bflap_lines): - sph_bflap_lines[i] = sanitize_constants( lines ) +def perform_cse_and_cleanup(eval_line_list): + expand_opt = create_expand_pow_optimization(20) + + for i in range(len(eval_line_list)): + if len(eval_line_list[0]) != len(eval_line_list[i]): + raise RuntimeError("Eval lines are not uniform length") + + # Concatenate lists + prim_len = len(eval_line_list[0]) + big_list = [] + for i in range(len(eval_line_list)): + for x in eval_line_list[i]: big_list.append(x) + + # Sanitize constants + big_list = sanitize_constants(big_list) + + # Parse to SymPy expressions + big_list = [sympy.parse_expr(x) for x in big_list] + + # Apply expand opt + big_list = [expand_opt(x) for x in big_list] + + # Perform CSE + (common_lines, big_list) = sympy.cse(big_list, optimizations='basic') + + # Sanitize output lines + big_list = [ccode(expand_opt(sympy.simplify(x.evalf()))) for x in big_list] + common_lines = [ (x,ccode(expand_opt(sympy.simplify(y.evalf())))) for (x,y) in common_lines ] + + # Split big list + for i in range(len(eval_line_list)): + eval_line_list[i] = big_list[i*prim_len:(i+1)*prim_len] + + return (common_lines,eval_line_list) + + def generate_code( eval_lines, L, eval_type, template_fname, output_fname ): old_sysout = sys.stdout - var_dict = { 'eval_lines' : eval_lines, 'L' : L, 'type' : eval_type } + common_lines, eval_lines = perform_cse_and_cleanup([eval_lines]) + eval_lines = eval_lines[0] + var_dict = { 'common_lines': common_lines, 'eval_lines' : eval_lines, 'L' : L, 'type' : eval_type, 'nt' : 512 } sys.stdout = expand = StringIO() expander.expandFile( template_fname, external_definitions = var_dict, auto_indent = True ) expand = expand.getvalue() @@ -291,11 +322,20 @@ def generate_code( eval_lines, L, eval_type, template_fname, output_fname ): def generate_code_gradient( eval_lines, eval_lines_dx, eval_lines_dy, eval_lines_dz, L, eval_type, template_fname, output_fname ): old_sysout = sys.stdout - var_dict = { 'eval_lines' : eval_lines, + + common_lines, big_list = perform_cse_and_cleanup([eval_lines, eval_lines_dx, eval_lines_dy, eval_lines_dz]) + eval_lines = big_list[0] + eval_lines_dx = big_list[1] + eval_lines_dy = big_list[2] + eval_lines_dz = big_list[3] + + var_dict = { 'common_lines': common_lines, + 'eval_lines' : eval_lines, 'eval_lines_dx' : eval_lines_dx, 'eval_lines_dy' : eval_lines_dy, 
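            # (descriptive note, not part of the original script) each 'eval_lines*' entry is a list
            # of C expression strings, one per basis function of the shell; 'common_lines', produced
            # just above by perform_cse_and_cleanup, is a list of (name, expression) pairs -- e.g. a
            # hypothetical ('x0', 'radial_eval*x') -- that the template emits once per grid point
            # before the per-basis-function assignments.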
'eval_lines_dz' : eval_lines_dz, - 'L' : L, 'type' : eval_type } + 'L' : L, 'type' : eval_type, + 'nt' : 512 if L < 1 else 256 if L < 4 else 128} sys.stdout = expand = StringIO() expander.expandFile( template_fname, external_definitions = var_dict, auto_indent = True ) expand = expand.getvalue() @@ -306,7 +346,65 @@ def generate_code_gradient( eval_lines, eval_lines_dx, eval_lines_dy, eval_lines def generate_code_hessian( eval_lines, eval_lines_dx, eval_lines_dy, eval_lines_dz, eval_lines_dxx, eval_lines_dxy, eval_lines_dxz, eval_lines_dyy, eval_lines_dyz, eval_lines_dzz, eval_lines_lap, L, eval_type, template_fname, output_fname ): old_sysout = sys.stdout - var_dict = { 'eval_lines' : eval_lines, + big_list = [eval_lines, eval_lines_dx, eval_lines_dy, eval_lines_dz, eval_lines_dxx, eval_lines_dxy, eval_lines_dxz, eval_lines_dyy, eval_lines_dyz, eval_lines_dzz, eval_lines_lap] + common_lines, big_list = perform_cse_and_cleanup(big_list) + eval_lines = big_list[0] + eval_lines_dx = big_list[1] + eval_lines_dy = big_list[2] + eval_lines_dz = big_list[3] + eval_lines_dxx = big_list[4] + eval_lines_dxy = big_list[5] + eval_lines_dxz = big_list[6] + eval_lines_dyy = big_list[7] + eval_lines_dyz = big_list[8] + eval_lines_dzz = big_list[9] + eval_lines_lap = big_list[10] + + var_dict = { 'common_lines' : common_lines, + 'eval_lines' : eval_lines, + 'eval_lines_dx' : eval_lines_dx, + 'eval_lines_dy' : eval_lines_dy, + 'eval_lines_dz' : eval_lines_dz, + 'eval_lines_dxx' : eval_lines_dxx, + 'eval_lines_dxy' : eval_lines_dxy, + 'eval_lines_dxz' : eval_lines_dxz, + 'eval_lines_dyy' : eval_lines_dyy, + 'eval_lines_dyz' : eval_lines_dyz, + 'eval_lines_dzz' : eval_lines_dzz, + 'eval_lines_lapl' : eval_lines_lap, + 'L' : L, 'type' : eval_type, + 'nt' : 256 if L < 1 else 128 } + sys.stdout = expand = StringIO() + expander.expandFile( template_fname, external_definitions = var_dict, auto_indent = True ) + expand = expand.getvalue() + sys.stdout = old_sysout + + output_file = open(output_fname, 'w') + output_file.write(expand) + + + +def generate_code_lapgrad( eval_lines, eval_lines_dx, eval_lines_dy, eval_lines_dz, eval_lines_dxx, eval_lines_dxy, eval_lines_dxz, eval_lines_dyy, eval_lines_dyz, eval_lines_dzz, eval_lines_lap, eval_lines_lapx, eval_lines_lapy, eval_lines_lapz, L, eval_type, template_fname, output_fname ): + old_sysout = sys.stdout + big_list = [eval_lines, eval_lines_dx, eval_lines_dy, eval_lines_dz, eval_lines_dxx, eval_lines_dxy, eval_lines_dxz, eval_lines_dyy, eval_lines_dyz, eval_lines_dzz, eval_lines_lap, eval_lines_lapx, eval_lines_lapy, eval_lines_lapz] + common_lines, big_list = perform_cse_and_cleanup(big_list) + eval_lines = big_list[0] + eval_lines_dx = big_list[1] + eval_lines_dy = big_list[2] + eval_lines_dz = big_list[3] + eval_lines_dxx = big_list[4] + eval_lines_dxy = big_list[5] + eval_lines_dxz = big_list[6] + eval_lines_dyy = big_list[7] + eval_lines_dyz = big_list[8] + eval_lines_dzz = big_list[9] + eval_lines_lap = big_list[10] + eval_lines_lapx = big_list[11] + eval_lines_lapy = big_list[12] + eval_lines_lapz = big_list[13] + + var_dict = { 'common_lines' : common_lines, + 'eval_lines' : eval_lines, 'eval_lines_dx' : eval_lines_dx, 'eval_lines_dy' : eval_lines_dy, 'eval_lines_dz' : eval_lines_dz, @@ -317,7 +415,11 @@ def generate_code_hessian( eval_lines, eval_lines_dx, eval_lines_dy, eval_lines_ 'eval_lines_dyz' : eval_lines_dyz, 'eval_lines_dzz' : eval_lines_dzz, 'eval_lines_lapl' : eval_lines_lap, - 'L' : L, 'type' : eval_type } + 'eval_lines_lapl_x' : 
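            # (descriptive note) the lapgrad entries below add the Cartesian gradient of the
            # basis-function Laplacian, written by the generated kernels to d3bflapl_x/y/z;
            # presumably these feed nuclear gradients of Laplacian-dependent meta-GGA functionals.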
eval_lines_lapx, + 'eval_lines_lapl_y' : eval_lines_lapy, + 'eval_lines_lapl_z' : eval_lines_lapz, + 'L' : L, 'type' : eval_type, + 'nt' : 256 if L < 1 else 128 } sys.stdout = expand = StringIO() expander.expandFile( template_fname, external_definitions = var_dict, auto_indent = True ) expand = expand.getvalue() @@ -362,6 +464,17 @@ def generate_code_hessian( eval_lines, eval_lines_dx, eval_lines_dy, eval_lines_ sph_bfxx_lines[L], sph_bfxy_lines[L], sph_bfxz_lines[L], sph_bfyy_lines[L], sph_bfyz_lines[L], sph_bfzz_lines[L], sph_bflap_lines[L], L, 'spherical_laplacian', template_fname, sph_header_fname ) + cart_header_fname = "collocation_shell_to_task_kernels_cartesian_l" + str(L) + "_lapgrad.hpp" + sph_header_fname = "collocation_shell_to_task_kernels_spherical_l" + str(L) + "_lapgrad.hpp" + generate_code_lapgrad( cart_bf_lines[L], cart_bfx_lines[L], cart_bfy_lines[L], cart_bfz_lines[L], + cart_bfxx_lines[L], cart_bfxy_lines[L], cart_bfxz_lines[L], cart_bfyy_lines[L], cart_bfyz_lines[L], + cart_bfzz_lines[L], cart_bflap_lines[L], cart_bflap_x_lines[L], cart_bflap_y_lines[L], cart_bflap_z_lines[L], + L, 'cartesian_lapgrad', template_fname, cart_header_fname ) + generate_code_lapgrad( sph_bf_lines[L], sph_bfx_lines[L], sph_bfy_lines[L], sph_bfz_lines[L], + sph_bfxx_lines[L], sph_bfxy_lines[L], sph_bfxz_lines[L], sph_bfyy_lines[L], sph_bfyz_lines[L], + sph_bfzz_lines[L], sph_bflap_lines[L], sph_bflap_x_lines[L], sph_bflap_y_lines[L], sph_bflap_z_lines[L], + L, 'spherical_lapgrad', template_fname, sph_header_fname ) + #template_fname = 'templates/collocation_shell_to_task_combined_kernels.hpp' #cart_header_fname = "collocation_shell_to_task_combined_kernels_cartesian_l" + str(L) + ".hpp" diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_angular_template.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_angular_template.hpp index e92ec0b0..0816560a 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_angular_template.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_angular_template.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_device_constants_template.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_device_constants_template.hpp index 3a62fef3..f76c6863 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_device_constants_template.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_device_constants_template.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). 
+ * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_device_template.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_device_template.cu index 62557401..f28cadee 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_device_template.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_device_template.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -348,7 +352,7 @@ template void dispatch_shell_to_task_collocation_gradient( cudaStream_t stream, int32_t l, bool pure, uint32_t ntask_average, uint32_t nshells, Args&&... args ) { - dim3 threads = max_threads_shell_to_task_collocation(l,pure); + dim3 threads = max_threads_shell_to_task_collocation_gradient(l,pure); int nwarp_per_block = threads.x / cuda::warp_size; int n_task_blocks = util::div_ceil( ntask_average, nwarp_per_block ); dim3 block(n_task_blocks, 1, nshells); @@ -425,7 +429,7 @@ template void dispatch_shell_to_task_collocation_hessian( cudaStream_t stream, int32_t l, bool pure, uint32_t ntask_average, uint32_t nshells, Args&&... args ) { - dim3 threads = max_threads_shell_to_task_collocation(l,pure); + dim3 threads = max_threads_shell_to_task_collocation_hessian(l,pure); int nwarp_per_block = threads.x / cuda::warp_size; int n_task_blocks = util::div_ceil( ntask_average, nwarp_per_block ); dim3 block(n_task_blocks, 1, nshells); @@ -506,7 +510,7 @@ template void dispatch_shell_to_task_collocation_laplacian( cudaStream_t stream, int32_t l, bool pure, uint32_t ntask_average, uint32_t nshells, Args&&... args ) { - dim3 threads = max_threads_shell_to_task_collocation(l,pure); + dim3 threads = max_threads_shell_to_task_collocation_laplacian(l,pure); int nwarp_per_block = threads.x / cuda::warp_size; int n_task_blocks = util::div_ceil( ntask_average, nwarp_per_block ); dim3 block(n_task_blocks, 1, nshells); @@ -561,6 +565,89 @@ void eval_collocation_shell_to_task_laplacian( } +uint32_t max_threads_shell_to_task_collocation_lapgrad( int32_t l, bool pure ) { + if( pure ) { + switch(l) { + case 0: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_lapgrad_0 );\ + $for( L in range(1, L_max + 1) ) + case $(L): return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_lapgrad_$(L) ); + $endfor + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = $(L_max)"); + } + } else { + switch(l) {\ + $for( L in range(L_max + 1) ) + case $(L): return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_lapgrad_$(L) );\ + $endfor + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = $(L_max)"); + } + } + return 0; +} + + + + + +template +void dispatch_shell_to_task_collocation_lapgrad( cudaStream_t stream, int32_t l, + bool pure, uint32_t ntask_average, uint32_t nshells, Args&&... 
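   /* descriptive note: one warp handles one (shell, task) pair -- threads per block are taken from
      the kernel's own occupancy query, warps per block = threads.x / warp_size, and the launch grid
      is (ceil(ntask_average / warps_per_block), 1, nshells), mirroring the other dispatchers. */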
args ) { + + dim3 threads = max_threads_shell_to_task_collocation_lapgrad(l,pure); + int nwarp_per_block = threads.x / cuda::warp_size; + int n_task_blocks = util::div_ceil( ntask_average, nwarp_per_block ); + dim3 block(n_task_blocks, 1, nshells); + + if( pure ) { + switch(l) { + case 0: + collocation_device_shell_to_task_kernel_cartesian_lapgrad_0<<>>( nshells, std::forward(args)... ); + break; + $for( L in range(1, L_max + 1) ) + case $(L): + collocation_device_shell_to_task_kernel_spherical_lapgrad_$(L)<<>>( nshells, std::forward(args)... ); + break;\ + $endfor + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = $(L_max)"); + } + } else { + switch(l) {\ + $for( L in range(0, L_max + 1) ) + case $(L): + collocation_device_shell_to_task_kernel_cartesian_lapgrad_$(L)<<>>( nshells, std::forward(args)... ); + break;\ + $endfor + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = $(L_max)"); + } + } + +} + + + +void eval_collocation_shell_to_task_lapgrad( + uint32_t max_l, + AngularMomentumShellToTaskBatch* l_batched_shell_to_task, + XCDeviceTask* device_tasks, + device_queue queue +) { + + cudaStream_t stream = queue.queue_as() ; + + for( auto l = 0u; l <= max_l; ++l ) { + auto pure = l_batched_shell_to_task[l].pure; + auto shell_to_task_device = l_batched_shell_to_task[l].shell_to_task_device; + auto nshells = l_batched_shell_to_task[l].nshells_in_batch; + auto ntask_average = std::max(1ul, l_batched_shell_to_task[l].ntask_average); + dispatch_shell_to_task_collocation_lapgrad( stream, l, pure, + ntask_average, nshells, shell_to_task_device, device_tasks ); + auto stat = cudaGetLastError(); + GAUXC_CUDA_ERROR("LAP", stat); + } + + +} + } // namespace GauXC diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_shell_to_task_kernels.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_shell_to_task_kernels.hpp index 4b223611..7cc19871 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_shell_to_task_kernels.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_shell_to_task_kernels.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -14,10 +18,10 @@ namespace GauXC { -$py(do_grad = 'gradient' in type or 'hessian' in type or 'lapl' in type)\ -$py(do_hess = 'hessian' in type)\ -$py(do_lapl = 'lapl' in type)\ -$py(nt = 512)\ +$py(do_grad = 'gradient' in type or 'hessian' in type or 'lapl' in type or 'lapgrad' in type)\ +$py(do_hess = 'hessian' in type or 'lapgrad' in type)\ +$py(do_lapl = 'lapl' in type or 'lapgrad' in type)\ +$py(do_lapl_grad = 'lapgrad' in type)\ __global__ __launch_bounds__($(nt),2) void collocation_device_shell_to_task_kernel_$(type)_$(L)( uint32_t nshell, @@ -72,7 +76,6 @@ __global__ __launch_bounds__($(nt),2) void collocation_device_shell_to_task_kern auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; $endif\ - $if( do_hess )\ auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; @@ -84,6 +87,11 @@ __global__ __launch_bounds__($(nt),2) void collocation_device_shell_to_task_kern $if( do_lapl )\ auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; $endif\ +$if( do_lapl_grad )\ + auto* __restrict__ basis_lapl_x_eval = task->d3bflapl_x + shoff; + auto* __restrict__ basis_lapl_y_eval = task->d3bflapl_y + shoff; + auto* __restrict__ basis_lapl_z_eval = task->d3bflapl_z + shoff; +$endif\ // Loop over points in task // Assign each point to separate thread within the warp @@ -109,6 +117,9 @@ __global__ __launch_bounds__($(nt),2) void collocation_device_shell_to_task_kern $if( do_hess or do_lapl)\ double radial_eval_alpha_squared = 0.; $endif\ +$if( do_lapl_grad)\ + double radial_eval_alpha_cubed = 0.; +$endif\ #pragma unroll 1 for( uint32_t i = 0; i < nprim; ++i ) { @@ -121,6 +132,9 @@ __global__ __launch_bounds__($(nt),2) void collocation_device_shell_to_task_kern $endif\ $if( do_hess or do_lapl)\ radial_eval_alpha_squared += a * a * e; +$endif\ +$if( do_lapl_grad)\ + radial_eval_alpha_cubed += a * a * a * e; $endif\ } @@ -130,8 +144,14 @@ __global__ __launch_bounds__($(nt),2) void collocation_device_shell_to_task_kern $if( do_hess or do_lapl)\ radial_eval_alpha_squared *= 4; $endif\ +$if( do_lapl_grad )\ + radial_eval_alpha_cubed *= -8; +$endif\ - + // Common Subexpressions +$for( i in range(len(common_lines)) )\ + const auto $(common_lines[i][0]) = $(common_lines[i][1]); +$endfor // Evaluate basis function $for( j in range(len(eval_lines)) )\ @@ -187,13 +207,29 @@ __global__ __launch_bounds__($(nt),2) void collocation_device_shell_to_task_kern basis_zz_eval[ipt + $(j)*npts] = $(eval_lines_dzz[j]); $endfor\ $endif\ + $if(do_lapl)\ // Evaluate Laplacian of bfn -$for( j in range(len(eval_lines_dx)) )\ +$for( j in range(len(eval_lines_lapl)) )\ basis_lapl_eval[ipt + $(j)*npts] = $(eval_lines_lapl[j]); $endfor\ $endif\ +$if(do_lapl_grad)\ + // Evaluate Laplacian gradient of bfn (dx) +$for( j in range(len(eval_lines_lapl_x)) )\ + basis_lapl_x_eval[ipt + $(j)*npts] = $(eval_lines_lapl_x[j]); +$endfor\ + // Evaluate Laplacian gradient of bfn (dy) +$for( j in range(len(eval_lines_lapl_y)) )\ + basis_lapl_y_eval[ipt + $(j)*npts] = $(eval_lines_lapl_y[j]); +$endfor\ + // Evaluate Laplacian gradient of bfn (dz) +$for( j in range(len(eval_lines_lapl_z)) )\ + basis_lapl_z_eval[ipt + $(j)*npts] = $(eval_lines_lapl_z[j]); +$endfor\ +$endif\ + diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_shell_to_task_kernels_template.hpp 
b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_shell_to_task_kernels_template.hpp index 544554b3..a699d9e6 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_shell_to_task_kernels_template.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_shell_to_task_kernels_template.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -23,6 +27,10 @@ #include "collocation/collocation_shell_to_task_kernels_cartesian_l$(L)_laplacian.hpp"\ $endfor +$for( L in range(L_max + 1)) +#include "collocation/collocation_shell_to_task_kernels_cartesian_l$(L)_lapgrad.hpp"\ +$endfor + $for( L in range(L_max + 1)) #include "collocation/collocation_shell_to_task_kernels_spherical_l$(L).hpp"\ $endfor @@ -38,3 +46,7 @@ $for( L in range(L_max + 1)) #include "collocation/collocation_shell_to_task_kernels_spherical_l$(L)_laplacian.hpp"\ $endfor + +$for( L in range(L_max + 1)) +#include "collocation/collocation_shell_to_task_kernels_spherical_l$(L)_lapgrad.hpp"\ +$endfor diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_task_to_shell.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_task_to_shell.hpp index 4cb41a3b..abb281f4 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_task_to_shell.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_task_to_shell.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_device.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_device.cu index 5ed615fb..d01b4d8b 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_device.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_device.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -372,7 +376,7 @@ template void dispatch_shell_to_task_collocation_gradient( cudaStream_t stream, int32_t l, bool pure, uint32_t ntask_average, uint32_t nshells, Args&&... 
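   /* note on the hunks below: the occupancy query is switched to the matching derivative kernels
      (gradient / hessian / laplacian); the previous code queried the value-only kernel, which could
      request more threads per block than the heavier derivative kernels' __launch_bounds__ permit. */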
args ) { - dim3 threads = max_threads_shell_to_task_collocation(l,pure); + dim3 threads = max_threads_shell_to_task_collocation_gradient(l,pure); int nwarp_per_block = threads.x / cuda::warp_size; int n_task_blocks = util::div_ceil( ntask_average, nwarp_per_block ); dim3 block(n_task_blocks, 1, nshells); @@ -469,7 +473,7 @@ template void dispatch_shell_to_task_collocation_hessian( cudaStream_t stream, int32_t l, bool pure, uint32_t ntask_average, uint32_t nshells, Args&&... args ) { - dim3 threads = max_threads_shell_to_task_collocation(l,pure); + dim3 threads = max_threads_shell_to_task_collocation_hessian(l,pure); int nwarp_per_block = threads.x / cuda::warp_size; int n_task_blocks = util::div_ceil( ntask_average, nwarp_per_block ); dim3 block(n_task_blocks, 1, nshells); @@ -539,39 +543,42 @@ void eval_collocation_shell_to_task_hessian( } - - - - - uint32_t max_threads_shell_to_task_collocation_laplacian( int32_t l, bool pure ) { if( pure ) { switch(l) { - case 0: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_laplacian_0 ); + case 0: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_laplacian_0 ); case 1: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_laplacian_1 ); + case 2: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_laplacian_2 ); + case 3: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_laplacian_3 ); + case 4: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_laplacian_4 ); + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); } } else { - switch(l) { - case 0: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_laplacian_0 ); - case 1: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_laplacian_1 ); - case 2: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_laplacian_2 ); - case 3: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_laplacian_3 ); - case 4: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_laplacian_4 ); + switch(l) { + case 0: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_laplacian_0 ); + case 1: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_laplacian_1 ); + case 2: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_laplacian_2 ); + case 3: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_laplacian_3 ); + case 4: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_laplacian_4 ); default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); } } return 0; } + + + + template -void dispatch_shell_to_task_collocation_laplacian( cudaStream_t stream, int32_t l, +void dispatch_shell_to_task_collocation_laplacian( cudaStream_t stream, int32_t l, bool pure, uint32_t ntask_average, uint32_t nshells, Args&&... 
args ) { - dim3 threads = max_threads_shell_to_task_collocation(l,pure); + dim3 threads = max_threads_shell_to_task_collocation_laplacian(l,pure); int nwarp_per_block = threads.x / cuda::warp_size; int n_task_blocks = util::div_ceil( ntask_average, nwarp_per_block ); dim3 block(n_task_blocks, 1, nshells); @@ -581,37 +588,38 @@ void dispatch_shell_to_task_collocation_laplacian( cudaStream_t stream, int32_t case 0: collocation_device_shell_to_task_kernel_cartesian_laplacian_0<<>>( nshells, std::forward(args)... ); break; + case 1: collocation_device_shell_to_task_kernel_spherical_laplacian_1<<>>( nshells, std::forward(args)... ); - break; + break; case 2: collocation_device_shell_to_task_kernel_spherical_laplacian_2<<>>( nshells, std::forward(args)... ); - break; + break; case 3: collocation_device_shell_to_task_kernel_spherical_laplacian_3<<>>( nshells, std::forward(args)... ); - break; + break; case 4: collocation_device_shell_to_task_kernel_spherical_laplacian_4<<>>( nshells, std::forward(args)... ); - break; + break; default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); } } else { - switch(l) { + switch(l) { case 0: collocation_device_shell_to_task_kernel_cartesian_laplacian_0<<>>( nshells, std::forward(args)... ); - break; + break; case 1: collocation_device_shell_to_task_kernel_cartesian_laplacian_1<<>>( nshells, std::forward(args)... ); - break; + break; case 2: collocation_device_shell_to_task_kernel_cartesian_laplacian_2<<>>( nshells, std::forward(args)... ); - break; + break; case 3: collocation_device_shell_to_task_kernel_cartesian_laplacian_3<<>>( nshells, std::forward(args)... ); - break; + break; case 4: collocation_device_shell_to_task_kernel_cartesian_laplacian_4<<>>( nshells, std::forward(args)... ); - break; + break; default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); } } @@ -619,11 +627,12 @@ void dispatch_shell_to_task_collocation_laplacian( cudaStream_t stream, int32_t } + void eval_collocation_shell_to_task_laplacian( uint32_t max_l, AngularMomentumShellToTaskBatch* l_batched_shell_to_task, XCDeviceTask* device_tasks, - device_queue queue + device_queue queue ) { cudaStream_t stream = queue.queue_as() ; @@ -633,7 +642,7 @@ void eval_collocation_shell_to_task_laplacian( auto shell_to_task_device = l_batched_shell_to_task[l].shell_to_task_device; auto nshells = l_batched_shell_to_task[l].nshells_in_batch; auto ntask_average = std::max(1ul, l_batched_shell_to_task[l].ntask_average); - dispatch_shell_to_task_collocation_laplacian( stream, l, pure, + dispatch_shell_to_task_collocation_laplacian( stream, l, pure, ntask_average, nshells, shell_to_task_device, device_tasks ); auto stat = cudaGetLastError(); GAUXC_CUDA_ERROR("LAP", stat); @@ -642,6 +651,113 @@ void eval_collocation_shell_to_task_laplacian( } +uint32_t max_threads_shell_to_task_collocation_lapgrad( int32_t l, bool pure ) { + if( pure ) { + switch(l) { + case 0: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_lapgrad_0 ); + case 1: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_lapgrad_1 ); + + case 2: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_lapgrad_2 ); + + case 3: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_lapgrad_3 ); + + case 4: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_lapgrad_4 ); + + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); + } + } 
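  /* non-pure (Cartesian) shells are handled by the else branch below, which queries the cartesian
     lapgrad kernels; querying per kernel lets each angular momentum respect its own __launch_bounds__. */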
else { + switch(l) { + case 0: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_lapgrad_0 ); + case 1: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_lapgrad_1 ); + case 2: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_lapgrad_2 ); + case 3: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_lapgrad_3 ); + case 4: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_lapgrad_4 ); + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); + } + } + return 0; +} + + + + + +template +void dispatch_shell_to_task_collocation_lapgrad( cudaStream_t stream, int32_t l, + bool pure, uint32_t ntask_average, uint32_t nshells, Args&&... args ) { + + dim3 threads = max_threads_shell_to_task_collocation_lapgrad(l,pure); + int nwarp_per_block = threads.x / cuda::warp_size; + int n_task_blocks = util::div_ceil( ntask_average, nwarp_per_block ); + dim3 block(n_task_blocks, 1, nshells); + + if( pure ) { + switch(l) { + case 0: + collocation_device_shell_to_task_kernel_cartesian_lapgrad_0<<>>( nshells, std::forward(args)... ); + break; + + case 1: + collocation_device_shell_to_task_kernel_spherical_lapgrad_1<<>>( nshells, std::forward(args)... ); + break; + case 2: + collocation_device_shell_to_task_kernel_spherical_lapgrad_2<<>>( nshells, std::forward(args)... ); + break; + case 3: + collocation_device_shell_to_task_kernel_spherical_lapgrad_3<<>>( nshells, std::forward(args)... ); + break; + case 4: + collocation_device_shell_to_task_kernel_spherical_lapgrad_4<<>>( nshells, std::forward(args)... ); + break; + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); + } + } else { + switch(l) { + case 0: + collocation_device_shell_to_task_kernel_cartesian_lapgrad_0<<>>( nshells, std::forward(args)... ); + break; + case 1: + collocation_device_shell_to_task_kernel_cartesian_lapgrad_1<<>>( nshells, std::forward(args)... ); + break; + case 2: + collocation_device_shell_to_task_kernel_cartesian_lapgrad_2<<>>( nshells, std::forward(args)... ); + break; + case 3: + collocation_device_shell_to_task_kernel_cartesian_lapgrad_3<<>>( nshells, std::forward(args)... ); + break; + case 4: + collocation_device_shell_to_task_kernel_cartesian_lapgrad_4<<>>( nshells, std::forward(args)... 
); + break; + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); + } + } + +} + + + +void eval_collocation_shell_to_task_lapgrad( + uint32_t max_l, + AngularMomentumShellToTaskBatch* l_batched_shell_to_task, + XCDeviceTask* device_tasks, + device_queue queue +) { + + cudaStream_t stream = queue.queue_as() ; + + for( auto l = 0u; l <= max_l; ++l ) { + auto pure = l_batched_shell_to_task[l].pure; + auto shell_to_task_device = l_batched_shell_to_task[l].shell_to_task_device; + auto nshells = l_batched_shell_to_task[l].nshells_in_batch; + auto ntask_average = std::max(1ul, l_batched_shell_to_task[l].ntask_average); + dispatch_shell_to_task_collocation_lapgrad( stream, l, pure, + ntask_average, nshells, shell_to_task_device, device_tasks ); + auto stat = cudaGetLastError(); + GAUXC_CUDA_ERROR("LAPGRAD", stat); + } + + +} diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_masked_combined_kernels.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_masked_combined_kernels.hpp index b401c126..dcc42625 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_masked_combined_kernels.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_masked_combined_kernels.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_masked_kernels.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_masked_kernels.hpp index 2a17c7d2..ecda9d2b 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_masked_kernels.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_masked_kernels.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_shell_to_task_kernels.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_shell_to_task_kernels.hpp index a5a725a8..e18494b8 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_shell_to_task_kernels.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_shell_to_task_kernels.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -28,6 +32,7 @@ #include "collocation/collocation_shell_to_task_kernels_cartesian_l3_hessian.hpp" #include "collocation/collocation_shell_to_task_kernels_cartesian_l4_hessian.hpp" + #include "collocation/collocation_shell_to_task_kernels_cartesian_l0_laplacian.hpp" #include "collocation/collocation_shell_to_task_kernels_cartesian_l1_laplacian.hpp" #include "collocation/collocation_shell_to_task_kernels_cartesian_l2_laplacian.hpp" @@ -35,6 +40,13 @@ #include "collocation/collocation_shell_to_task_kernels_cartesian_l4_laplacian.hpp" +#include "collocation/collocation_shell_to_task_kernels_cartesian_l0_lapgrad.hpp" +#include "collocation/collocation_shell_to_task_kernels_cartesian_l1_lapgrad.hpp" +#include "collocation/collocation_shell_to_task_kernels_cartesian_l2_lapgrad.hpp" +#include "collocation/collocation_shell_to_task_kernels_cartesian_l3_lapgrad.hpp" +#include "collocation/collocation_shell_to_task_kernels_cartesian_l4_lapgrad.hpp" + + #include "collocation/collocation_shell_to_task_kernels_spherical_l0.hpp" #include "collocation/collocation_shell_to_task_kernels_spherical_l1.hpp" #include "collocation/collocation_shell_to_task_kernels_spherical_l2.hpp" @@ -55,8 +67,16 @@ #include "collocation/collocation_shell_to_task_kernels_spherical_l3_hessian.hpp" #include "collocation/collocation_shell_to_task_kernels_spherical_l4_hessian.hpp" + #include "collocation/collocation_shell_to_task_kernels_spherical_l0_laplacian.hpp" #include "collocation/collocation_shell_to_task_kernels_spherical_l1_laplacian.hpp" #include "collocation/collocation_shell_to_task_kernels_spherical_l2_laplacian.hpp" #include "collocation/collocation_shell_to_task_kernels_spherical_l3_laplacian.hpp" #include "collocation/collocation_shell_to_task_kernels_spherical_l4_laplacian.hpp" + + +#include "collocation/collocation_shell_to_task_kernels_spherical_l0_lapgrad.hpp" +#include "collocation/collocation_shell_to_task_kernels_spherical_l1_lapgrad.hpp" +#include "collocation/collocation_shell_to_task_kernels_spherical_l2_lapgrad.hpp" +#include "collocation/collocation_shell_to_task_kernels_spherical_l3_lapgrad.hpp" +#include "collocation/collocation_shell_to_task_kernels_spherical_l4_lapgrad.hpp" diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/cublas_extensions.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/cublas_extensions.cu index ee7c7746..947d7b18 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/cublas_extensions.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/cublas_extensions.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -45,6 +49,24 @@ void increment( const T* X, T* Y, cudaStream_t stream ) { increment_kernel<<<1,1,0,stream>>>(X,Y); } +template +__global__ void increment_vec_kernel( const T* X, T* Y, int N ) { + const auto tid = blockIdx.x * blockDim.x + threadIdx.x; + if( tid < N ) Y[tid] += X[tid]; +} + +template +void increment( device_blas_handle generic_handle, const T* X, T* Y, int N) { + const int threads = cuda::warp_size * cuda::max_warps_per_thread_block; + const int blocks = util::div_ceil( N, threads ); + cublasHandle_t handle = generic_handle.blas_handle_as(); + auto stream = util::get_stream(handle); + increment_vec_kernel<<>>(X,Y,N); +} + +template + void increment( device_blas_handle generic_handle, const double* X, double* Y, int N ); + template <> void dot( device_blas_handle generic_handle, int N, diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_extensions.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_extensions.hpp index fdbb56bd..8f5d0560 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_extensions.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_extensions.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_inc_potential.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_inc_potential.cu index d54dda73..5e59ffcf 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_inc_potential.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_inc_potential.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_ssf_1d.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_ssf_1d.cu index 6c538eba..54d2486e 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_ssf_1d.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_ssf_1d.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -33,22 +37,23 @@ __global__ void modify_weights_ssf_kernel_1d( // Frisch partition functions auto gFrisch = [](double x) { - const double s_x = x / integrator::magic_ssf_factor<>; + const double s_x = x * 1.5625; // / integrator::magic_ssf_factor<>; const double s_x2 = s_x * s_x; const double s_x3 = s_x * s_x2; const double s_x5 = s_x3 * s_x2; const double s_x7 = s_x5 * s_x2; - return (35.*(s_x - s_x3) + 21.*s_x5 - 5.*s_x7) / 16.; + //return (35.*(s_x - s_x3) + 21.*s_x5 - 5.*s_x7) / 16.; + return ((35.)*(s_x - s_x3) + (21.)*s_x5 - (5.)*s_x7); }; auto sFrisch = [&] (double x) { - if( fabs(x) < integrator::magic_ssf_factor<> ) return 0.5 * (1. - gFrisch(x)); + if( fabs(x) < integrator::magic_ssf_factor<> ) return (0.5 - (0.5/16.) * gFrisch(x)); else if( x >= integrator::magic_ssf_factor<> ) return 0.; else return 1.; }; - constexpr double weight_tol = 1e-10; + constexpr double weight_tol = integrator::ssf_weight_tol; const int tid_x = threadIdx.x + blockIdx.x * blockDim.x; const int nt_x = blockDim.x * gridDim.x; @@ -100,7 +105,7 @@ __global__ void modify_weights_ssf_kernel_1d( const double ri = local_dist_scratch[ iCenter ]; - const double* const local_rab = RAB + iCenter * natoms; + const double* const local_rab = RAB + iCenter * ldRAB; double ps = 1.; for( int jCenter = 0; jCenter < natoms; jCenter++ ) @@ -138,4 +143,227 @@ void partition_weights_ssf_1d( int32_t npts, int32_t natoms, const double* RAB, } +__global__ void eval_weight_1st_deriv_contracted_ssf_kernel_1d( + size_t npts, + size_t natoms, + const double* RAB, + int32_t ldRAB, + const double* coords, + const double* points_x, + const double* points_y, + const double* points_z, + const double* dist_scratch, + int32_t lddist, + const int32_t* iparent_device, + const double* dist_nearest_device, + const double* __restrict__ w_times_f_device, + double* __restrict__ exc_grad_w_device +) { + + // Frisch partition functions + auto gFrisch = [](double x) { + + const double s_x = x * 1.5625; // / integrator::magic_ssf_factor<>; + const double s_x2 = s_x * s_x; + const double s_x3 = s_x * s_x2; + const double s_x5 = s_x3 * s_x2; + const double s_x7 = s_x5 * s_x2; + + return ((35.)*(s_x - s_x3) + (21.)*s_x5 - (5.)*s_x7); + }; + + auto sFrisch = [&] (double x) { + if( fabs(x) < integrator::magic_ssf_factor<> ) return (0.5 - (0.5/16.) * gFrisch(x)); + else if( x >= integrator::magic_ssf_factor<> ) return 0.; + else return 1.; + }; + + auto tFrisch = [&](double x) { + const double s_x = x * 1.5625; // / integrator::magic_ssf_factor<>; + const double s_x2 = s_x * s_x; + const double s_x3 = s_x * s_x2; + const double numerator = (35.) * (s_x3 + (3.) * s_x2 + (3.) 
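        /* algebraic note: numerator = 35*(s_x + 1)^3 and denominator = (x - magic_ssf_factor) *
           (5*s_x^3 + 20*s_x^2 + 29*s_x + 16), so tFrisch(x) equals s'(x)/s(x), the logarithmic
           derivative of the Frisch switching function, with the common (s_x - 1)^3 factor cancelled
           analytically to avoid 0/0 as |x| approaches magic_ssf_factor. */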
* s_x + (1.)); + const double denominator = (x - integrator::magic_ssf_factor<>) * ((5.)*s_x3 + (20.)*s_x2 + (29.)*s_x + (16.)); + return numerator / denominator ; + }; + + constexpr double safe_magic_ssf_bound = integrator::magic_ssf_factor<> - 1e-4; + constexpr double weight_tol = integrator::ssf_weight_tol; + constexpr double w_times_f_thresh = 1.e-12; + + const int tid_x = threadIdx.x + blockIdx.x * blockDim.x; + const int nt_x = blockDim.x * gridDim.x; + + for( int ipt = tid_x; ipt < npts; ipt += nt_x ) { + + const auto w_times_f_i = w_times_f_device[ipt]; + if (fabs(w_times_f_i) < w_times_f_thresh) continue; // weight derivative = 0 when p_A = 0 + const auto iParent = iparent_device[ipt]; + + double sum = 0.; + double parent_weight = 0.; + + const double* const local_dist_scratch = dist_scratch + ipt * lddist; + const double dist_cutoff = 0.18 * dist_nearest_device[ipt]; // 0.5 * (1-integrator::magic_ssf_factor<>) * task.dist_nearest + if( local_dist_scratch[iParent] < dist_cutoff ) continue; //weight derivative = 0 when p_A = 1 + + // Do iParent First + { + const double ri = local_dist_scratch[ iParent ]; + const double* const local_rab = RAB + iParent * ldRAB; + + parent_weight = 1.; + for( int jCenter = 0; jCenter < natoms; jCenter++ ) + if( parent_weight > weight_tol ) { + if( iParent != jCenter ) { + + const double rj = local_dist_scratch[ jCenter ]; + + const double mu = (ri - rj) * local_rab[ jCenter ]; // XXX: RAB is symmetric + parent_weight *= sFrisch( mu ); + + } + } else break; + + sum += parent_weight; + } + + // caculate sum + for( int iCenter = 0; iCenter < natoms; iCenter++ ) + if ( iParent != iCenter ) { + const double ri = local_dist_scratch[ iCenter ]; + const double* const local_rab = RAB + iCenter * ldRAB; + double ps = 1.; + for( int jCenter = 0; jCenter < natoms; jCenter++ ) + if( ps > weight_tol ) { + if( iCenter != jCenter ) { + + const double rj = local_dist_scratch[ jCenter ]; + const double mu = (ri - rj) * local_rab[ jCenter ]; // XXX: RAB is symmetric + ps *= sFrisch( mu ); + } + } else break; + + sum += ps; + + } + + double sum_inv = 1. 
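      /* sum accumulates the unnormalized cell functions P_C(r) of every center; the SSF weight of
         the parent atom is w_A = P_A / sum, so each term of its gradient carries a 1/sum factor. */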
/ sum; + + const double point_x = points_x[ipt]; + const double point_y = points_y[ipt]; + const double point_z = points_z[ipt]; + + // Now do derivative + for( int iB = 0; iB < natoms; iB++ ) + if( iParent != iB ) + { + double exc_grad_w_iBx = 0.0, exc_grad_w_iBy = 0.0, exc_grad_w_iBz = 0.0; + + const double* const local_Rinv_B = RAB + iB * ldRAB; + const double rB = local_dist_scratch[ iB ]; + const double coords_B_x = coords[3*iB + 0]; + const double coords_B_y = coords[3*iB + 1]; + const double coords_B_z = coords[3*iB + 2]; + + // first term + const double rA = local_dist_scratch[ iParent ]; + const double rAB_inv = local_Rinv_B[ iParent ]; + const double mu_AB = (rA - rB) * rAB_inv; + if( fabs(mu_AB) < safe_magic_ssf_bound) { + // first term is tFrisch(mu_AB) * (PA-Z)/Z * w_times_f_i * nabla_B mu_BA + double coef1 = tFrisch(mu_AB) * rAB_inv * (parent_weight - sum) * sum_inv * w_times_f_i / rB; + exc_grad_w_iBx = coef1 * (coords_B_x - point_x + mu_AB * ( coords_B_x - coords[3*iParent + 0]) * rAB_inv * rB); + exc_grad_w_iBy = coef1 * (coords_B_y - point_y + mu_AB * ( coords_B_y - coords[3*iParent + 1]) * rAB_inv * rB); + exc_grad_w_iBz = coef1 * (coords_B_z - point_z + mu_AB * ( coords_B_z - coords[3*iParent + 2]) * rAB_inv * rB); + } + + // second term and third term + // first need to calculate PB + double PB = 1.; + for( int jCenter = 0; jCenter < natoms; jCenter++ ) + if( PB > weight_tol ) { + if( iB != jCenter ) { + const double rj = local_dist_scratch[ jCenter ]; + const double mu = (rB - rj) * local_Rinv_B[ jCenter ]; + PB *= sFrisch( mu ); + } + } else break; + + if( PB > weight_tol ) + for( int iC = 0; iC < natoms; iC++ ) { + if (iB == iC) continue; + const double rBC_inv = local_Rinv_B[iC]; + const double rC = local_dist_scratch[iC]; + const double mu_BC = (rB - rC) * rBC_inv; + + if(fabs(mu_BC) < safe_magic_ssf_bound){ + const double t_BC = tFrisch(mu_BC); + const double coef = PB * t_BC * rBC_inv * sum_inv * w_times_f_i; + + const double coords_C_x = coords[3*iC + 0]; + const double coords_C_y = coords[3*iC + 1]; + const double coords_C_z = coords[3*iC + 2]; + + // second term + { + const double rB_inv = 1. / rB; + exc_grad_w_iBx -= coef * ((coords_B_x - point_x) * rB_inv - mu_BC * (coords_B_x - coords_C_x) * rBC_inv); + exc_grad_w_iBy -= coef * ((coords_B_y - point_y) * rB_inv - mu_BC * (coords_B_y - coords_C_y) * rBC_inv); + exc_grad_w_iBz -= coef * ((coords_B_z - point_z) * rB_inv - mu_BC * (coords_B_z - coords_C_z) * rBC_inv); + } + + if(iC != iParent) { + // third term + const double rC_inv = 1. 
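                  /* this contribution comes from the dependence of mu_BC on center C: it is added
                     atomically to atom C and the same amount is subtracted from the parent atom, so
                     the summed weight-derivative forces remain translationally invariant. */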
/ rC; + const double C_x = coef * ((coords_C_x - point_x) * rC_inv + mu_BC * (coords_C_x - coords_B_x) * rBC_inv); + const double C_y = coef * ((coords_C_y - point_y) * rC_inv + mu_BC * (coords_C_y - coords_B_y) * rBC_inv); + const double C_z = coef * ((coords_C_z - point_z) * rC_inv + mu_BC * (coords_C_z - coords_B_z) * rBC_inv); + + atomicAdd(exc_grad_w_device + 3*iC + 0, C_x); + atomicAdd(exc_grad_w_device + 3*iC + 1, C_y); + atomicAdd(exc_grad_w_device + 3*iC + 2, C_z); + + // Update parent atom + atomicAdd(exc_grad_w_device + 3*iParent + 0, -C_x); + atomicAdd(exc_grad_w_device + 3*iParent + 1, -C_y); + atomicAdd(exc_grad_w_device + 3*iParent + 2, -C_z); + } + } + } + + atomicAdd(exc_grad_w_device + 3*iB + 0, exc_grad_w_iBx); + atomicAdd(exc_grad_w_device + 3*iB + 1, exc_grad_w_iBy); + atomicAdd(exc_grad_w_device + 3*iB + 2, exc_grad_w_iBz); + + // Update parent atom + atomicAdd(exc_grad_w_device + 3*iParent + 0, -exc_grad_w_iBx); + atomicAdd(exc_grad_w_device + 3*iParent + 1, -exc_grad_w_iBy); + atomicAdd(exc_grad_w_device + 3*iParent + 2, -exc_grad_w_iBz); + + } + + } + +} + + + +void eval_weight_1st_deriv_contracted_ssf_1d( int32_t npts, int32_t natoms, const double* RAB, + int32_t ldRAB, const double* coords, + const double* points_x, const double* points_y, const double* points_z, + const double* dist, int32_t lddist, + const int32_t* iparent, const double* dist_nearest, const double* w_times_f, + double* exc_grad_w, cudaStream_t stream){ + + dim3 threads( cuda::max_threads_per_thread_block/4 ); + dim3 blocks ( util::div_ceil( npts, threads.x ) ); + eval_weight_1st_deriv_contracted_ssf_kernel_1d<<>>( + npts, natoms, RAB, ldRAB, coords, points_x, points_y, points_z, dist, lddist, iparent, dist_nearest, + w_times_f, exc_grad_w + ); + +} + + } diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_ssf_1d.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_ssf_1d.hpp index d7295527..bb9d3b74 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_ssf_1d.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_ssf_1d.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -13,4 +17,11 @@ void partition_weights_ssf_1d( int32_t npts, int32_t natoms, const double* RAB, const int32_t* iparent, const double* dist_nearest, double* weights, cudaStream_t stream); +void eval_weight_1st_deriv_contracted_ssf_1d( int32_t npts, int32_t natoms, const double* RAB, + int32_t ldRAB, const double* coords, + const double* points_x, const double* points_y, const double* points_z, + const double* dist, int32_t lddist, + const int32_t* iparent, const double* dist_nearest, const double* w_times_f, + double* exc_grad_w, cudaStream_t stream); + } diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_ssf_2d.hu b/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_ssf_2d.hu index 36f439ee..b792cd0c 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_ssf_2d.hu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_ssf_2d.hu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -48,7 +52,7 @@ void modify_weights_ssf_kernel_2d( int32_t npts, int32_t natoms, //constexpr uint32_t warps_per_thread_block = weight_thread_block / cuda::warp_size; static_assert( weight_unroll == 4 ); - constexpr double weight_tol = 1e-10; + constexpr double weight_tol = integrator::ssf_weight_tol; int natom_block = ((natoms + blockDim.x - 1) / blockDim.x) * blockDim.x; const int tid_x = threadIdx.y + blockIdx.y * blockDim.y; @@ -87,6 +91,7 @@ void modify_weights_ssf_kernel_2d( int32_t npts, int32_t natoms, contribution = sFrisch( mu ); } contribution = cuda::warp_reduce_prod(contribution); + contribution = __shfl_sync(0xFFFFFFFF, contribution, 0); parent_weight *= contribution; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/cutlass_wrapper.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/cutlass_wrapper.cu index 0eae8d71..722d8c05 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/cutlass_wrapper.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/cutlass_wrapper.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/cutlass_wrapper.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/cutlass_wrapper.hpp index 8807b70f..d9fa216b 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/cutlass_wrapper.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/cutlass_wrapper.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). 
+ * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/exx_ek_screening_bfn_stats.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/exx_ek_screening_bfn_stats.cu index a4a033f6..86799ad2 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/exx_ek_screening_bfn_stats.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/exx_ek_screening_bfn_stats.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/grid_to_center.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/grid_to_center.cu index 652dc537..03600010 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/grid_to_center.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/grid_to_center.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/grid_to_center.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/grid_to_center.hpp index 9fbd2b3f..fc1a9d44 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/grid_to_center.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/grid_to_center.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/increment_exc_grad.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/increment_exc_grad.cu index 4dd328bd..9470c1c7 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/increment_exc_grad.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/increment_exc_grad.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -13,7 +17,8 @@ namespace GauXC { -__global__ __launch_bounds__(1024,1) void increment_exc_grad_lda_kernel( +template +__global__ __launch_bounds__(1024,1) void increment_exc_grad_lda_rks_kernel( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks, @@ -25,6 +30,7 @@ __global__ __launch_bounds__(1024,1) void increment_exc_grad_lda_kernel( const auto shell = shell_to_task[ish].shell_device; const auto task_idx = shell_to_task[ish].task_idx_device; const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device; + const int iCen = shell_to_task[ish].center_idx; const uint32_t shsz = shell->size(); const int global_warp_id = @@ -37,6 +43,12 @@ __global__ __launch_bounds__(1024,1) void increment_exc_grad_lda_kernel( const auto* task = device_tasks + task_idx[itask]; const uint32_t npts = task->npts; const size_t shoff = task_shell_offs[itask] * npts; + const int iParent = task->iParent; + if constexpr( with_weight_derivatives ) { + if( iCen == iParent ) + continue; + } + double g_acc_x_task(0), g_acc_y_task(0), g_acc_z_task(0); const auto* __restrict__ basis_x_eval = task->dbfx + shoff; const auto* __restrict__ basis_y_eval = task->dbfy + shoff; @@ -57,13 +69,24 @@ __global__ __launch_bounds__(1024,1) void increment_exc_grad_lda_kernel( const double dbfy_mu_i = basis_y_eval[ipt + ibf*npts]; const double dbfz_mu_i = basis_z_eval[ipt + ibf*npts]; - g_acc_x += z_mu_i * dbfx_mu_i; - g_acc_y += z_mu_i * dbfy_mu_i; - g_acc_z += z_mu_i * dbfz_mu_i; + g_acc_x_task += z_mu_i * dbfx_mu_i; + g_acc_y_task += z_mu_i * dbfy_mu_i; + g_acc_z_task += z_mu_i * dbfz_mu_i; } // Loop over bfns within a shell } // Loop over points + g_acc_x += g_acc_x_task; + g_acc_y += g_acc_y_task; + g_acc_z += g_acc_z_task; + + //write to Parent atom with translational invariance + if constexpr( with_weight_derivatives ) { + atomicAdd( EXC_GRAD + 3*iParent + 0, 2.0 * g_acc_x_task ); + atomicAdd( EXC_GRAD + 3*iParent + 1, 2.0 * g_acc_y_task ); + atomicAdd( EXC_GRAD + 3*iParent + 2, 2.0 * g_acc_z_task ); + } + } // Loop over tasks assigned to shell constexpr auto warp_size = cuda::warp_size; @@ -72,7 +95,6 @@ __global__ __launch_bounds__(1024,1) void increment_exc_grad_lda_kernel( g_acc_z = -2. 
* cuda::warp_reduce_sum( g_acc_z ); if( (threadIdx.x % cuda::warp_size) == 0 ) { - const int iCen = shell_to_task[ish].center_idx; atomicAdd( EXC_GRAD + 3*iCen + 0, g_acc_x ); atomicAdd( EXC_GRAD + 3*iCen + 1, g_acc_y ); @@ -83,8 +105,105 @@ __global__ __launch_bounds__(1024,1) void increment_exc_grad_lda_kernel( } -void increment_exc_grad_lda( size_t nshell, ShellToTaskDevice* shell_to_task, - XCDeviceTask* device_tasks, double* EXC_GRAD, device_queue queue ) { +template +__global__ __launch_bounds__(1024,1) void increment_exc_grad_lda_uks_kernel( + uint32_t nshell, + ShellToTaskDevice* __restrict__ shell_to_task, + XCDeviceTask* __restrict__ device_tasks, + double* __restrict__ EXC_GRAD +) { + + for( uint32_t ish = blockIdx.z; ish < nshell; ish += gridDim.z ) { + const uint32_t ntasks = shell_to_task[ish].ntask; + const auto shell = shell_to_task[ish].shell_device; + const auto task_idx = shell_to_task[ish].task_idx_device; + const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device; + const int iCen = shell_to_task[ish].center_idx; + const uint32_t shsz = shell->size(); + + const int global_warp_id = + (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size; + const int nwarp_global = max((blockDim.x*gridDim.x) / cuda::warp_size,1); + + double g_acc_x(0), g_acc_y(0), g_acc_z(0); + for( uint32_t itask = global_warp_id; itask < ntasks; itask += nwarp_global ) { + + const auto* task = device_tasks + task_idx[itask]; + const uint32_t npts = task->npts; + const size_t shoff = task_shell_offs[itask] * npts; + const int iParent = task->iParent; + if constexpr( with_weight_derivatives ) { + if( iCen == iParent ) + continue; + } + double g_acc_x_task(0), g_acc_y_task(0), g_acc_z_task(0); + + const auto* __restrict__ basis_x_eval = task->dbfx + shoff; + const auto* __restrict__ basis_y_eval = task->dbfy + shoff; + const auto* __restrict__ basis_z_eval = task->dbfz + shoff; + + const auto* __restrict__ xmatS = task->xmatS + shoff; + const auto* __restrict__ xmatZ = task->xmatZ + shoff; + const auto* __restrict__ vrhop = task->vrho_pos; + const auto* __restrict__ vrhom = task->vrho_neg; + + #pragma unroll 1 + for( uint32_t ipt = threadIdx.x % cuda::warp_size; + ipt < npts; + ipt += cuda::warp_size ) { + + const double vrhop_i = vrhop[ipt]; + const double vrhom_i = vrhom[ipt]; + + const auto vrhoS_i = 0.5 * (vrhop_i + vrhom_i); + const auto vrhoZ_i = 0.5 * (vrhop_i - vrhom_i); + for( uint32_t ibf = 0; ibf < shsz; ++ibf ) { + const double zS_mu_i = vrhoS_i * xmatS[ipt + ibf*npts]; + const double zZ_mu_i = vrhoZ_i * xmatZ[ipt + ibf*npts]; + const double dbfx_mu_i = basis_x_eval[ipt + ibf*npts]; + const double dbfy_mu_i = basis_y_eval[ipt + ibf*npts]; + const double dbfz_mu_i = basis_z_eval[ipt + ibf*npts]; + + g_acc_x_task += zS_mu_i * dbfx_mu_i; + g_acc_y_task += zS_mu_i * dbfy_mu_i; + g_acc_z_task += zS_mu_i * dbfz_mu_i; + g_acc_x_task += zZ_mu_i * dbfx_mu_i; + g_acc_y_task += zZ_mu_i * dbfy_mu_i; + g_acc_z_task += zZ_mu_i * dbfz_mu_i; + } // Loop over bfns within a shell + + } // Loop over points + + g_acc_x += g_acc_x_task; + g_acc_y += g_acc_y_task; + g_acc_z += g_acc_z_task; + + //write to Parent atom with translational invariance + if constexpr( with_weight_derivatives ) { + atomicAdd( EXC_GRAD + 3*iParent + 0, 2.0 * g_acc_x_task ); + atomicAdd( EXC_GRAD + 3*iParent + 1, 2.0 * g_acc_y_task ); + atomicAdd( EXC_GRAD + 3*iParent + 2, 2.0 * g_acc_z_task ); + } + + } // Loop over tasks assigned to shell + + constexpr auto warp_size = cuda::warp_size; + g_acc_x = -2. 
* cuda::warp_reduce_sum( g_acc_x ); + g_acc_y = -2. * cuda::warp_reduce_sum( g_acc_y ); + g_acc_z = -2. * cuda::warp_reduce_sum( g_acc_z ); + + if( (threadIdx.x % cuda::warp_size) == 0 ) { + atomicAdd( EXC_GRAD + 3*iCen + 0, g_acc_x ); + atomicAdd( EXC_GRAD + 3*iCen + 1, g_acc_y ); + atomicAdd( EXC_GRAD + 3*iCen + 2, g_acc_z ); + } + + } // Loop over shells + +} + +void increment_exc_grad_lda( integrator_ks_scheme ks_scheme, size_t nshell, ShellToTaskDevice* shell_to_task, + XCDeviceTask* device_tasks, double* EXC_GRAD, bool with_weight_derivatives, device_queue queue ) { cudaStream_t stream = queue.queue_as(); #if 0 @@ -98,9 +217,31 @@ void increment_exc_grad_lda( size_t nshell, ShellToTaskDevice* shell_to_task, dim3 threads(1024), blocks(1,1,nshell); #endif - increment_exc_grad_lda_kernel<<>>( - nshell, shell_to_task, device_tasks, EXC_GRAD - ); + switch(ks_scheme) { + case RKS: + if (with_weight_derivatives) { + increment_exc_grad_lda_rks_kernel<<>>( + nshell, shell_to_task, device_tasks, EXC_GRAD + ); + } else { + increment_exc_grad_lda_rks_kernel<<>>( + nshell, shell_to_task, device_tasks, EXC_GRAD + ); + } + break; + case UKS: + if (with_weight_derivatives) { + increment_exc_grad_lda_uks_kernel<<>>( + nshell, shell_to_task, device_tasks, EXC_GRAD + ); + } else { + increment_exc_grad_lda_uks_kernel<<>>( + nshell, shell_to_task, device_tasks, EXC_GRAD + ); + } + break; + default: GAUXC_GENERIC_EXCEPTION("LDA EXC GRAD + GKS NYI"); + } } @@ -117,11 +258,138 @@ void increment_exc_grad_lda( size_t nshell, ShellToTaskDevice* shell_to_task, +template +__global__ __launch_bounds__(512,1) void increment_exc_grad_gga_rks_kernel( + uint32_t nshell, + ShellToTaskDevice* __restrict__ shell_to_task, + XCDeviceTask* __restrict__ device_tasks, + double* __restrict__ EXC_GRAD +) { + + for( uint32_t ish = blockIdx.z; ish < nshell; ish += gridDim.z ) { + const uint32_t ntasks = shell_to_task[ish].ntask; + const auto shell = shell_to_task[ish].shell_device; + const auto task_idx = shell_to_task[ish].task_idx_device; + const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device; + const int iCen = shell_to_task[ish].center_idx; + const uint32_t shsz = shell->size(); + + const int global_warp_id = + (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size; + const int nwarp_global = max((blockDim.x*gridDim.x) / cuda::warp_size,1); + + double g_acc_x(0), g_acc_y(0), g_acc_z(0); + for( uint32_t itask = global_warp_id; itask < ntasks; itask += nwarp_global ) { + + const auto* task = device_tasks + task_idx[itask]; + const uint32_t npts = task->npts; + const size_t shoff = task_shell_offs[itask] * npts; + const int iParent = task->iParent; + if constexpr( with_weight_derivatives ) { + if( iCen == iParent ) + continue; + } + double g_acc_x_task(0), g_acc_y_task(0), g_acc_z_task(0); + + const auto* __restrict__ basis_x_eval = task->dbfx + shoff; + const auto* __restrict__ basis_y_eval = task->dbfy + shoff; + const auto* __restrict__ basis_z_eval = task->dbfz + shoff; + + const auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; + const auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; + const auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; + const auto* __restrict__ basis_yy_eval = task->d2bfyy + shoff; + const auto* __restrict__ basis_yz_eval = task->d2bfyz + shoff; + const auto* __restrict__ basis_zz_eval = task->d2bfzz + shoff; + + const auto* __restrict__ xmat = task->zmat + shoff; + const auto* __restrict__ xmat_x = task->xmat_x + shoff; + const auto* __restrict__ xmat_y = 
task->xmat_y + shoff; + const auto* __restrict__ xmat_z = task->xmat_z + shoff; + + const auto* __restrict__ vrho = task->vrho; + const auto* __restrict__ vgamma = task->vgamma; + + const auto* __restrict__ den_x = task->dden_sx; + const auto* __restrict__ den_y = task->dden_sy; + const auto* __restrict__ den_z = task->dden_sz; + + #pragma unroll 1 + for( uint32_t ipt = threadIdx.x % cuda::warp_size; + ipt < npts; + ipt += cuda::warp_size ) { + + const double vrho_i = vrho[ipt]; + const double vgamma_i = vgamma[ipt]; + + const double denx_i = den_x[ipt]; + const double deny_i = den_y[ipt]; + const double denz_i = den_z[ipt]; + for( uint32_t ibf = 0; ibf < shsz; ++ibf ) { + const double z_mu_i = xmat[ipt + ibf*npts]; + const double dbfx_mu_i = basis_x_eval[ipt + ibf*npts]; + const double dbfy_mu_i = basis_y_eval[ipt + ibf*npts]; + const double dbfz_mu_i = basis_z_eval[ipt + ibf*npts]; + + g_acc_x_task += vrho_i * z_mu_i * dbfx_mu_i; + g_acc_y_task += vrho_i * z_mu_i * dbfy_mu_i; + g_acc_z_task += vrho_i * z_mu_i * dbfz_mu_i; + + const double zx = xmat_x[ipt + ibf*npts]; + const double zy = xmat_y[ipt + ibf*npts]; + const double zz = xmat_z[ipt + ibf*npts]; + + const double d11_xmat_term = denx_i * zx + deny_i * zy + denz_i * zz; + + const double d2bfxx = basis_xx_eval[ipt + ibf*npts]; + const double d2bfxy = basis_xy_eval[ipt + ibf*npts]; + const double d2bfxz = basis_xz_eval[ipt + ibf*npts]; + const double d2bfyy = basis_yy_eval[ipt + ibf*npts]; + const double d2bfyz = basis_yz_eval[ipt + ibf*npts]; + const double d2bfzz = basis_zz_eval[ipt + ibf*npts]; + + const double d2_term_x = d2bfxx*denx_i + d2bfxy*deny_i + d2bfxz*denz_i; + const double d2_term_y = d2bfxy*denx_i + d2bfyy*deny_i + d2bfyz*denz_i; + const double d2_term_z = d2bfxz*denx_i + d2bfyz*deny_i + d2bfzz*denz_i; + + g_acc_x_task += 2 * vgamma_i * ( z_mu_i * d2_term_x + dbfx_mu_i * d11_xmat_term ); + g_acc_y_task += 2 * vgamma_i * ( z_mu_i * d2_term_y + dbfy_mu_i * d11_xmat_term ); + g_acc_z_task += 2 * vgamma_i * ( z_mu_i * d2_term_z + dbfz_mu_i * d11_xmat_term ); + + } // Loop over bfns within a shell + + } // Loop over points + + g_acc_x += g_acc_x_task; + g_acc_y += g_acc_y_task; + g_acc_z += g_acc_z_task; + + //write to Parent atom with translational invariance + if constexpr( with_weight_derivatives ) { + atomicAdd( EXC_GRAD + 3*iParent + 0, 2.0 * g_acc_x_task ); + atomicAdd( EXC_GRAD + 3*iParent + 1, 2.0 * g_acc_y_task ); + atomicAdd( EXC_GRAD + 3*iParent + 2, 2.0 * g_acc_z_task ); + } + + } // Loop over tasks assigned to shell + + constexpr auto warp_size = cuda::warp_size; + g_acc_x = -2. * cuda::warp_reduce_sum( g_acc_x ); + g_acc_y = -2. * cuda::warp_reduce_sum( g_acc_y ); + g_acc_z = -2. 
* cuda::warp_reduce_sum( g_acc_z ); + if( (threadIdx.x % cuda::warp_size) == 0 ) { + atomicAdd( EXC_GRAD + 3*iCen + 0, g_acc_x ); + atomicAdd( EXC_GRAD + 3*iCen + 1, g_acc_y ); + atomicAdd( EXC_GRAD + 3*iCen + 2, g_acc_z ); + } + } // Loop over shells +} -__global__ __launch_bounds__(512,1) void increment_exc_grad_gga_kernel( +template +__global__ __launch_bounds__(512,1) void increment_exc_grad_gga_uks_kernel( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks, @@ -133,6 +401,7 @@ __global__ __launch_bounds__(512,1) void increment_exc_grad_gga_kernel( const auto shell = shell_to_task[ish].shell_device; const auto task_idx = shell_to_task[ish].task_idx_device; const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device; + const int iCen = shell_to_task[ish].center_idx; const uint32_t shsz = shell->size(); const int global_warp_id = @@ -145,6 +414,12 @@ __global__ __launch_bounds__(512,1) void increment_exc_grad_gga_kernel( const auto* task = device_tasks + task_idx[itask]; const uint32_t npts = task->npts; const size_t shoff = task_shell_offs[itask] * npts; + const int iParent = task->iParent; + if constexpr( with_weight_derivatives ) { + if( iCen == iParent ) + continue; + } + double g_acc_x_task(0), g_acc_y_task(0), g_acc_z_task(0); const auto* __restrict__ basis_x_eval = task->dbfx + shoff; const auto* __restrict__ basis_y_eval = task->dbfy + shoff; @@ -157,13 +432,229 @@ __global__ __launch_bounds__(512,1) void increment_exc_grad_gga_kernel( const auto* __restrict__ basis_yz_eval = task->d2bfyz + shoff; const auto* __restrict__ basis_zz_eval = task->d2bfzz + shoff; - const auto* __restrict__ xmat = task->zmat + shoff; + const auto* __restrict__ xmatS = task->xmatS + shoff; + const auto* __restrict__ xmatS_x = task->xmatS_x + shoff; + const auto* __restrict__ xmatS_y = task->xmatS_y + shoff; + const auto* __restrict__ xmatS_z = task->xmatS_z + shoff; + + const auto* __restrict__ xmatZ = task->xmatZ + shoff; + const auto* __restrict__ xmatZ_x = task->xmatZ_x + shoff; + const auto* __restrict__ xmatZ_y = task->xmatZ_y + shoff; + const auto* __restrict__ xmatZ_z = task->xmatZ_z + shoff; + + const auto* __restrict__ vrhop = task->vrho_pos; + const auto* __restrict__ vrhom = task->vrho_neg; + + const auto* __restrict__ vgamma_pp = task->vgamma_pp; + const auto* __restrict__ vgamma_pm = task->vgamma_pm; + const auto* __restrict__ vgamma_mm = task->vgamma_mm; + + const auto* __restrict__ dens_x = task->dden_sx; + const auto* __restrict__ dens_y = task->dden_sy; + const auto* __restrict__ dens_z = task->dden_sz; + + const auto* __restrict__ denz_x = task->dden_zx; + const auto* __restrict__ denz_y = task->dden_zy; + const auto* __restrict__ denz_z = task->dden_zz; + + #pragma unroll 1 + for( uint32_t ipt = threadIdx.x % cuda::warp_size; + ipt < npts; + ipt += cuda::warp_size ) { + + const double vrhop_i = vrhop[ipt]; + const double vrhom_i = vrhom[ipt]; + const double vrhoS_i = 0.5 * (vrhop_i + vrhom_i); + const double vrhoZ_i = 0.5 * (vrhop_i - vrhom_i); + + const double vgammapp_i = vgamma_pp[ipt]; + const double vgammapm_i = vgamma_pm[ipt]; + const double vgammamm_i = vgamma_mm[ipt]; + + const double denSx_i = dens_x[ipt]; + const double denSy_i = dens_y[ipt]; + const double denSz_i = dens_z[ipt]; + const double denZx_i = denz_x[ipt]; + const double denZy_i = denz_y[ipt]; + const double denZz_i = denz_z[ipt]; + + for( uint32_t ibf = 0; ibf < shsz; ++ibf ) { + const double xN = xmatS[ipt + ibf*npts]; + const double xZ = xmatZ[ipt 
+ ibf*npts]; + const double dbfx_mu_i = basis_x_eval[ipt + ibf*npts]; + const double dbfy_mu_i = basis_y_eval[ipt + ibf*npts]; + const double dbfz_mu_i = basis_z_eval[ipt + ibf*npts]; + + g_acc_x_task += vrhoS_i * xN * dbfx_mu_i; + g_acc_y_task += vrhoS_i * xN * dbfy_mu_i; + g_acc_z_task += vrhoS_i * xN * dbfz_mu_i; + g_acc_x_task += vrhoZ_i * xZ * dbfx_mu_i; + g_acc_y_task += vrhoZ_i * xZ * dbfy_mu_i; + g_acc_z_task += vrhoZ_i * xZ * dbfz_mu_i; + + const double xNx = xmatS_x[ipt + ibf*npts]; + const double xNy = xmatS_y[ipt + ibf*npts]; + const double xNz = xmatS_z[ipt + ibf*npts]; + const double xZx = xmatZ_x[ipt + ibf*npts]; + const double xZy = xmatZ_y[ipt + ibf*npts]; + const double xZz = xmatZ_z[ipt + ibf*npts]; + + const double d11nn_xmat_term = denSx_i * xNx + denSy_i * xNy + denSz_i * xNz; + const double d11nz_xmat_term = denSx_i * xZx + denSy_i * xZy + denSz_i * xZz; + const double d11zn_xmat_term = denZx_i * xNx + denZy_i * xNy + denZz_i * xNz; + const double d11zz_xmat_term = denZx_i * xZx + denZy_i * xZy + denZz_i * xZz; + + const double d2bfxx = basis_xx_eval[ipt + ibf*npts]; + const double d2bfxy = basis_xy_eval[ipt + ibf*npts]; + const double d2bfxz = basis_xz_eval[ipt + ibf*npts]; + const double d2bfyy = basis_yy_eval[ipt + ibf*npts]; + const double d2bfyz = basis_yz_eval[ipt + ibf*npts]; + const double d2bfzz = basis_zz_eval[ipt + ibf*npts]; + + const double d2n_term_x = d2bfxx*denSx_i + d2bfxy*denSy_i + d2bfxz*denSz_i; + const double d2n_term_y = d2bfxy*denSx_i + d2bfyy*denSy_i + d2bfyz*denSz_i; + const double d2n_term_z = d2bfxz*denSx_i + d2bfyz*denSy_i + d2bfzz*denSz_i; + const double d2z_term_x = d2bfxx*denZx_i + d2bfxy*denZy_i + d2bfxz*denZz_i; + const double d2z_term_y = d2bfxy*denZx_i + d2bfyy*denZy_i + d2bfyz*denZz_i; + const double d2z_term_z = d2bfxz*denZx_i + d2bfyz*denZy_i + d2bfzz*denZz_i; + + g_acc_x_task += 0.5 * (vgammapp_i + vgammapm_i + vgammamm_i) * (d2n_term_x * xN + d11nn_xmat_term * dbfx_mu_i); + g_acc_x_task += 0.5 * (vgammapp_i - vgammamm_i) * (d2z_term_x * xN + d11zn_xmat_term * dbfx_mu_i); + g_acc_x_task += 0.5 * (vgammapp_i - vgammamm_i) * (d2n_term_x * xZ + d11nz_xmat_term * dbfx_mu_i); + g_acc_x_task += 0.5 * (vgammapp_i - vgammapm_i + vgammamm_i) * (d2z_term_x * xZ + d11zz_xmat_term * dbfx_mu_i); + + g_acc_y_task += 0.5 * (vgammapp_i + vgammapm_i + vgammamm_i) * (d2n_term_y * xN + d11nn_xmat_term * dbfy_mu_i); + g_acc_y_task += 0.5 * (vgammapp_i - vgammamm_i) * (d2z_term_y * xN + d11zn_xmat_term * dbfy_mu_i); + g_acc_y_task += 0.5 * (vgammapp_i - vgammamm_i) * (d2n_term_y * xZ + d11nz_xmat_term * dbfy_mu_i); + g_acc_y_task += 0.5 * (vgammapp_i - vgammapm_i + vgammamm_i) * (d2z_term_y * xZ + d11zz_xmat_term * dbfy_mu_i); + + g_acc_z_task += 0.5 * (vgammapp_i + vgammapm_i + vgammamm_i) * (d2n_term_z * xN + d11nn_xmat_term * dbfz_mu_i); + g_acc_z_task += 0.5 * (vgammapp_i - vgammamm_i) * (d2z_term_z * xN + d11zn_xmat_term * dbfz_mu_i); + g_acc_z_task += 0.5 * (vgammapp_i - vgammamm_i) * (d2n_term_z * xZ + d11nz_xmat_term * dbfz_mu_i); + g_acc_z_task += 0.5 * (vgammapp_i - vgammapm_i + vgammamm_i) * (d2z_term_z * xZ + d11zz_xmat_term * dbfz_mu_i); + + }// Loop over bfns within a shell + + } // Loop over points + + g_acc_x += g_acc_x_task; + g_acc_y += g_acc_y_task; + g_acc_z += g_acc_z_task; + + //write to Parent atom with translational invariance + if constexpr( with_weight_derivatives ) { + atomicAdd( EXC_GRAD + 3*iParent + 0, 2.0 * g_acc_x_task ); + atomicAdd( EXC_GRAD + 3*iParent + 1, 2.0 * g_acc_y_task ); + atomicAdd( EXC_GRAD + 3*iParent + 
2, 2.0 * g_acc_z_task ); + } + + } // Loop over tasks assigned to shell + + constexpr auto warp_size = cuda::warp_size; + g_acc_x = -2. * cuda::warp_reduce_sum( g_acc_x ); + g_acc_y = -2. * cuda::warp_reduce_sum( g_acc_y ); + g_acc_z = -2. * cuda::warp_reduce_sum( g_acc_z ); + + if( (threadIdx.x % cuda::warp_size) == 0 ) { + atomicAdd( EXC_GRAD + 3*iCen + 0, g_acc_x ); + atomicAdd( EXC_GRAD + 3*iCen + 1, g_acc_y ); + atomicAdd( EXC_GRAD + 3*iCen + 2, g_acc_z ); + } + + } // Loop over shells + +} + +void increment_exc_grad_gga( integrator_ks_scheme ks_scheme, size_t nshell, ShellToTaskDevice* shell_to_task, + XCDeviceTask* device_tasks, double* EXC_GRAD, bool with_weight_derivatives, device_queue queue ) { + + cudaStream_t stream = queue.queue_as(); + dim3 threads(512), blocks(1,1,nshell); + + switch(ks_scheme) { + case RKS: + if (with_weight_derivatives) { + increment_exc_grad_gga_rks_kernel<<>>( + nshell, shell_to_task, device_tasks, EXC_GRAD + ); + } else { + increment_exc_grad_gga_rks_kernel<<>>( + nshell, shell_to_task, device_tasks, EXC_GRAD + ); + } + break; + case UKS: + if (with_weight_derivatives) { + increment_exc_grad_gga_uks_kernel<<>>( + nshell, shell_to_task, device_tasks, EXC_GRAD + ); + } else { + increment_exc_grad_gga_uks_kernel<<>>( + nshell, shell_to_task, device_tasks, EXC_GRAD + ); + } + break; + default: GAUXC_GENERIC_EXCEPTION("GGA EXC GRAD + GKS NYI"); + } +} + + + + + + +template +__global__ __launch_bounds__(512,1) void increment_exc_grad_mgga_rks_kernel( + uint32_t nshell, + ShellToTaskDevice* __restrict__ shell_to_task, + XCDeviceTask* __restrict__ device_tasks, + double* __restrict__ EXC_GRAD +) { + + for( uint32_t ish = blockIdx.z; ish < nshell; ish += gridDim.z ) { + const uint32_t ntasks = shell_to_task[ish].ntask; + const auto shell = shell_to_task[ish].shell_device; + const auto task_idx = shell_to_task[ish].task_idx_device; + const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device; + const int iCen = shell_to_task[ish].center_idx; + const uint32_t shsz = shell->size(); + + const int global_warp_id = + (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size; + const int nwarp_global = max((blockDim.x*gridDim.x) / cuda::warp_size,1); + + double g_acc_x(0), g_acc_y(0), g_acc_z(0); + for( uint32_t itask = global_warp_id; itask < ntasks; itask += nwarp_global ) { + + const auto* task = device_tasks + task_idx[itask]; + const uint32_t npts = task->npts; + const size_t shoff = task_shell_offs[itask] * npts; + const int iParent = task->iParent; + if constexpr( with_weight_derivatives ) { + if( iCen == iParent ) + continue; + } + double g_acc_x_task(0), g_acc_y_task(0), g_acc_z_task(0); + + const auto* __restrict__ basis_x_eval = task->dbfx + shoff; + const auto* __restrict__ basis_y_eval = task->dbfy + shoff; + const auto* __restrict__ basis_z_eval = task->dbfz + shoff; + + const auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; + const auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; + const auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; + const auto* __restrict__ basis_yy_eval = task->d2bfyy + shoff; + const auto* __restrict__ basis_yz_eval = task->d2bfyz + shoff; + const auto* __restrict__ basis_zz_eval = task->d2bfzz + shoff; + + const auto* __restrict__ xmat = task->zmat + shoff; const auto* __restrict__ xmat_x = task->xmat_x + shoff; const auto* __restrict__ xmat_y = task->xmat_y + shoff; const auto* __restrict__ xmat_z = task->xmat_z + shoff; - const auto* __restrict__ vrho = task->vrho; + const auto* __restrict__ 
vrho = task->vrho; const auto* __restrict__ vgamma = task->vgamma; + const auto* __restrict__ vtau = task->vtau; const auto* __restrict__ den_x = task->dden_sx; const auto* __restrict__ den_y = task->dden_sy; @@ -176,6 +667,7 @@ __global__ __launch_bounds__(512,1) void increment_exc_grad_gga_kernel( const double vrho_i = vrho[ipt]; const double vgamma_i = vgamma[ipt]; + const double vtau_i = 0.5 * vtau[ipt]; const double denx_i = den_x[ipt]; const double deny_i = den_y[ipt]; @@ -186,15 +678,15 @@ __global__ __launch_bounds__(512,1) void increment_exc_grad_gga_kernel( const double dbfy_mu_i = basis_y_eval[ipt + ibf*npts]; const double dbfz_mu_i = basis_z_eval[ipt + ibf*npts]; - g_acc_x += vrho_i * z_mu_i * dbfx_mu_i; - g_acc_y += vrho_i * z_mu_i * dbfy_mu_i; - g_acc_z += vrho_i * z_mu_i * dbfz_mu_i; + g_acc_x_task += vrho_i * z_mu_i * dbfx_mu_i; + g_acc_y_task += vrho_i * z_mu_i * dbfy_mu_i; + g_acc_z_task += vrho_i * z_mu_i * dbfz_mu_i; const double zx = xmat_x[ipt + ibf*npts]; const double zy = xmat_y[ipt + ibf*npts]; const double zz = xmat_z[ipt + ibf*npts]; - const double d11_xmat_term = denx_i * zx + deny_i * zy + denz_i * zz; + const double d11_xmat_term = denx_i * zx + deny_i * zy + denz_i * zz; const double d2bfxx = basis_xx_eval[ipt + ibf*npts]; const double d2bfxy = basis_xy_eval[ipt + ibf*npts]; @@ -203,18 +695,41 @@ __global__ __launch_bounds__(512,1) void increment_exc_grad_gga_kernel( const double d2bfyz = basis_yz_eval[ipt + ibf*npts]; const double d2bfzz = basis_zz_eval[ipt + ibf*npts]; - const double d2_term_x = d2bfxx*denx_i + d2bfxy*deny_i + d2bfxz*denz_i; - const double d2_term_y = d2bfxy*denx_i + d2bfyy*deny_i + d2bfyz*denz_i; - const double d2_term_z = d2bfxz*denx_i + d2bfyz*deny_i + d2bfzz*denz_i; + { + const double d2_term_x = d2bfxx*denx_i + d2bfxy*deny_i + d2bfxz*denz_i; + const double d2_term_y = d2bfxy*denx_i + d2bfyy*deny_i + d2bfyz*denz_i; + const double d2_term_z = d2bfxz*denx_i + d2bfyz*deny_i + d2bfzz*denz_i; + + g_acc_x_task += 2 * vgamma_i * ( z_mu_i * d2_term_x + dbfx_mu_i * d11_xmat_term ); + g_acc_y_task += 2 * vgamma_i * ( z_mu_i * d2_term_y + dbfy_mu_i * d11_xmat_term ); + g_acc_z_task += 2 * vgamma_i * ( z_mu_i * d2_term_z + dbfz_mu_i * d11_xmat_term ); + } + + { + const double d2_term_x = d2bfxx*zx + d2bfxy*zy + d2bfxz*zz; + const double d2_term_y = d2bfxy*zx + d2bfyy*zy + d2bfyz*zz; + const double d2_term_z = d2bfxz*zx + d2bfyz*zy + d2bfzz*zz; - g_acc_x += 2 * vgamma_i * ( z_mu_i * d2_term_x + dbfx_mu_i * d11_xmat_term ); - g_acc_y += 2 * vgamma_i * ( z_mu_i * d2_term_y + dbfy_mu_i * d11_xmat_term ); - g_acc_z += 2 * vgamma_i * ( z_mu_i * d2_term_z + dbfz_mu_i * d11_xmat_term ); + g_acc_x_task += vtau_i * d2_term_x; + g_acc_y_task += vtau_i * d2_term_y; + g_acc_z_task += vtau_i * d2_term_z; + } } // Loop over bfns within a shell } // Loop over points + g_acc_x += g_acc_x_task; + g_acc_y += g_acc_y_task; + g_acc_z += g_acc_z_task; + + //write to Parent atom with translational invariance + if constexpr( with_weight_derivatives ) { + atomicAdd( EXC_GRAD + 3*iParent + 0, 2.0 * g_acc_x_task ); + atomicAdd( EXC_GRAD + 3*iParent + 1, 2.0 * g_acc_y_task ); + atomicAdd( EXC_GRAD + 3*iParent + 2, 2.0 * g_acc_z_task ); + } + } // Loop over tasks assigned to shell constexpr auto warp_size = cuda::warp_size; @@ -223,8 +738,206 @@ __global__ __launch_bounds__(512,1) void increment_exc_grad_gga_kernel( g_acc_z = -2. 
* cuda::warp_reduce_sum( g_acc_z ); if( (threadIdx.x % cuda::warp_size) == 0 ) { - const int iCen = shell_to_task[ish].center_idx; + atomicAdd( EXC_GRAD + 3*iCen + 0, g_acc_x ); + atomicAdd( EXC_GRAD + 3*iCen + 1, g_acc_y ); + atomicAdd( EXC_GRAD + 3*iCen + 2, g_acc_z ); + } + + } // Loop over shells + +} + +template +__global__ __launch_bounds__(512,1) void increment_exc_grad_mgga_uks_kernel( + uint32_t nshell, + ShellToTaskDevice* __restrict__ shell_to_task, + XCDeviceTask* __restrict__ device_tasks, + double* __restrict__ EXC_GRAD +) { + for( uint32_t ish = blockIdx.z; ish < nshell; ish += gridDim.z ) { + const uint32_t ntasks = shell_to_task[ish].ntask; + const auto shell = shell_to_task[ish].shell_device; + const auto task_idx = shell_to_task[ish].task_idx_device; + const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device; + const int iCen = shell_to_task[ish].center_idx; + const uint32_t shsz = shell->size(); + + const int global_warp_id = + (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size; + const int nwarp_global = max((blockDim.x*gridDim.x) / cuda::warp_size,1); + + double g_acc_x(0), g_acc_y(0), g_acc_z(0); + for( uint32_t itask = global_warp_id; itask < ntasks; itask += nwarp_global ) { + const auto* task = device_tasks + task_idx[itask]; + const uint32_t npts = task->npts; + const size_t shoff = task_shell_offs[itask] * npts; + const int iParent = task->iParent; + if constexpr( with_weight_derivatives ) { + if( iCen == iParent ) + continue; + } + double g_acc_x_task(0), g_acc_y_task(0), g_acc_z_task(0); + + const auto* __restrict__ basis_x_eval = task->dbfx + shoff; + const auto* __restrict__ basis_y_eval = task->dbfy + shoff; + const auto* __restrict__ basis_z_eval = task->dbfz + shoff; + + const auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; + const auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; + const auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; + const auto* __restrict__ basis_yy_eval = task->d2bfyy + shoff; + const auto* __restrict__ basis_yz_eval = task->d2bfyz + shoff; + const auto* __restrict__ basis_zz_eval = task->d2bfzz + shoff; + + const auto* __restrict__ xmatS = task->xmatS + shoff; + const auto* __restrict__ xmatS_x = task->xmatS_x + shoff; + const auto* __restrict__ xmatS_y = task->xmatS_y + shoff; + const auto* __restrict__ xmatS_z = task->xmatS_z + shoff; + + const auto* __restrict__ xmatZ = task->xmatZ + shoff; + const auto* __restrict__ xmatZ_x = task->xmatZ_x + shoff; + const auto* __restrict__ xmatZ_y = task->xmatZ_y + shoff; + const auto* __restrict__ xmatZ_z = task->xmatZ_z + shoff; + + const auto* __restrict__ vrhop = task->vrho_pos; + const auto* __restrict__ vrhom = task->vrho_neg; + const auto* __restrict__ vtaup = task->vtau_pos; + const auto* __restrict__ vtaum = task->vtau_neg; + + const auto* __restrict__ vgamma_pp = task->vgamma_pp; + const auto* __restrict__ vgamma_pm = task->vgamma_pm; + const auto* __restrict__ vgamma_mm = task->vgamma_mm; + + const auto* __restrict__ dens_x = task->dden_sx; + const auto* __restrict__ dens_y = task->dden_sy; + const auto* __restrict__ dens_z = task->dden_sz; + + const auto* __restrict__ denz_x = task->dden_zx; + const auto* __restrict__ denz_y = task->dden_zy; + const auto* __restrict__ denz_z = task->dden_zz; + + #pragma unroll 1 + for( uint32_t ipt = threadIdx.x % cuda::warp_size; + ipt < npts; + ipt += cuda::warp_size ) { + + const double vrhop_i = vrhop[ipt]; + const double vrhom_i = vrhom[ipt]; + const double vrhoS_i = 0.5 * (vrhop_i + vrhom_i); + 
const double vrhoZ_i = 0.5 * (vrhop_i - vrhom_i); + + const double vtaup_i = 0.5 * vtaup[ipt]; + const double vtaum_i = 0.5 * vtaum[ipt]; + const double vtauS_i = 0.5 * (vtaup_i + vtaum_i); + const double vtauZ_i = 0.5 * (vtaup_i - vtaum_i); + + const double vgammapp_i = vgamma_pp[ipt]; + const double vgammapm_i = vgamma_pm[ipt]; + const double vgammamm_i = vgamma_mm[ipt]; + + const double denSx_i = dens_x[ipt]; + const double denSy_i = dens_y[ipt]; + const double denSz_i = dens_z[ipt]; + const double denZx_i = denz_x[ipt]; + const double denZy_i = denz_y[ipt]; + const double denZz_i = denz_z[ipt]; + + for( uint32_t ibf = 0; ibf < shsz; ++ibf ) { + const double xN = xmatS[ipt + ibf*npts]; + const double xZ = xmatZ[ipt + ibf*npts]; + const double dbfx_mu_i = basis_x_eval[ipt + ibf*npts]; + const double dbfy_mu_i = basis_y_eval[ipt + ibf*npts]; + const double dbfz_mu_i = basis_z_eval[ipt + ibf*npts]; + + g_acc_x_task += vrhoS_i * xN * dbfx_mu_i; + g_acc_y_task += vrhoS_i * xN * dbfy_mu_i; + g_acc_z_task += vrhoS_i * xN * dbfz_mu_i; + g_acc_x_task += vrhoZ_i * xZ * dbfx_mu_i; + g_acc_y_task += vrhoZ_i * xZ * dbfy_mu_i; + g_acc_z_task += vrhoZ_i * xZ * dbfz_mu_i; + + const double xNx = xmatS_x[ipt + ibf*npts]; + const double xNy = xmatS_y[ipt + ibf*npts]; + const double xNz = xmatS_z[ipt + ibf*npts]; + const double xZx = xmatZ_x[ipt + ibf*npts]; + const double xZy = xmatZ_y[ipt + ibf*npts]; + const double xZz = xmatZ_z[ipt + ibf*npts]; + + const double d11nn_xmat_term = denSx_i * xNx + denSy_i * xNy + denSz_i * xNz; + const double d11nz_xmat_term = denSx_i * xZx + denSy_i * xZy + denSz_i * xZz; + const double d11zn_xmat_term = denZx_i * xNx + denZy_i * xNy + denZz_i * xNz; + const double d11zz_xmat_term = denZx_i * xZx + denZy_i * xZy + denZz_i * xZz; + + const double d2bfxx = basis_xx_eval[ipt + ibf*npts]; + const double d2bfxy = basis_xy_eval[ipt + ibf*npts]; + const double d2bfxz = basis_xz_eval[ipt + ibf*npts]; + const double d2bfyy = basis_yy_eval[ipt + ibf*npts]; + const double d2bfyz = basis_yz_eval[ipt + ibf*npts]; + const double d2bfzz = basis_zz_eval[ipt + ibf*npts]; + + { + const double d2n_term_x = d2bfxx*denSx_i + d2bfxy*denSy_i + d2bfxz*denSz_i; + const double d2n_term_y = d2bfxy*denSx_i + d2bfyy*denSy_i + d2bfyz*denSz_i; + const double d2n_term_z = d2bfxz*denSx_i + d2bfyz*denSy_i + d2bfzz*denSz_i; + const double d2z_term_x = d2bfxx*denZx_i + d2bfxy*denZy_i + d2bfxz*denZz_i; + const double d2z_term_y = d2bfxy*denZx_i + d2bfyy*denZy_i + d2bfyz*denZz_i; + const double d2z_term_z = d2bfxz*denZx_i + d2bfyz*denZy_i + d2bfzz*denZz_i; + + g_acc_x_task += 0.5 * (vgammapp_i + vgammapm_i + vgammamm_i) * (d2n_term_x * xN + d11nn_xmat_term * dbfx_mu_i); + g_acc_x_task += 0.5 * (vgammapp_i - vgammamm_i) * (d2z_term_x * xN + d11zn_xmat_term * dbfx_mu_i); + g_acc_x_task += 0.5 * (vgammapp_i - vgammamm_i) * (d2n_term_x * xZ + d11nz_xmat_term * dbfx_mu_i); + g_acc_x_task += 0.5 * (vgammapp_i - vgammapm_i + vgammamm_i) * (d2z_term_x * xZ + d11zz_xmat_term * dbfx_mu_i); + + g_acc_y_task += 0.5 * (vgammapp_i + vgammapm_i + vgammamm_i) * (d2n_term_y * xN + d11nn_xmat_term * dbfy_mu_i); + g_acc_y_task += 0.5 * (vgammapp_i - vgammamm_i) * (d2z_term_y * xN + d11zn_xmat_term * dbfy_mu_i); + g_acc_y_task += 0.5 * (vgammapp_i - vgammamm_i) * (d2n_term_y * xZ + d11nz_xmat_term * dbfy_mu_i); + g_acc_y_task += 0.5 * (vgammapp_i - vgammapm_i + vgammamm_i) * (d2z_term_y * xZ + d11zz_xmat_term * dbfy_mu_i); + + g_acc_z_task += 0.5 * (vgammapp_i + vgammapm_i + vgammamm_i) * (d2n_term_z * xN + d11nn_xmat_term * 
dbfz_mu_i); + g_acc_z_task += 0.5 * (vgammapp_i - vgammamm_i) * (d2z_term_z * xN + d11zn_xmat_term * dbfz_mu_i); + g_acc_z_task += 0.5 * (vgammapp_i - vgammamm_i) * (d2n_term_z * xZ + d11nz_xmat_term * dbfz_mu_i); + g_acc_z_task += 0.5 * (vgammapp_i - vgammapm_i + vgammamm_i) * (d2z_term_z * xZ + d11zz_xmat_term * dbfz_mu_i); + } + + { + const double d2n_term_x = d2bfxx*xNx + d2bfxy*xNy + d2bfxz*xNz; + const double d2n_term_y = d2bfxy*xNx + d2bfyy*xNy + d2bfyz*xNz; + const double d2n_term_z = d2bfxz*xNx + d2bfyz*xNy + d2bfzz*xNz; + const double d2z_term_x = d2bfxx*xZx + d2bfxy*xZy + d2bfxz*xZz; + const double d2z_term_y = d2bfxy*xZx + d2bfyy*xZy + d2bfyz*xZz; + const double d2z_term_z = d2bfxz*xZx + d2bfyz*xZy + d2bfzz*xZz; + g_acc_x_task += vtauS_i * d2n_term_x; + g_acc_y_task += vtauS_i * d2n_term_y; + g_acc_z_task += vtauS_i * d2n_term_z; + + g_acc_x_task += vtauZ_i * d2z_term_x; + g_acc_y_task += vtauZ_i * d2z_term_y; + g_acc_z_task += vtauZ_i * d2z_term_z; + } + }// Loop over bfns within a shell + + } // Loop over points + + g_acc_x += g_acc_x_task; + g_acc_y += g_acc_y_task; + g_acc_z += g_acc_z_task; + + //write to Parent atom with translational invariance + if constexpr( with_weight_derivatives ) { + atomicAdd( EXC_GRAD + 3*iParent + 0, 2.0 * g_acc_x_task ); + atomicAdd( EXC_GRAD + 3*iParent + 1, 2.0 * g_acc_y_task ); + atomicAdd( EXC_GRAD + 3*iParent + 2, 2.0 * g_acc_z_task ); + } + + } // Loop over tasks assigned to shell + + constexpr auto warp_size = cuda::warp_size; + g_acc_x = -2. * cuda::warp_reduce_sum( g_acc_x ); + g_acc_y = -2. * cuda::warp_reduce_sum( g_acc_y ); + g_acc_z = -2. * cuda::warp_reduce_sum( g_acc_z ); + + if( (threadIdx.x % cuda::warp_size) == 0 ) { atomicAdd( EXC_GRAD + 3*iCen + 0, g_acc_x ); atomicAdd( EXC_GRAD + 3*iCen + 1, g_acc_y ); atomicAdd( EXC_GRAD + 3*iCen + 2, g_acc_z ); @@ -234,15 +947,40 @@ __global__ __launch_bounds__(512,1) void increment_exc_grad_gga_kernel( } -void increment_exc_grad_gga( size_t nshell, ShellToTaskDevice* shell_to_task, - XCDeviceTask* device_tasks, double* EXC_GRAD, device_queue queue ) { +void increment_exc_grad_mgga( integrator_ks_scheme ks_scheme, size_t nshell, bool need_lapl, + ShellToTaskDevice* shell_to_task, XCDeviceTask* device_tasks, + double* EXC_GRAD, bool with_weight_derivatives, device_queue queue ) { + + if(need_lapl) GAUXC_GENERIC_EXCEPTION("CUDA + MGGA/LAPL EXC GRAD NYI"); cudaStream_t stream = queue.queue_as(); dim3 threads(512), blocks(1,1,nshell); - increment_exc_grad_gga_kernel<<>>( - nshell, shell_to_task, device_tasks, EXC_GRAD - ); + switch(ks_scheme) { + case RKS: + if (with_weight_derivatives) { + increment_exc_grad_mgga_rks_kernel<<>>( + nshell, shell_to_task, device_tasks, EXC_GRAD + ); + } else { + increment_exc_grad_mgga_rks_kernel<<>>( + nshell, shell_to_task, device_tasks, EXC_GRAD + ); + } + break; + case UKS: + if (with_weight_derivatives) { + increment_exc_grad_mgga_uks_kernel<<>>( + nshell, shell_to_task, device_tasks, EXC_GRAD + ); + } else { + increment_exc_grad_mgga_uks_kernel<<>>( + nshell, shell_to_task, device_tasks, EXC_GRAD + ); + } + break; + default: GAUXC_GENERIC_EXCEPTION("GGA EXC GRAD + GKS NYI"); + } } } diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/pack_submat.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/pack_submat.cu index 4cb165a8..a000efe7 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/pack_submat.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/pack_submat.cu @@ -1,7 +1,11 @@ /** * GauXC 
Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/symmetrize_mat.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/symmetrize_mat.cu index 6a5cdfa0..91ccb294 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/symmetrize_mat.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/symmetrize_mat.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars.cu index 6aea5225..3c5f2020 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars.cu @@ -1,422 +1,26 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ #include "device/common/uvvars.hpp" #include "cuda_extensions.hpp" -#include "device_specific/cuda_device_constants.hpp" #include -#include "device_specific/cuda_util.hpp" -#include "device/xc_device_data.hpp" - -namespace GauXC { - -#define VVAR_KERNEL_SM_BLOCK 32 -#define GGA_KERNEL_SM_WARPS 16 -#define MGGA_KERNEL_SM_BLOCK 32 - -__global__ void eval_uvars_lda_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { - // eval_vvars populated uvar storage already in the case of LDA+RKS - return; -} - -__global__ void eval_uvars_lda_uks_kernel( size_t ntasks, - XCDeviceTask* tasks_device ) { - - const int batch_idx = blockIdx.z; - if( batch_idx >= ntasks ) return; - - auto& task = tasks_device[ batch_idx ]; - - const auto npts = task.npts; - - auto* den_pos_eval_device = task.den_s; - auto* den_neg_eval_device = task.den_z; - - - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - - - if( tid < npts ) { - const auto ps = den_pos_eval_device[ tid ]; - const auto pz = den_neg_eval_device[ tid ]; - den_pos_eval_device[ tid ] = 0.5*(ps + pz); - den_neg_eval_device[ tid ] = 0.5*(ps - pz); - - } -} - -__global__ void eval_uvars_lda_gks_kernel( size_t ntasks, - XCDeviceTask* tasks_device ) { - - const int batch_idx = blockIdx.z; - if( batch_idx >= ntasks ) return; - - auto& task = tasks_device[ batch_idx ]; - - const auto npts = task.npts; - - auto* den_z_eval_device = task.den_s; - auto* den_s_eval_device = task.den_z; - auto* den_y_eval_device = task.den_y; - auto* den_x_eval_device = task.den_x; - auto* K_z_eval_device = task.K_z; - auto* K_y_eval_device = task.K_y; - auto* K_x_eval_device = task.K_x; - const double dtolsq = 1e-24; // TODO: make variable - - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - - - if( tid < npts ) { - const auto ps = den_s_eval_device[ tid ]; - const auto pz = den_z_eval_device[ tid ]; - const auto py = den_y_eval_device[ tid ]; - const auto px = den_x_eval_device[ tid ]; - const auto mtemp = pz*pz + px*px + py*py; - double mnorm = 0.; - - if (mtemp > dtolsq) { - const double inv_mnorm = rsqrt(mtemp); - mnorm = 1./inv_mnorm; - K_z_eval_device[ tid ] = pz * inv_mnorm; - K_y_eval_device[ tid ] = py * inv_mnorm; - K_x_eval_device[ tid ] = px * inv_mnorm; - } - else { - mnorm = (1. / 3.) * (px + py + pz); - K_z_eval_device[ tid ] = 1. / 3.; - K_y_eval_device[ tid ] = 1. / 3.; - K_x_eval_device[ tid ] = 1. 
/ 3.; - } - - den_s_eval_device[ tid ] = 0.5*(ps + mnorm); - den_z_eval_device[ tid ] = 0.5*(ps - mnorm); - - } -} - - -__global__ void eval_uvars_gga_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { - const int batch_idx = blockIdx.z; - if( batch_idx >= ntasks ) return; - - const auto& task = tasks_device[ batch_idx ]; - const auto npts = task.npts; - - const auto* dden_sx_eval_device = task.dden_sx; - const auto* dden_sy_eval_device = task.dden_sy; - const auto* dden_sz_eval_device = task.dden_sz; - auto* gamma_eval_device = task.gamma; - - const int tid = threadIdx.x + blockIdx.x * blockDim.x; - - if( tid < npts ) { - const double dx = dden_sx_eval_device[ tid ]; - const double dy = dden_sy_eval_device[ tid ]; - const double dz = dden_sz_eval_device[ tid ]; - - gamma_eval_device[ tid ] = dx*dx + dy*dy + dz*dz; - - } - -} - -__global__ void eval_uvars_gga_uks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { - - const int batch_idx = blockIdx.z; - if( batch_idx >= ntasks ) return; - - const auto& task = tasks_device[ batch_idx ]; - const auto npts = task.npts; - - auto* den_pos_eval_device = task.den_s; - const auto* den_pos_x_eval_device = task.dden_sx; - const auto* den_pos_y_eval_device = task.dden_sy; - const auto* den_pos_z_eval_device = task.dden_sz; - - auto* den_neg_eval_device = task.den_z; - const auto* den_neg_x_eval_device = task.dden_zx; - const auto* den_neg_y_eval_device = task.dden_zy; - const auto* den_neg_z_eval_device = task.dden_zz; - - auto* gamma_pp_eval_device = task.gamma_pp; - auto* gamma_pm_eval_device = task.gamma_pm; - auto* gamma_mm_eval_device = task.gamma_mm; - - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - - if( tid < npts ) { - const double ps = den_pos_eval_device[ tid ]; - const double pz = den_neg_eval_device[ tid ]; - const double dndx = den_pos_x_eval_device[ tid ]; - const double dndy = den_pos_y_eval_device[ tid ]; - const double dndz = den_pos_z_eval_device[ tid ]; - const double dMzdx = den_neg_x_eval_device[ tid ]; - const double dMzdy = den_neg_y_eval_device[ tid ]; - const double dMzdz = den_neg_z_eval_device[ tid ]; - - // (del n).(del n) - const auto dn_sq = dndx*dndx + dndy*dndy + dndz*dndz; - // (del Mz).(del Mz) - const auto dMz_sq = dMzdx*dMzdx + dMzdy*dMzdy + dMzdz*dMzdz; - // (del n).(del Mz) - const auto dn_dMz = dndx*dMzdx + dndy*dMzdy + dndz*dMzdz; - - gamma_pp_eval_device[ tid ] = 0.25*(dn_sq + dMz_sq) + 0.5*dn_dMz; - gamma_pm_eval_device[ tid ] = 0.25*(dn_sq - dMz_sq); - gamma_mm_eval_device[ tid ] = 0.25*(dn_sq + dMz_sq) - 0.5*dn_dMz; - - den_pos_eval_device[ tid ] = 0.5*(ps + pz); - den_neg_eval_device[ tid ] = 0.5*(ps - pz); - } - -} - -__global__ void eval_uvars_gga_gks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { - - const int batch_idx = blockIdx.z; - if( batch_idx >= ntasks ) return; - - const auto& task = tasks_device[ batch_idx ]; - const auto npts = task.npts; - - auto* den_s_eval_device = task.den_s; - const auto* dden_sx_eval_device = task.dden_sx; - const auto* dden_sy_eval_device = task.dden_sy; - const auto* dden_sz_eval_device = task.dden_sz; - - auto* den_z_eval_device = task.den_z; - const auto* dden_zx_eval_device = task.dden_zx; - const auto* dden_zy_eval_device = task.dden_zy; - const auto* dden_zz_eval_device = task.dden_zz; - - const auto* den_y_eval_device = task.den_y; - const auto* dden_yx_eval_device = task.dden_yx; - const auto* dden_yy_eval_device = task.dden_yy; - const auto* dden_yz_eval_device = task.dden_yz; - - const auto* den_x_eval_device = task.den_x; - const 
auto* dden_xx_eval_device = task.dden_xx; - const auto* dden_xy_eval_device = task.dden_xy; - const auto* dden_xz_eval_device = task.dden_xz; - - auto* gamma_pp_eval_device = task.gamma_pp; - auto* gamma_pm_eval_device = task.gamma_pm; - auto* gamma_mm_eval_device = task.gamma_mm; - - auto* H_z_eval_device = task.H_z; - auto* H_y_eval_device = task.H_y; - auto* H_x_eval_device = task.H_x; - auto* K_z_eval_device = task.K_z; - auto* K_y_eval_device = task.K_y; - auto* K_x_eval_device = task.K_x; - - const double dtolsq = 1e-24; // TODO: make variable - - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - - if( tid < npts ) { - const double dndz = dden_sz_eval_device[ tid ]; - const double dndy = dden_sy_eval_device[ tid ]; - const double dndx = dden_sx_eval_device[ tid ]; - - const double dMzdz = dden_zz_eval_device[ tid ]; - const double dMzdy = dden_zy_eval_device[ tid ]; - const double dMzdx = dden_zx_eval_device[ tid ]; - - const double dMydz = dden_yz_eval_device[ tid ]; - const double dMydy = dden_yy_eval_device[ tid ]; - const double dMydx = dden_yx_eval_device[ tid ]; - - const double dMxdz = dden_xz_eval_device[ tid ]; - const double dMxdy = dden_xy_eval_device[ tid ]; - const double dMxdx = dden_xx_eval_device[ tid ]; - - const auto ps = den_s_eval_device[ tid ]; - const auto pz = den_z_eval_device[ tid ]; - const auto py = den_y_eval_device[ tid ]; - const auto px = den_x_eval_device[ tid ]; - - const auto mtemp = pz*pz + px*px + py*py; - double mnorm = 0.; - - const auto dels_dot_dels = dndx * dndx + dndy * dndy + dndz * dndz; - const auto delz_dot_delz = dMzdx * dMzdx + dMzdy * dMzdy + dMzdz * dMzdz; - const auto delx_dot_delx = dMxdx * dMxdx + dMxdy * dMxdy + dMxdz * dMxdz; - const auto dely_dot_dely = dMydx * dMydx + dMydy * dMydy + dMydz * dMydz; - - const auto dels_dot_delz = dndx * dMzdx + dndy * dMzdy + dndz * dMzdz; - const auto dels_dot_delx = dndx * dMxdx + dndy * dMxdy + dndz * dMxdz; - const auto dels_dot_dely = dndx * dMydx + dndy * dMydy + dndz * dMydz; - - const auto sum = delz_dot_delz + delx_dot_delx + dely_dot_dely; - const auto s_sum = - dels_dot_delz * pz + dels_dot_delx * px + dels_dot_dely * py; - - const auto inv_sqsum2 = - rsqrt(dels_dot_delz * dels_dot_delz + dels_dot_delx * dels_dot_delx + - dels_dot_dely * dels_dot_dely); - const auto sqsum2 = 1./inv_sqsum2; - - double sign = 1.; - if( signbit(s_sum)) - sign = -1.; - - - if (mtemp > dtolsq) { - const double inv_mnorm = rsqrt(mtemp); - mnorm = 1./inv_mnorm; - K_z_eval_device[ tid ] = pz * inv_mnorm; - K_y_eval_device[ tid ] = py * inv_mnorm; - K_x_eval_device[ tid ] = px * inv_mnorm; - H_z_eval_device[ tid ] = sign * dels_dot_delz * inv_sqsum2; - H_y_eval_device[ tid ] = sign * dels_dot_dely * inv_sqsum2; - H_x_eval_device[ tid ] = sign * dels_dot_delx * inv_sqsum2; - } - else { - mnorm = (1. / 3.) * (px + py + pz); - K_z_eval_device[ tid ] = 1. / 3.; - K_y_eval_device[ tid ] = 1. / 3.; - K_x_eval_device[ tid ] = 1. 
/ 3.; - - H_z_eval_device[ tid ] = sign / 3.; - H_y_eval_device[ tid ] = sign / 3.; - H_x_eval_device[ tid ] = sign / 3.; - } - - gamma_pp_eval_device[ tid ] = 0.25*(dels_dot_dels + sum) + 0.5*sign*sqsum2; - gamma_pm_eval_device[ tid ] = 0.25*(dels_dot_dels - sum); - gamma_mm_eval_device[ tid ] = 0.25*(dels_dot_dels + sum) - 0.5*sign*sqsum2; - - den_s_eval_device[ tid ] = 0.5*(ps + mnorm); - den_z_eval_device[ tid ] = 0.5*(ps - mnorm); - - } - -} - -template -__global__ void eval_uvars_mgga_rks_kernel( size_t ntasks, - XCDeviceTask* tasks_device ) { - - constexpr auto warp_size = cuda::warp_size; - //constexpr auto max_warps_per_thread_block = cuda::max_warps_per_thread_block; - - const int batch_idx = blockIdx.z; - if( batch_idx >= ntasks ) return; - - auto& task = tasks_device[ batch_idx ]; - - const auto npts = task.npts; - const auto nbf = task.bfn_screening.nbe; - - auto* tau_eval_device = task.tau; - decltype(tau_eval_device) lapl_eval_device = nullptr; - if constexpr (need_lapl) { - lapl_eval_device = task.denlapl; - } - - //const auto* basis_eval_device = task.bf; - const auto* dbasis_x_eval_device = task.dbfx; - const auto* dbasis_y_eval_device = task.dbfy; - const auto* dbasis_z_eval_device = task.dbfz; - decltype(dbasis_x_eval_device) basis_lapl_eval_device = nullptr; - if constexpr (need_lapl) { - basis_lapl_eval_device = task.d2bflapl; - } - - //const auto* den_basis_prod_device = task.zmat; - const auto* den_basis_dx_prod_device = task.xmat_x; - const auto* den_basis_dy_prod_device = task.xmat_y; - const auto* den_basis_dz_prod_device = task.xmat_z; - decltype(den_basis_dx_prod_device) den_basis_prod_device = nullptr; - if constexpr (need_lapl) { - den_basis_prod_device = task.zmat; - } - - __shared__ double den_shared[3+!!need_lapl][warp_size][MGGA_KERNEL_SM_BLOCK+1]; - - for ( int bid_x = blockIdx.x * blockDim.x; - bid_x < nbf; - bid_x += blockDim.x * gridDim.x ) { - - for ( int bid_y = blockIdx.y * MGGA_KERNEL_SM_BLOCK; - bid_y < npts; - bid_y += MGGA_KERNEL_SM_BLOCK * gridDim.y ) { - - for (int sm_y = threadIdx.y; sm_y < MGGA_KERNEL_SM_BLOCK; sm_y += blockDim.y) { - den_shared[0][threadIdx.x][sm_y] = 0.; - den_shared[1][threadIdx.x][sm_y] = 0.; - den_shared[2][threadIdx.x][sm_y] = 0.; - if constexpr (need_lapl) - den_shared[3][threadIdx.x][sm_y] = 0.; - - if (bid_y + threadIdx.x < npts and bid_x + sm_y < nbf) { - const double* db_x_col = den_basis_dx_prod_device + (bid_x + sm_y)*npts; - const double* db_y_col = den_basis_dy_prod_device + (bid_x + sm_y)*npts; - const double* db_z_col = den_basis_dz_prod_device + (bid_x + sm_y)*npts; - - const double* bf_x_col = dbasis_x_eval_device + (bid_x + sm_y)*npts; - const double* bf_y_col = dbasis_y_eval_device + (bid_x + sm_y)*npts; - const double* bf_z_col = dbasis_z_eval_device + (bid_x + sm_y)*npts; - - - den_shared[0][threadIdx.x][sm_y] = bf_x_col[ bid_y + threadIdx.x ] * db_x_col[ bid_y + threadIdx.x ]; - den_shared[1][threadIdx.x][sm_y] = bf_y_col[ bid_y + threadIdx.x ] * db_y_col[ bid_y + threadIdx.x ]; - den_shared[2][threadIdx.x][sm_y] = bf_z_col[ bid_y + threadIdx.x ] * db_z_col[ bid_y + threadIdx.x ]; - - - if constexpr (need_lapl) { - const double* db_col = den_basis_prod_device + (bid_x + sm_y)*npts; - const double* bf_l_col = basis_lapl_eval_device + (bid_x + sm_y)*npts; - den_shared[3][threadIdx.x][sm_y] = bf_l_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; - } - } - } - __syncthreads(); - - - for (int sm_y = threadIdx.y; sm_y < MGGA_KERNEL_SM_BLOCK; sm_y += blockDim.y) { - const int tid_y = bid_y + 
sm_y; - - register double tx_reg = den_shared[0][sm_y][threadIdx.x]; - register double ty_reg = den_shared[1][sm_y][threadIdx.x]; - register double tz_reg = den_shared[2][sm_y][threadIdx.x]; - // Warp blocks are stored col major - register double tau_reg = 0.0; - tau_reg = 0.5 * cuda::warp_reduce_sum( tx_reg ); - tau_reg += 0.5 * cuda::warp_reduce_sum( ty_reg ); - tau_reg += 0.5 * cuda::warp_reduce_sum( tz_reg ); - - register double lapl_reg = 0.0; - if constexpr (need_lapl) { - lapl_reg = den_shared[3][sm_y][threadIdx.x]; - lapl_reg = cuda::warp_reduce_sum(lapl_reg); - lapl_reg = 2. * lapl_reg + 4. * tau_reg; - } - if( threadIdx.x == 0 and tid_y < npts ) { - atomicAdd( tau_eval_device + tid_y, tau_reg ); - if constexpr (need_lapl) { - atomicAdd( lapl_eval_device + tid_y, lapl_reg ); - } - } - } - __syncthreads(); - } - } -} +#include "uvvars_lda.hpp" +#include "uvvars_gga.hpp" +#include "uvvars_mgga.hpp" +namespace GauXC { #define EVAL_UVARS_KERNEL(xc_approx) \ cudaStream_t stream = queue.queue_as(); \ - dim3 blocks( util::div_ceil( npts_max, threads.x ), \ - 1, \ - ntasks ); \ switch ( ks_scheme ) { \ case RKS: \ eval_uvars_##xc_approx##_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); \ @@ -428,264 +32,197 @@ __global__ void eval_uvars_mgga_rks_kernel( size_t ntasks, eval_uvars_##xc_approx##_gks_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); \ break; \ default: \ - GAUXC_GENERIC_EXCEPTION( "Unexpected KS scheme when attempting to evaluate UV vars" ); \ + GAUXC_GENERIC_EXCEPTION( "Unexpected KS scheme when attempting to evaluate U vars" ); \ } + +#define EVAL_TMAT_KERNEL(xc_approx) \ + cudaStream_t stream = queue.queue_as(); \ + switch ( ks_scheme ) { \ + case RKS: \ + eval_tmat_##xc_approx##_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks); \ + break; \ + case UKS: \ + eval_tmat_##xc_approx##_uks_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks); \ + break; \ + case GKS: \ + GAUXC_GENERIC_EXCEPTION( "GKS + evaluate trial U vars NYI" ); \ + break; \ + default: \ + GAUXC_GENERIC_EXCEPTION( "Unexpected KS scheme when attempting to evaluate U vars" ); \ + } + + +#define EVAL_VVARS_KERNEL(xc_approx) \ + cudaStream_t stream = queue.queue_as(); \ + switch ( den_select ) { \ + case DEN_S: \ + eval_vvar_##xc_approx##_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); \ + break; \ + case DEN_Z: \ + eval_vvar_##xc_approx##_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); \ + break; \ + case DEN_Y: \ + eval_vvar_##xc_approx##_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); \ + break; \ + case DEN_X: \ + eval_vvar_##xc_approx##_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); \ + break; \ + default: \ + GAUXC_GENERIC_EXCEPTION( "Unexpected KS scheme when attempting to evaluate V vars" ); \ + } + +// Internal implementation with trial parameter +void eval_tmat_lda( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme, + XCDeviceTask* device_tasks, device_queue queue ) { + dim3 threads( cuda::max_warps_per_thread_block * cuda::warp_size, 1, 1 ); + dim3 blocks( util::div_ceil( npts_max, threads.x ), 1, ntasks ); + EVAL_TMAT_KERNEL(lda); +} + void eval_uvars_lda( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme, XCDeviceTask* device_tasks, device_queue queue ) { dim3 threads( cuda::max_warps_per_thread_block * cuda::warp_size, 1, 1 ); + dim3 blocks( util::div_ceil( npts_max, threads.x ), 1, ntasks ); EVAL_UVARS_KERNEL(lda); } +// Internal 
implementation with trial as template parameter +template +void eval_vvars_lda_impl( size_t ntasks, int32_t nbf_max, int32_t npts_max, density_id den_select, + XCDeviceTask* device_tasks, device_queue queue ) { + dim3 threads( cuda::warp_size, cuda::max_warps_per_thread_block, 1 ); + dim3 blocks( util::div_ceil( nbf_max, threads.x ), + util::div_ceil( npts_max, threads.y ), + ntasks ); + EVAL_VVARS_KERNEL(lda); +} +void eval_vvars_lda( size_t ntasks, int32_t nbf_max, int32_t npts_max, density_id den_select, + XCDeviceTask* device_tasks, device_queue queue ) { + eval_vvars_lda_impl(ntasks, nbf_max, npts_max, den_select, device_tasks, queue); +} +void eval_vvars_lda_trial( size_t ntasks, int32_t nbf_max, int32_t npts_max, density_id den_select, + XCDeviceTask* device_tasks, device_queue queue ) { + eval_vvars_lda_impl(ntasks, nbf_max, npts_max, den_select, device_tasks, queue); +} - +// Internal implementation with trial parameter +void eval_tmat_gga( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme, + XCDeviceTask* device_tasks, device_queue queue ) { + dim3 threads( GGA_KERNEL_SM_WARPS * cuda::warp_size, 1, 1 ); + dim3 blocks( util::div_ceil( npts_max, threads.x ), 1, ntasks ); + EVAL_TMAT_KERNEL(gga); +} void eval_uvars_gga( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme, XCDeviceTask* device_tasks, device_queue queue ) { dim3 threads( GGA_KERNEL_SM_WARPS * cuda::warp_size, 1, 1 ); + dim3 blocks( util::div_ceil( npts_max, threads.x ), 1, ntasks ); EVAL_UVARS_KERNEL(gga); } - -void eval_uvars_mgga( size_t ntasks, size_t npts_total, int32_t nbf_max, - int32_t npts_max, bool do_lapl, XCDeviceTask* device_tasks, - device_queue queue ) { - // TODO: This interface should be unified with the lda/gga interfaces - cudaStream_t stream = queue.queue_as(); - - // U Variables - { - dim3 threads( cuda::warp_size, cuda::max_warps_per_thread_block / 2, 1 ); - dim3 blocks( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )), - std::min(uint64_t(16), util::div_ceil( nbf_max, 16 )), +// Internal implementation with trial as template parameter +template +void eval_vvars_gga_impl( size_t ntasks, int32_t nbf_max, int32_t npts_max, density_id den_select, + XCDeviceTask* device_tasks, device_queue queue ) { + dim3 threads( cuda::warp_size, cuda::max_warps_per_thread_block, 1 ); + dim3 blocks( util::div_ceil( nbf_max, threads.x ), + util::div_ceil( npts_max, threads.y ), ntasks ); - if(do_lapl) - eval_uvars_mgga_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); - else - eval_uvars_mgga_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); - } - - // V variables (GAMMA) - dim3 threads( cuda::max_threads_per_thread_block ); - dim3 blocks( util::div_ceil( npts_total, threads.x ), - 1, - ntasks ); - eval_uvars_gga_rks_kernel <<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); + EVAL_VVARS_KERNEL(gga); +} +void eval_vvars_gga( size_t ntasks, int32_t nbf_max, int32_t npts_max, density_id den_select, + XCDeviceTask* device_tasks, device_queue queue ) { + eval_vvars_gga_impl(ntasks, nbf_max, npts_max, den_select, device_tasks, queue); +} +void eval_vvars_gga_trial( size_t ntasks, int32_t nbf_max, int32_t npts_max, density_id den_select, + XCDeviceTask* device_tasks, device_queue queue ) { + eval_vvars_gga_impl(ntasks, nbf_max, npts_max, den_select, device_tasks, queue); } +// Internal implementation with trial parameter +void eval_tmat_mgga( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme, + bool need_lapl, XCDeviceTask* 
device_tasks, device_queue queue ) { + cudaStream_t stream = queue.queue_as(); + dim3 threads( GGA_KERNEL_SM_WARPS * cuda::warp_size, 1, 1 ); + dim3 blocks( util::div_ceil( npts_max, threads.x ), 1, ntasks ); - - - - -template -__global__ void eval_vvar_grad_kern( size_t ntasks, - XCDeviceTask* tasks_device ) { - - const int batch_idx = blockIdx.z; - if( batch_idx >= ntasks ) return; - - auto& task = tasks_device[ batch_idx ]; - - const auto npts = task.npts; - const auto nbf = task.bfn_screening.nbe; - - double* den_eval_device = nullptr; - double* den_x_eval_device = nullptr; - double* den_y_eval_device = nullptr; - double* den_z_eval_device = nullptr; - - constexpr auto warp_size = cuda::warp_size; - - if constexpr (den_select == DEN_S) { - den_eval_device = task.den_s; - den_x_eval_device = task.dden_sx; - den_y_eval_device = task.dden_sy; - den_z_eval_device = task.dden_sz; + if(need_lapl) { + GAUXC_GENERIC_EXCEPTION("MGGA + LAPL + eval tmat NYI"); } - if constexpr (den_select == DEN_Z) { - den_eval_device = task.den_z; - den_x_eval_device = task.dden_zx; - den_y_eval_device = task.dden_zy; - den_z_eval_device = task.dden_zz; - } - if constexpr (den_select == DEN_Y) { - den_eval_device = task.den_y; - den_x_eval_device = task.dden_yx; - den_y_eval_device = task.dden_yy; - den_z_eval_device = task.dden_yz; - } - if constexpr (den_select == DEN_X) { - den_eval_device = task.den_x; - den_x_eval_device = task.dden_xx; - den_y_eval_device = task.dden_xy; - den_z_eval_device = task.dden_xz; - } - - const auto* basis_eval_device = task.bf; - const auto* dbasis_x_eval_device = task.dbfx; - const auto* dbasis_y_eval_device = task.dbfy; - const auto* dbasis_z_eval_device = task.dbfz; - - const auto* den_basis_prod_device = task.zmat; - - __shared__ double den_shared[4][warp_size][VVAR_KERNEL_SM_BLOCK+1]; - - for ( int bid_x = blockIdx.x * blockDim.x; - bid_x < nbf; - bid_x += blockDim.x * gridDim.x ) { - - for ( int bid_y = blockIdx.y * VVAR_KERNEL_SM_BLOCK; - bid_y < npts; - bid_y += VVAR_KERNEL_SM_BLOCK * gridDim.y ) { - - for (int sm_y = threadIdx.y; sm_y < VVAR_KERNEL_SM_BLOCK; sm_y += blockDim.y) { - den_shared[0][threadIdx.x][sm_y] = 0.; - den_shared[1][threadIdx.x][sm_y] = 0.; - den_shared[2][threadIdx.x][sm_y] = 0.; - den_shared[3][threadIdx.x][sm_y] = 0.; - - if (bid_y + threadIdx.x < npts and bid_x + sm_y < nbf) { - const double* db_col = den_basis_prod_device + (bid_x + sm_y)*npts; - const double* bf_col = basis_eval_device + (bid_x + sm_y)*npts; - const double* bf_x_col = dbasis_x_eval_device + (bid_x + sm_y)*npts; - const double* bf_y_col = dbasis_y_eval_device + (bid_x + sm_y)*npts; - const double* bf_z_col = dbasis_z_eval_device + (bid_x + sm_y)*npts; - - den_shared[0][threadIdx.x][sm_y] = bf_col [ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; - den_shared[1][threadIdx.x][sm_y] = bf_x_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; - den_shared[2][threadIdx.x][sm_y] = bf_y_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; - den_shared[3][threadIdx.x][sm_y] = bf_z_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; - } - } - __syncthreads(); - - - for (int sm_y = threadIdx.y; sm_y < VVAR_KERNEL_SM_BLOCK; sm_y += blockDim.y) { - const int tid_y = bid_y + sm_y; - register double den_reg = den_shared[0][sm_y][threadIdx.x]; - register double dx_reg = den_shared[1][sm_y][threadIdx.x]; - register double dy_reg = den_shared[2][sm_y][threadIdx.x]; - register double dz_reg = den_shared[3][sm_y][threadIdx.x]; - - // Warp blocks are stored col major - 
den_reg = cuda::warp_reduce_sum( den_reg ); - dx_reg = 2. * cuda::warp_reduce_sum( dx_reg ); - dy_reg = 2. * cuda::warp_reduce_sum( dy_reg ); - dz_reg = 2. * cuda::warp_reduce_sum( dz_reg ); - - - if( threadIdx.x == 0 and tid_y < npts ) { - atomicAdd( den_eval_device + tid_y, den_reg ); - atomicAdd( den_x_eval_device + tid_y, dx_reg ); - atomicAdd( den_y_eval_device + tid_y, dy_reg ); - atomicAdd( den_z_eval_device + tid_y, dz_reg ); - } - } - __syncthreads(); - } + if(ks_scheme == RKS) { + eval_tmat_mgga_rks_kernel<<>>(ntasks, device_tasks); + } else if(ks_scheme == UKS) { + eval_tmat_mgga_uks_kernel<<>>(ntasks, device_tasks); + } else { + GAUXC_GENERIC_EXCEPTION("GKS + MGGA + DEVICE NYI"); } - } +void eval_uvars_mgga( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme, + bool need_lapl, XCDeviceTask* device_tasks, device_queue queue ) { + cudaStream_t stream = queue.queue_as(); -template -__global__ void eval_vvar_kern( size_t ntasks, - XCDeviceTask* tasks_device ) { - - const int batch_idx = blockIdx.z; - if( batch_idx >= ntasks ) return; - - auto& task = tasks_device[ batch_idx ]; - - const auto npts = task.npts; - const auto nbf = task.bfn_screening.nbe; - - double* den_eval_device = nullptr; - // use the "U" variable (+/- for UKS) even though at this point the density (S/Z) is stored - if constexpr (den_select == DEN_S) den_eval_device = task.den_s; - if constexpr (den_select == DEN_Z) den_eval_device = task.den_z; - if constexpr (den_select == DEN_Y) den_eval_device = task.den_y; - if constexpr (den_select == DEN_X) den_eval_device = task.den_x; - - const auto* basis_eval_device = task.bf; - - const auto* den_basis_prod_device = task.zmat; + // Evaluate GAMMA + eval_uvars_gga(ntasks, npts_max, ks_scheme, device_tasks, queue); - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - register double den_reg = 0.; - - if( tid_x < nbf and tid_y < npts ) { - - const double* bf_col = basis_eval_device + tid_x*npts; - const double* db_col = den_basis_prod_device + tid_x*npts; - - den_reg = bf_col[ tid_y ] * db_col[ tid_y ]; - - } - - // Warp blocks are stored col major - constexpr auto warp_size = cuda::warp_size; - //constexpr auto max_warps_per_thread_block = cuda::max_warps_per_thread_block; - den_reg = cuda::warp_reduce_sum( den_reg ); - - - if( threadIdx.x == 0 and tid_y < npts ) { - atomicAdd( den_eval_device + tid_y, den_reg ); + if(ks_scheme == RKS) { + return; // Nothing left to do + } else if(ks_scheme == UKS) { + dim3 threads( cuda::max_warps_per_thread_block * cuda::warp_size, 1, 1 ); + dim3 blocks( util::div_ceil( npts_max, threads.x ), 1, ntasks ); + if(need_lapl) { + eval_uvars_mgga_uks_kernel<<>>(ntasks, device_tasks); + } else { + eval_uvars_mgga_uks_kernel<<>>(ntasks, device_tasks); + } + } else { + GAUXC_GENERIC_EXCEPTION("GKS + MGGA + DEVICE NYI"); } - } +// Internal implementation with trial as template parameter +template +void eval_vvars_mgga_impl( size_t ntasks, int32_t nbf_max, int32_t npts_max, density_id den_select, + bool need_lapl, XCDeviceTask* device_tasks, device_queue queue ) { + // First evaluate GGA variables + eval_vvars_gga_impl(ntasks, nbf_max, npts_max, den_select, device_tasks, queue); - - -void eval_vvar( size_t ntasks, int32_t nbf_max, int32_t npts_max, bool do_grad, density_id den_select, - XCDeviceTask* device_tasks, device_queue queue ) { + dim3 threads( cuda::warp_size, cuda::max_warps_per_thread_block, 1 ); + dim3 blocks( util::div_ceil( nbf_max, threads.x ), + 
util::div_ceil( npts_max, threads.y ), + ntasks ); cudaStream_t stream = queue.queue_as(); - dim3 threads; - dim3 blocks; - if( do_grad ) { - threads = dim3( cuda::warp_size, cuda::max_warps_per_thread_block / 2, 1 ); - blocks = dim3( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )), - std::min(uint64_t(16), util::div_ceil( nbf_max, 16 )), - ntasks ); - } else { - threads = dim3( cuda::warp_size, cuda::max_warps_per_thread_block, 1 ); - blocks = dim3( util::div_ceil( nbf_max, threads.x ), - util::div_ceil( npts_max, threads.y ), - ntasks ); - } - switch( den_select ) { - case DEN_S: - if (do_grad) eval_vvar_grad_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); - else eval_vvar_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); - break; - case DEN_Z: - if (do_grad) eval_vvar_grad_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); - else eval_vvar_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); - break; - case DEN_Y: - if (do_grad) eval_vvar_grad_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); - else eval_vvar_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); + switch ( den_select ) { + case DEN_S: + if (need_lapl) { + eval_vvar_mgga_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); + } else { + eval_vvar_mgga_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); + } break; - case DEN_X: - if (do_grad) eval_vvar_grad_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); - else eval_vvar_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); + case DEN_Z: + if (need_lapl) { + eval_vvar_mgga_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); + } else { + eval_vvar_mgga_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); + } break; default: - GAUXC_GENERIC_EXCEPTION( "eval_vvar called with improper density selected" ); + GAUXC_GENERIC_EXCEPTION( "Unexpected KS scheme when attempting to evaluate V vars" ); } - } - - - - +void eval_vvars_mgga( size_t ntasks, int32_t nbf_max, int32_t npts_max, density_id den_select, + bool need_lapl, XCDeviceTask* device_tasks, device_queue queue ) { + eval_vvars_mgga_impl(ntasks, nbf_max, npts_max, den_select, need_lapl, device_tasks, queue); +} +void eval_vvars_mgga_trial( size_t ntasks, int32_t nbf_max, int32_t npts_max, density_id den_select, + bool need_lapl, XCDeviceTask* device_tasks, device_queue queue ) { + eval_vvars_mgga_impl(ntasks, nbf_max, npts_max, den_select, need_lapl, device_tasks, queue); +} } diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars_gga.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars_gga.hpp new file mode 100644 index 00000000..9b466e24 --- /dev/null +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars_gga.hpp @@ -0,0 +1,555 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
+ * + * See LICENSE.txt for details + */ +#pragma once +#include "device_specific/cuda_device_constants.hpp" +#include "device_specific/cuda_util.hpp" +#include "device/xc_device_data.hpp" + +#define VVAR_KERNEL_SM_BLOCK 32 +#define GGA_KERNEL_SM_WARPS 16 + +namespace GauXC { + +template +__global__ void eval_vvar_gga_kern( size_t ntasks, + XCDeviceTask* tasks_device) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + + const auto npts = task.npts; + const auto nbf = task.bfn_screening.nbe; + + double* den_eval_device = nullptr; + double* den_x_eval_device = nullptr; + double* den_y_eval_device = nullptr; + double* den_z_eval_device = nullptr; + + constexpr auto warp_size = cuda::warp_size; + + if constexpr (trial){ + if constexpr (den_select == DEN_S) { + den_eval_device = task.tden_s; + den_x_eval_device = task.tdden_sx; + den_y_eval_device = task.tdden_sy; + den_z_eval_device = task.tdden_sz; + } + if constexpr (den_select == DEN_Z) { + den_eval_device = task.tden_z; + den_x_eval_device = task.tdden_zx; + den_y_eval_device = task.tdden_zy; + den_z_eval_device = task.tdden_zz; + } + if constexpr (den_select == DEN_Y) { + den_eval_device = task.tden_y; + den_x_eval_device = task.tdden_yx; + den_y_eval_device = task.tdden_yy; + den_z_eval_device = task.tdden_yz; + } + if constexpr (den_select == DEN_X) { + den_eval_device = task.tden_x; + den_x_eval_device = task.tdden_xx; + den_y_eval_device = task.tdden_xy; + den_z_eval_device = task.tdden_xz; + } + }else{ + if constexpr (den_select == DEN_S) { + den_eval_device = task.den_s; + den_x_eval_device = task.dden_sx; + den_y_eval_device = task.dden_sy; + den_z_eval_device = task.dden_sz; + } + if constexpr (den_select == DEN_Z) { + den_eval_device = task.den_z; + den_x_eval_device = task.dden_zx; + den_y_eval_device = task.dden_zy; + den_z_eval_device = task.dden_zz; + } + if constexpr (den_select == DEN_Y) { + den_eval_device = task.den_y; + den_x_eval_device = task.dden_yx; + den_y_eval_device = task.dden_yy; + den_z_eval_device = task.dden_yz; + } + if constexpr (den_select == DEN_X) { + den_eval_device = task.den_x; + den_x_eval_device = task.dden_xx; + den_y_eval_device = task.dden_xy; + den_z_eval_device = task.dden_xz; + } + } + + const auto* basis_eval_device = task.bf; + const auto* dbasis_x_eval_device = task.dbfx; + const auto* dbasis_y_eval_device = task.dbfy; + const auto* dbasis_z_eval_device = task.dbfz; + + const auto* den_basis_prod_device = task.zmat; + + __shared__ double den_shared[4][warp_size][VVAR_KERNEL_SM_BLOCK+1]; + + for ( int bid_x = blockIdx.x * blockDim.x; + bid_x < nbf; + bid_x += blockDim.x * gridDim.x ) { + + for ( int bid_y = blockIdx.y * VVAR_KERNEL_SM_BLOCK; + bid_y < npts; + bid_y += VVAR_KERNEL_SM_BLOCK * gridDim.y ) { + + for (int sm_y = threadIdx.y; sm_y < VVAR_KERNEL_SM_BLOCK; sm_y += blockDim.y) { + den_shared[0][threadIdx.x][sm_y] = 0.; + den_shared[1][threadIdx.x][sm_y] = 0.; + den_shared[2][threadIdx.x][sm_y] = 0.; + den_shared[3][threadIdx.x][sm_y] = 0.; + + if (bid_y + threadIdx.x < npts and bid_x + sm_y < nbf) { + const double* db_col = den_basis_prod_device + (bid_x + sm_y)*npts; + const double* bf_col = basis_eval_device + (bid_x + sm_y)*npts; + const double* bf_x_col = dbasis_x_eval_device + (bid_x + sm_y)*npts; + const double* bf_y_col = dbasis_y_eval_device + (bid_x + sm_y)*npts; + const double* bf_z_col = dbasis_z_eval_device + (bid_x + sm_y)*npts; + + den_shared[0][threadIdx.x][sm_y] = bf_col [ bid_y + threadIdx.x ] * 
db_col[ bid_y + threadIdx.x ]; + den_shared[1][threadIdx.x][sm_y] = bf_x_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; + den_shared[2][threadIdx.x][sm_y] = bf_y_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; + den_shared[3][threadIdx.x][sm_y] = bf_z_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; + } + } + __syncthreads(); + + + for (int sm_y = threadIdx.y; sm_y < VVAR_KERNEL_SM_BLOCK; sm_y += blockDim.y) { + const int tid_y = bid_y + sm_y; + register double den_reg = den_shared[0][sm_y][threadIdx.x]; + register double dx_reg = den_shared[1][sm_y][threadIdx.x]; + register double dy_reg = den_shared[2][sm_y][threadIdx.x]; + register double dz_reg = den_shared[3][sm_y][threadIdx.x]; + + // Warp blocks are stored col major + den_reg = cuda::warp_reduce_sum( den_reg ); + dx_reg = 2. * cuda::warp_reduce_sum( dx_reg ); + dy_reg = 2. * cuda::warp_reduce_sum( dy_reg ); + dz_reg = 2. * cuda::warp_reduce_sum( dz_reg ); + + + if( threadIdx.x == 0 and tid_y < npts ) { + atomicAdd( den_eval_device + tid_y, den_reg ); + atomicAdd( den_x_eval_device + tid_y, dx_reg ); + atomicAdd( den_y_eval_device + tid_y, dy_reg ); + atomicAdd( den_z_eval_device + tid_y, dz_reg ); + } + } + __syncthreads(); + } + } + +} + +__global__ void eval_uvars_gga_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + const auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + + const auto* dden_sx_eval_device = task.dden_sx; + const auto* dden_sy_eval_device = task.dden_sy; + const auto* dden_sz_eval_device = task.dden_sz; + auto* gamma_eval_device = task.gamma; + + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + + if( tid < npts ) { + const double dx = dden_sx_eval_device[ tid ]; + const double dy = dden_sy_eval_device[ tid ]; + const double dz = dden_sz_eval_device[ tid ]; + + gamma_eval_device[ tid ] = dx*dx + dy*dy + dz*dz; + } + +} + +__global__ void eval_tmat_gga_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + const auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + + const auto* dden_sx_eval_device = task.dden_sx; + const auto* dden_sy_eval_device = task.dden_sy; + const auto* dden_sz_eval_device = task.dden_sz; + const auto* tdden_sx_eval_device = task.tdden_sx; + const auto* tdden_sy_eval_device = task.tdden_sy; + const auto* tdden_sz_eval_device = task.tdden_sz; + + const auto* weight_device = task.weights; + const auto* vgamma_device = task.vgamma; + const auto* v2rho2_device = task.v2rho2; + const auto* v2rhogamma_device = task.v2rhogamma; + const auto* v2gamma2_device = task.v2gamma2; + const auto* trho_device = task.tden_s; + + auto* FXC_A_device = task.FXC_A_s; + auto* FXC_Bx_device = task.FXC_Bx_s; + auto* FXC_By_device = task.FXC_By_s; + auto* FXC_Bz_device = task.FXC_Bz_s; + + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + + if( tid < npts ) { + const auto dx = dden_sx_eval_device[ tid ]; + const auto dy = dden_sy_eval_device[ tid ]; + const auto dz = dden_sz_eval_device[ tid ]; + const auto tdx = tdden_sx_eval_device[ tid ]; + const auto tdy = tdden_sy_eval_device[ tid ]; + const auto tdz = tdden_sz_eval_device[ tid ]; + const auto tgamma = tdx*dx + tdy*dy + tdz*dz; + + const auto FXC_A = v2rho2_device[ tid ] * trho_device[ tid ] + 2.0 * v2rhogamma_device[tid] * tgamma; + const auto B_coef = v2rhogamma_device[tid] * trho_device[tid] + 2.0 * 
v2gamma2_device[tid] * tgamma; + FXC_A_device[ tid ] = weight_device[ tid ] * FXC_A ; + FXC_Bx_device[ tid ] = 2.0 * weight_device[ tid ] * ( B_coef * dx + vgamma_device[ tid ] * tdx ); + FXC_By_device[ tid ] = 2.0 * weight_device[ tid ] * ( B_coef * dy + vgamma_device[ tid ] * tdy ); + FXC_Bz_device[ tid ] = 2.0 * weight_device[ tid ] * ( B_coef * dz + vgamma_device[ tid ] * tdz ); + } + +} + +__global__ void eval_uvars_gga_uks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + const auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + + auto* den_pos_eval_device = task.den_s; + const auto* den_pos_x_eval_device = task.dden_sx; + const auto* den_pos_y_eval_device = task.dden_sy; + const auto* den_pos_z_eval_device = task.dden_sz; + + auto* den_neg_eval_device = task.den_z; + const auto* den_neg_x_eval_device = task.dden_zx; + const auto* den_neg_y_eval_device = task.dden_zy; + const auto* den_neg_z_eval_device = task.dden_zz; + + auto* gamma_pp_eval_device = task.gamma_pp; + auto* gamma_pm_eval_device = task.gamma_pm; + auto* gamma_mm_eval_device = task.gamma_mm; + + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if( tid < npts ) { + const double ps = den_pos_eval_device[ tid ]; + const double pz = den_neg_eval_device[ tid ]; + const double dndx = den_pos_x_eval_device[ tid ]; + const double dndy = den_pos_y_eval_device[ tid ]; + const double dndz = den_pos_z_eval_device[ tid ]; + const double dMzdx = den_neg_x_eval_device[ tid ]; + const double dMzdy = den_neg_y_eval_device[ tid ]; + const double dMzdz = den_neg_z_eval_device[ tid ]; + + // (del n).(del n) + const auto dn_sq = dndx*dndx + dndy*dndy + dndz*dndz; + // (del Mz).(del Mz) + const auto dMz_sq = dMzdx*dMzdx + dMzdy*dMzdy + dMzdz*dMzdz; + // (del n).(del Mz) + const auto dn_dMz = dndx*dMzdx + dndy*dMzdy + dndz*dMzdz; + + gamma_pp_eval_device[ tid ] = 0.25*(dn_sq + dMz_sq) + 0.5*dn_dMz; + gamma_pm_eval_device[ tid ] = 0.25*(dn_sq - dMz_sq); + gamma_mm_eval_device[ tid ] = 0.25*(dn_sq + dMz_sq) - 0.5*dn_dMz; + + den_pos_eval_device[ tid ] = 0.5*(ps + pz); + den_neg_eval_device[ tid ] = 0.5*(ps - pz); + } + +} + +__global__ void eval_tmat_gga_uks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + const auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + + const auto* tden_s_device = task.tden_s; + const auto* tden_z_device = task.tden_z; + const auto* weight_device = task.weights; + + const auto* tden_pos_x_eval_device = task.tdden_sx; + const auto* tden_pos_y_eval_device = task.tdden_sy; + const auto* tden_pos_z_eval_device = task.tdden_sz; + const auto* den_pos_x_eval_device = task.dden_sx; + const auto* den_pos_y_eval_device = task.dden_sy; + const auto* den_pos_z_eval_device = task.dden_sz; + + const auto* tden_neg_x_eval_device = task.tdden_zx; + const auto* tden_neg_y_eval_device = task.tdden_zy; + const auto* tden_neg_z_eval_device = task.tdden_zz; + const auto* den_neg_x_eval_device = task.dden_zx; + const auto* den_neg_y_eval_device = task.dden_zy; + const auto* den_neg_z_eval_device = task.dden_zz; + + const auto* vgamma_aa_device = task.vgamma_pp; + const auto* vgamma_ab_device = task.vgamma_pm; + const auto* vgamma_bb_device = task.vgamma_mm; + const auto* v2rho2_a_a_device = task.v2rho2_a_a; + const auto* v2rho2_a_b_device = task.v2rho2_a_b; + const auto* v2rho2_b_b_device = task.v2rho2_b_b; + const auto* 
v2rhogamma_a_aa_device = task.v2rhogamma_a_aa; + const auto* v2rhogamma_a_ab_device = task.v2rhogamma_a_ab; + const auto* v2rhogamma_a_bb_device = task.v2rhogamma_a_bb; + const auto* v2rhogamma_b_aa_device = task.v2rhogamma_b_aa; + const auto* v2rhogamma_b_ab_device = task.v2rhogamma_b_ab; + const auto* v2rhogamma_b_bb_device = task.v2rhogamma_b_bb; + const auto* v2gamma2_aa_aa_device = task.v2gamma2_aa_aa; + const auto* v2gamma2_aa_ab_device = task.v2gamma2_aa_ab; + const auto* v2gamma2_aa_bb_device = task.v2gamma2_aa_bb; + const auto* v2gamma2_ab_ab_device = task.v2gamma2_ab_ab; + const auto* v2gamma2_ab_bb_device = task.v2gamma2_ab_bb; + const auto* v2gamma2_bb_bb_device = task.v2gamma2_bb_bb; + + auto* FXC_A_s_device = task.FXC_A_s; + auto* FXC_A_z_device = task.FXC_A_z; + auto* FXC_Bx_s_device = task.FXC_Bx_s; + auto* FXC_Bx_z_device = task.FXC_Bx_z; + auto* FXC_By_s_device = task.FXC_By_s; + auto* FXC_By_z_device = task.FXC_By_z; + auto* FXC_Bz_s_device = task.FXC_Bz_s; + auto* FXC_Bz_z_device = task.FXC_Bz_z; + + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if( tid < npts ) { + const auto ps = tden_s_device[ tid ]; + const auto pz = tden_z_device[ tid ]; + const auto trho_a_device = 0.5*(ps + pz); + const auto trho_b_device = 0.5*(ps - pz); + + const auto tdndx = tden_pos_x_eval_device[ tid ]; + const auto tdndy = tden_pos_y_eval_device[ tid ]; + const auto tdndz = tden_pos_z_eval_device[ tid ]; + const auto tdMzdx = tden_neg_x_eval_device[ tid ]; + const auto tdMzdy = tden_neg_y_eval_device[ tid ]; + const auto tdMzdz = tden_neg_z_eval_device[ tid ]; + const auto tdden_a_x = 0.5*(tdndx + tdMzdx); + const auto tdden_a_y = 0.5*(tdndy + tdMzdy); + const auto tdden_a_z = 0.5*(tdndz + tdMzdz); + const auto tdden_b_x = 0.5*(tdndx - tdMzdx); + const auto tdden_b_y = 0.5*(tdndy - tdMzdy); + const auto tdden_b_z = 0.5*(tdndz - tdMzdz); + + const auto dndx = den_pos_x_eval_device[ tid ]; + const auto dndy = den_pos_y_eval_device[ tid ]; + const auto dndz = den_pos_z_eval_device[ tid ]; + const auto dMzdx = den_neg_x_eval_device[ tid ]; + const auto dMzdy = den_neg_y_eval_device[ tid ]; + const auto dMzdz = den_neg_z_eval_device[ tid ]; + const auto dden_a_x = 0.5*(dndx + dMzdx); + const auto dden_a_y = 0.5*(dndy + dMzdy); + const auto dden_a_z = 0.5*(dndz + dMzdz); + const auto dden_b_x = 0.5*(dndx - dMzdx); + const auto dden_b_y = 0.5*(dndy - dMzdy); + const auto dden_b_z = 0.5*(dndz - dMzdz); + + const auto tgamma_pp = tdden_a_x * dden_a_x + tdden_a_y * dden_a_y + tdden_a_z * dden_a_z; + const auto tgamma_pm = tdden_a_x * dden_b_x + tdden_a_y * dden_b_y + tdden_a_z * dden_b_z + + tdden_b_x * dden_a_x + tdden_b_y * dden_a_y + tdden_b_z * dden_a_z; + const auto tgamma_mm = tdden_b_x * dden_b_x + tdden_b_y * dden_b_y + tdden_b_z * dden_b_z; + + + const auto A_a = v2rho2_a_a_device[tid] * trho_a_device + 2.0 * v2rhogamma_a_aa_device[tid] * tgamma_pp + + v2rhogamma_a_ab_device[tid] * tgamma_pm + 2.0 * v2rhogamma_a_bb_device[tid] * tgamma_mm + + v2rho2_a_b_device[tid] * trho_b_device; + const auto A_b = v2rho2_b_b_device[tid] * trho_b_device + 2.0 * v2rhogamma_b_bb_device[tid] * tgamma_mm + + v2rhogamma_b_ab_device[tid] * tgamma_pm + 2.0 * v2rhogamma_b_aa_device[tid] * tgamma_pp + + v2rho2_a_b_device[tid] * trho_a_device; + FXC_A_s_device[ tid ] = 0.5 * weight_device[ tid ] * (A_a + A_b); + FXC_A_z_device[ tid ] = 0.5 * weight_device[ tid ] * (A_a - A_b); + // Calculate B coefficients for alpha spin + const double B_coef1_a = v2rhogamma_a_aa_device[tid] * trho_a_device + 2.0 * 
v2gamma2_aa_aa_device[tid] * tgamma_pp + + v2gamma2_aa_ab_device[tid] * tgamma_pm + 2.0 * v2gamma2_aa_bb_device[tid] * tgamma_mm + + v2rhogamma_b_aa_device[tid] * trho_b_device; + + const double B_coef2_a = v2rhogamma_a_ab_device[tid] * trho_a_device + 2.0 * v2gamma2_aa_ab_device[tid] * tgamma_pp + + v2gamma2_ab_ab_device[tid] * tgamma_pm + 2.0 * v2gamma2_ab_bb_device[tid] * tgamma_mm + + v2rhogamma_b_ab_device[tid] * trho_b_device; + + // Calculate gradient components for alpha spin + const double Bx_a = 2.0 * B_coef1_a * dden_a_x + B_coef2_a * dden_b_x + + 2.0 * vgamma_aa_device[tid] * tdden_a_x + vgamma_ab_device[tid] * tdden_b_x; + + const double By_a = 2.0 * B_coef1_a * dden_a_y + B_coef2_a * dden_b_y + + 2.0 * vgamma_aa_device[tid] * tdden_a_y + vgamma_ab_device[tid] * tdden_b_y; + + const double Bz_a = 2.0 * B_coef1_a * dden_a_z + B_coef2_a * dden_b_z + + 2.0 * vgamma_aa_device[tid] * tdden_a_z + vgamma_ab_device[tid] * tdden_b_z; + + // Calculate B coefficients for beta spin + const double B_coef1_b = v2rhogamma_b_bb_device[tid] * trho_b_device + 2.0 * v2gamma2_bb_bb_device[tid] * tgamma_mm + + v2gamma2_ab_bb_device[tid] * tgamma_pm + 2.0 * v2gamma2_aa_bb_device[tid] * tgamma_pp + + v2rhogamma_a_bb_device[tid] * trho_a_device; + + const double B_coef2_b = v2rhogamma_b_ab_device[tid] * trho_b_device + 2.0 * v2gamma2_ab_bb_device[tid] * tgamma_mm + + v2gamma2_ab_ab_device[tid] * tgamma_pm + 2.0 * v2gamma2_aa_ab_device[tid] * tgamma_pp + + v2rhogamma_a_ab_device[tid] * trho_a_device; + + const double Bx_b = 2.0 * B_coef1_b * dden_b_x + B_coef2_b * dden_a_x + + 2.0 * vgamma_bb_device[tid] * tdden_b_x + vgamma_ab_device[tid] * tdden_a_x; + + const double By_b = 2.0 * B_coef1_b * dden_b_y + B_coef2_b * dden_a_y + + 2.0 * vgamma_bb_device[tid] * tdden_b_y + vgamma_ab_device[tid] * tdden_a_y; + + const double Bz_b = 2.0 * B_coef1_b * dden_b_z + B_coef2_b * dden_a_z + + 2.0 * vgamma_bb_device[tid] * tdden_b_z + vgamma_ab_device[tid] * tdden_a_z; + + FXC_Bx_s_device[tid] = 0.5 * weight_device[tid] * (Bx_a + Bx_b); + FXC_By_s_device[tid] = 0.5 * weight_device[tid] * (By_a + By_b); + FXC_Bz_s_device[tid] = 0.5 * weight_device[tid] * (Bz_a + Bz_b); + FXC_Bx_z_device[tid] = 0.5 * weight_device[tid] * (Bx_a - Bx_b); + FXC_By_z_device[tid] = 0.5 * weight_device[tid] * (By_a - By_b); + FXC_Bz_z_device[tid] = 0.5 * weight_device[tid] * (Bz_a - Bz_b); + + + } + +} + +__global__ void eval_uvars_gga_gks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + const auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + + auto* den_s_eval_device = task.den_s; + const auto* dden_sx_eval_device = task.dden_sx; + const auto* dden_sy_eval_device = task.dden_sy; + const auto* dden_sz_eval_device = task.dden_sz; + + auto* den_z_eval_device = task.den_z; + const auto* dden_zx_eval_device = task.dden_zx; + const auto* dden_zy_eval_device = task.dden_zy; + const auto* dden_zz_eval_device = task.dden_zz; + + const auto* den_y_eval_device = task.den_y; + const auto* dden_yx_eval_device = task.dden_yx; + const auto* dden_yy_eval_device = task.dden_yy; + const auto* dden_yz_eval_device = task.dden_yz; + + const auto* den_x_eval_device = task.den_x; + const auto* dden_xx_eval_device = task.dden_xx; + const auto* dden_xy_eval_device = task.dden_xy; + const auto* dden_xz_eval_device = task.dden_xz; + + auto* gamma_pp_eval_device = task.gamma_pp; + auto* gamma_pm_eval_device = task.gamma_pm; + auto* gamma_mm_eval_device = 
task.gamma_mm; + + auto* H_z_eval_device = task.H_z; + auto* H_y_eval_device = task.H_y; + auto* H_x_eval_device = task.H_x; + auto* K_z_eval_device = task.K_z; + auto* K_y_eval_device = task.K_y; + auto* K_x_eval_device = task.K_x; + + const double dtolsq = 1e-24; // TODO: make variable + + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if( tid < npts ) { + const double dndz = dden_sz_eval_device[ tid ]; + const double dndy = dden_sy_eval_device[ tid ]; + const double dndx = dden_sx_eval_device[ tid ]; + + const double dMzdz = dden_zz_eval_device[ tid ]; + const double dMzdy = dden_zy_eval_device[ tid ]; + const double dMzdx = dden_zx_eval_device[ tid ]; + + const double dMydz = dden_yz_eval_device[ tid ]; + const double dMydy = dden_yy_eval_device[ tid ]; + const double dMydx = dden_yx_eval_device[ tid ]; + + const double dMxdz = dden_xz_eval_device[ tid ]; + const double dMxdy = dden_xy_eval_device[ tid ]; + const double dMxdx = dden_xx_eval_device[ tid ]; + + const auto ps = den_s_eval_device[ tid ]; + const auto pz = den_z_eval_device[ tid ]; + const auto py = den_y_eval_device[ tid ]; + const auto px = den_x_eval_device[ tid ]; + + const auto mtemp = pz*pz + px*px + py*py; + double mnorm = 0.; + + const auto dels_dot_dels = dndx * dndx + dndy * dndy + dndz * dndz; + const auto delz_dot_delz = dMzdx * dMzdx + dMzdy * dMzdy + dMzdz * dMzdz; + const auto delx_dot_delx = dMxdx * dMxdx + dMxdy * dMxdy + dMxdz * dMxdz; + const auto dely_dot_dely = dMydx * dMydx + dMydy * dMydy + dMydz * dMydz; + + const auto dels_dot_delz = dndx * dMzdx + dndy * dMzdy + dndz * dMzdz; + const auto dels_dot_delx = dndx * dMxdx + dndy * dMxdy + dndz * dMxdz; + const auto dels_dot_dely = dndx * dMydx + dndy * dMydy + dndz * dMydz; + + const auto sum = delz_dot_delz + delx_dot_delx + dely_dot_dely; + const auto s_sum = + dels_dot_delz * pz + dels_dot_delx * px + dels_dot_dely * py; + + const auto inv_sqsum2 = + rsqrt(dels_dot_delz * dels_dot_delz + dels_dot_delx * dels_dot_delx + + dels_dot_dely * dels_dot_dely); + const auto sqsum2 = 1./inv_sqsum2; + + double sign = 1.; + if( signbit(s_sum)) + sign = -1.; + + + if (mtemp > dtolsq) { + const double inv_mnorm = rsqrt(mtemp); + mnorm = 1./inv_mnorm; + K_z_eval_device[ tid ] = pz * inv_mnorm; + K_y_eval_device[ tid ] = py * inv_mnorm; + K_x_eval_device[ tid ] = px * inv_mnorm; + H_z_eval_device[ tid ] = sign * dels_dot_delz * inv_sqsum2; + H_y_eval_device[ tid ] = sign * dels_dot_dely * inv_sqsum2; + H_x_eval_device[ tid ] = sign * dels_dot_delx * inv_sqsum2; + } + else { + mnorm = (1. / 3.) * (px + py + pz); + K_z_eval_device[ tid ] = 1. / 3.; + K_y_eval_device[ tid ] = 1. / 3.; + K_x_eval_device[ tid ] = 1. 
/ 3.; + + H_z_eval_device[ tid ] = sign / 3.; + H_y_eval_device[ tid ] = sign / 3.; + H_x_eval_device[ tid ] = sign / 3.; + } + + gamma_pp_eval_device[ tid ] = 0.25*(dels_dot_dels + sum) + 0.5*sign*sqsum2; + gamma_pm_eval_device[ tid ] = 0.25*(dels_dot_dels - sum); + gamma_mm_eval_device[ tid ] = 0.25*(dels_dot_dels + sum) - 0.5*sign*sqsum2; + + den_s_eval_device[ tid ] = 0.5*(ps + mnorm); + den_z_eval_device[ tid ] = 0.5*(ps - mnorm); + + } + +} + +} // namespace GauXC diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars_lda.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars_lda.hpp new file mode 100644 index 00000000..54dc5043 --- /dev/null +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars_lda.hpp @@ -0,0 +1,208 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#pragma once +#include "device_specific/cuda_device_constants.hpp" +#include "device_specific/cuda_util.hpp" +#include "device/xc_device_data.hpp" + +namespace GauXC { + +template +__global__ void eval_vvar_lda_kern( size_t ntasks, + XCDeviceTask* tasks_device) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + + const auto npts = task.npts; + const auto nbf = task.bfn_screening.nbe; + + double* den_eval_device = nullptr; + // use the "U" variable (+/- for UKS) even though at this point the density (S/Z) is stored + if constexpr (trial){ + if constexpr (den_select == DEN_S) den_eval_device = task.tden_s; + if constexpr (den_select == DEN_Z) den_eval_device = task.tden_z; + if constexpr (den_select == DEN_Y) den_eval_device = task.tden_y; + if constexpr (den_select == DEN_X) den_eval_device = task.tden_x; + }else{ + if constexpr (den_select == DEN_S) den_eval_device = task.den_s; + if constexpr (den_select == DEN_Z) den_eval_device = task.den_z; + if constexpr (den_select == DEN_Y) den_eval_device = task.den_y; + if constexpr (den_select == DEN_X) den_eval_device = task.den_x; + } + + const auto* basis_eval_device = task.bf; + + const auto* den_basis_prod_device = task.zmat; + + const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + + register double den_reg = 0.; + + if( tid_x < nbf and tid_y < npts ) { + + const double* bf_col = basis_eval_device + tid_x*npts; + const double* db_col = den_basis_prod_device + tid_x*npts; + + den_reg = bf_col[ tid_y ] * db_col[ tid_y ]; + + } + + // Warp blocks are stored col major + constexpr auto warp_size = cuda::warp_size; + //constexpr auto max_warps_per_thread_block = cuda::max_warps_per_thread_block; + den_reg = cuda::warp_reduce_sum( den_reg ); + + + if( threadIdx.x == 0 and tid_y < npts ) { + atomicAdd( den_eval_device + tid_y, den_reg ); + } + +} + +__global__ void eval_uvars_lda_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { + // eval_vvars populated uvar storage already in the case of LDA+RKS + return; +} +__global__ void eval_tmat_lda_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + const auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + + const auto* v2rho2_device = task.v2rho2; + const auto* 
weight_device = task.weights; + auto* tden_s_eval_device = task.tden_s; + auto* FXC_A_device = task.FXC_A_s; + + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + if( tid < npts ) { + FXC_A_device[ tid ] = v2rho2_device[ tid ] * tden_s_eval_device[ tid ] * weight_device[ tid ]; + } + + return; +} + + +__global__ void eval_uvars_lda_uks_kernel( size_t ntasks, + XCDeviceTask* tasks_device ) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + + const auto npts = task.npts; + + auto* den_pos_eval_device = task.den_s; + auto* den_neg_eval_device = task.den_z; + + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if( tid < npts ) { + const auto ps = den_pos_eval_device[ tid ]; + const auto pz = den_neg_eval_device[ tid ]; + den_pos_eval_device[ tid ] = 0.5*(ps + pz); + den_neg_eval_device[ tid ] = 0.5*(ps - pz); + } +} + +__global__ void eval_tmat_lda_uks_kernel( size_t ntasks, + XCDeviceTask* tasks_device ) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + + const auto npts = task.npts; + + auto* tden_s_device = task.tden_s; + auto* tden_z_device = task.tden_z; + auto* FXC_A_s_device = task.FXC_A_s; + auto* FXC_A_z_device = task.FXC_A_z; + const auto* weight_device = task.weights; + + const auto* v2rho2_a_a_device = task.v2rho2_a_a; + const auto* v2rho2_a_b_device = task.v2rho2_a_b; + const auto* v2rho2_b_b_device = task.v2rho2_b_b; + + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if( tid < npts ) { + const auto ps = tden_s_device[ tid ]; + const auto pz = tden_z_device[ tid ]; + const auto trho_a_device = 0.5*(ps + pz); + const auto trho_b_device = 0.5*(ps - pz); + const auto A_a = v2rho2_a_a_device[tid] * trho_a_device + v2rho2_a_b_device[tid] * trho_b_device; + const auto A_b = v2rho2_b_b_device[tid] * trho_b_device + v2rho2_a_b_device[tid] * trho_a_device; + FXC_A_s_device[ tid ] = 0.5 * weight_device[ tid ] * (A_a + A_b); + FXC_A_z_device[ tid ] = 0.5 * weight_device[ tid ] * (A_a - A_b); + } +} + +__global__ void eval_uvars_lda_gks_kernel( size_t ntasks, + XCDeviceTask* tasks_device ) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + + const auto npts = task.npts; + + auto* den_z_eval_device = task.den_s; + auto* den_s_eval_device = task.den_z; + auto* den_y_eval_device = task.den_y; + auto* den_x_eval_device = task.den_x; + auto* K_z_eval_device = task.K_z; + auto* K_y_eval_device = task.K_y; + auto* K_x_eval_device = task.K_x; + const double dtolsq = 1e-24; // TODO: make variable + + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + + + if( tid < npts ) { + const auto ps = den_s_eval_device[ tid ]; + const auto pz = den_z_eval_device[ tid ]; + const auto py = den_y_eval_device[ tid ]; + const auto px = den_x_eval_device[ tid ]; + const auto mtemp = pz*pz + px*px + py*py; + double mnorm = 0.; + + if (mtemp > dtolsq) { + const double inv_mnorm = rsqrt(mtemp); + mnorm = 1./inv_mnorm; + K_z_eval_device[ tid ] = pz * inv_mnorm; + K_y_eval_device[ tid ] = py * inv_mnorm; + K_x_eval_device[ tid ] = px * inv_mnorm; + } + else { + mnorm = (1. / 3.) * (px + py + pz); + K_z_eval_device[ tid ] = 1. / 3.; + K_y_eval_device[ tid ] = 1. / 3.; + K_x_eval_device[ tid ] = 1. 
/ 3.; + } + + den_s_eval_device[ tid ] = 0.5*(ps + mnorm); + den_z_eval_device[ tid ] = 0.5*(ps - mnorm); + + } +} + +} diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars_mgga.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars_mgga.hpp new file mode 100644 index 00000000..82b5207e --- /dev/null +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars_mgga.hpp @@ -0,0 +1,455 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#pragma once +#include "device_specific/cuda_device_constants.hpp" +#include "device_specific/cuda_util.hpp" +#include "device/xc_device_data.hpp" + +#define MGGA_KERNEL_SM_BLOCK 32 + +namespace GauXC { + + + +template +__global__ void eval_vvar_mgga_kern( size_t ntasks, + XCDeviceTask* tasks_device) { + + constexpr auto warp_size = cuda::warp_size; + //constexpr auto max_warps_per_thread_block = cuda::max_warps_per_thread_block; + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + + const auto npts = task.npts; + const auto nbf = task.bfn_screening.nbe; + double* tau_eval_device = nullptr; + double* lapl_eval_device = nullptr; + + if constexpr (trial){ + if constexpr (den_select == DEN_S) { + tau_eval_device = task.ttau_s; + if constexpr (need_lapl) { + lapl_eval_device = task.tlapl_s; + } + } + if constexpr (den_select == DEN_Z) { + tau_eval_device = task.ttau_z; + if constexpr (need_lapl) { + lapl_eval_device = task.tlapl_z; + } + } + } else{ + if constexpr (den_select == DEN_S) { + tau_eval_device = task.tau_s; + if constexpr (need_lapl) { + lapl_eval_device = task.lapl_s; + } + } + if constexpr (den_select == DEN_Z) { + tau_eval_device = task.tau_z; + if constexpr (need_lapl) { + lapl_eval_device = task.lapl_z; + } + } + } + + //const auto* basis_eval_device = task.bf; + const auto* dbasis_x_eval_device = task.dbfx; + const auto* dbasis_y_eval_device = task.dbfy; + const auto* dbasis_z_eval_device = task.dbfz; + decltype(dbasis_x_eval_device) basis_lapl_eval_device = nullptr; + if constexpr (need_lapl) { + basis_lapl_eval_device = task.d2bflapl; + } + + //const auto* den_basis_prod_device = task.zmat; + const auto* den_basis_dx_prod_device = task.xmat_x; + const auto* den_basis_dy_prod_device = task.xmat_y; + const auto* den_basis_dz_prod_device = task.xmat_z; + decltype(den_basis_dx_prod_device) den_basis_prod_device = nullptr; + if constexpr (need_lapl) { + den_basis_prod_device = task.zmat; + } + + __shared__ double den_shared[3+!!need_lapl][warp_size][MGGA_KERNEL_SM_BLOCK+1]; + + for ( int bid_x = blockIdx.x * blockDim.x; + bid_x < nbf; + bid_x += blockDim.x * gridDim.x ) { + + for ( int bid_y = blockIdx.y * MGGA_KERNEL_SM_BLOCK; + bid_y < npts; + bid_y += MGGA_KERNEL_SM_BLOCK * gridDim.y ) { + + for (int sm_y = threadIdx.y; sm_y < MGGA_KERNEL_SM_BLOCK; sm_y += blockDim.y) { + den_shared[0][threadIdx.x][sm_y] = 0.; + den_shared[1][threadIdx.x][sm_y] = 0.; + den_shared[2][threadIdx.x][sm_y] = 0.; + if constexpr (need_lapl) + den_shared[3][threadIdx.x][sm_y] = 0.; + + if (bid_y + threadIdx.x < npts and bid_x + sm_y < nbf) { + const double* db_x_col = den_basis_dx_prod_device + (bid_x + sm_y)*npts; + const double* db_y_col = den_basis_dy_prod_device + (bid_x + 
sm_y)*npts; + const double* db_z_col = den_basis_dz_prod_device + (bid_x + sm_y)*npts; + + const double* bf_x_col = dbasis_x_eval_device + (bid_x + sm_y)*npts; + const double* bf_y_col = dbasis_y_eval_device + (bid_x + sm_y)*npts; + const double* bf_z_col = dbasis_z_eval_device + (bid_x + sm_y)*npts; + + + den_shared[0][threadIdx.x][sm_y] = bf_x_col[ bid_y + threadIdx.x ] * db_x_col[ bid_y + threadIdx.x ]; + den_shared[1][threadIdx.x][sm_y] = bf_y_col[ bid_y + threadIdx.x ] * db_y_col[ bid_y + threadIdx.x ]; + den_shared[2][threadIdx.x][sm_y] = bf_z_col[ bid_y + threadIdx.x ] * db_z_col[ bid_y + threadIdx.x ]; + + + if constexpr (need_lapl) { + const double* db_col = den_basis_prod_device + (bid_x + sm_y)*npts; + const double* bf_l_col = basis_lapl_eval_device + (bid_x + sm_y)*npts; + den_shared[3][threadIdx.x][sm_y] = bf_l_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; + } + } + } + __syncthreads(); + + + for (int sm_y = threadIdx.y; sm_y < MGGA_KERNEL_SM_BLOCK; sm_y += blockDim.y) { + const int tid_y = bid_y + sm_y; + + register double tx_reg = den_shared[0][sm_y][threadIdx.x]; + register double ty_reg = den_shared[1][sm_y][threadIdx.x]; + register double tz_reg = den_shared[2][sm_y][threadIdx.x]; + // Warp blocks are stored col major + register double tau_reg = 0.0; + tau_reg = 0.5 * cuda::warp_reduce_sum( tx_reg ); + tau_reg += 0.5 * cuda::warp_reduce_sum( ty_reg ); + tau_reg += 0.5 * cuda::warp_reduce_sum( tz_reg ); + + register double lapl_reg = 0.0; + if constexpr (need_lapl) { + lapl_reg = den_shared[3][sm_y][threadIdx.x]; + lapl_reg = cuda::warp_reduce_sum(lapl_reg); + lapl_reg = 2. * lapl_reg + 4. * tau_reg; + } + + if( threadIdx.x == 0 and tid_y < npts ) { + atomicAdd( tau_eval_device + tid_y, tau_reg ); + if constexpr (need_lapl) { + atomicAdd( lapl_eval_device + tid_y, lapl_reg ); + } + } + } + __syncthreads(); + } + } +} + + + + +template +__global__ void eval_uvars_mgga_uks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + const auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + + auto* tau_pos_eval_device = task.tau_s; + auto* tau_neg_eval_device = task.tau_z; + + double* lapl_pos_eval_device = nullptr; + double* lapl_neg_eval_device = nullptr; + if constexpr (need_lapl) { + lapl_pos_eval_device = task.lapl_s; + lapl_neg_eval_device = task.lapl_z; + } + + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if( tid < npts ) { + const double ts = tau_pos_eval_device[ tid ]; + const double tz = tau_neg_eval_device[ tid ]; + tau_pos_eval_device[ tid ] = 0.5*(ts + tz); + tau_neg_eval_device[ tid ] = 0.5*(ts - tz); + + if constexpr (need_lapl) { + const double ls = lapl_pos_eval_device[ tid ]; + const double lz = lapl_neg_eval_device[ tid ]; + lapl_pos_eval_device[ tid ] = 0.5*(ls + lz); + lapl_neg_eval_device[ tid ] = 0.5*(ls - lz); + } + } + +} + + +__global__ void eval_tmat_mgga_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + const auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + + const auto* dden_sx_eval_device = task.dden_sx; + const auto* dden_sy_eval_device = task.dden_sy; + const auto* dden_sz_eval_device = task.dden_sz; + const auto* tdden_sx_eval_device = task.tdden_sx; + const auto* tdden_sy_eval_device = task.tdden_sy; + const auto* tdden_sz_eval_device = task.tdden_sz; + + const auto* weight_device = task.weights; + const auto* 
vgamma_device = task.vgamma; + const auto* v2rho2_device = task.v2rho2; + const auto* v2rhogamma_device = task.v2rhogamma; + const auto* v2gamma2_device = task.v2gamma2; + const auto* v2rhotau_device = task.v2rhotau; + const auto* v2tau2_device = task.v2tau2; + const auto* v2gammatau_device = task.v2gammatau; + const auto* trho_device = task.tden_s; + const auto* ttau_device = task.ttau_s; + + auto* FXC_A_device = task.FXC_A_s; + auto* FXC_Bx_device = task.FXC_Bx_s; + auto* FXC_By_device = task.FXC_By_s; + auto* FXC_Bz_device = task.FXC_Bz_s; + auto* FXC_C_device = task.FXC_C_s; + + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + + if( tid < npts ) { + const auto dx = dden_sx_eval_device[ tid ]; + const auto dy = dden_sy_eval_device[ tid ]; + const auto dz = dden_sz_eval_device[ tid ]; + const auto tdx = tdden_sx_eval_device[ tid ]; + const auto tdy = tdden_sy_eval_device[ tid ]; + const auto tdz = tdden_sz_eval_device[ tid ]; + const auto tgamma = tdx*dx + tdy*dy + tdz*dz; + + const auto FXC_A = v2rho2_device[ tid ] * trho_device[ tid ] + 2.0 * v2rhogamma_device[tid] * tgamma + + v2rhotau_device[ tid ] * ttau_device[ tid ]; + FXC_A_device[ tid ] = weight_device[ tid ] * FXC_A; + + const auto FXC_C = v2rhotau_device[ tid ] * trho_device[ tid ] + 2.0 * v2gammatau_device[ tid ] * tgamma + + v2tau2_device[ tid ] * ttau_device[ tid ]; + FXC_C_device[ tid ] = weight_device[ tid ] * FXC_C; + + const auto B_coef = v2rhogamma_device[tid] * trho_device[tid] + 2.0 * v2gamma2_device[tid] * tgamma + + v2gammatau_device[ tid ] * ttau_device[ tid ]; + FXC_Bx_device[ tid ] = 2.0 * weight_device[ tid ] * ( B_coef * dx + vgamma_device[ tid ] * tdx ); + FXC_By_device[ tid ] = 2.0 * weight_device[ tid ] * ( B_coef * dy + vgamma_device[ tid ] * tdy ); + FXC_Bz_device[ tid ] = 2.0 * weight_device[ tid ] * ( B_coef * dz + vgamma_device[ tid ] * tdz ); + } + +} + + + +__global__ void eval_tmat_mgga_uks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + const auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + + const auto* tden_s_device = task.tden_s; + const auto* tden_z_device = task.tden_z; + const auto* ttau_s_device = task.ttau_s; + const auto* ttau_z_device = task.ttau_z; + const auto* weight_device = task.weights; + + const auto* tden_pos_x_eval_device = task.tdden_sx; + const auto* tden_pos_y_eval_device = task.tdden_sy; + const auto* tden_pos_z_eval_device = task.tdden_sz; + const auto* den_pos_x_eval_device = task.dden_sx; + const auto* den_pos_y_eval_device = task.dden_sy; + const auto* den_pos_z_eval_device = task.dden_sz; + + const auto* tden_neg_x_eval_device = task.tdden_zx; + const auto* tden_neg_y_eval_device = task.tdden_zy; + const auto* tden_neg_z_eval_device = task.tdden_zz; + const auto* den_neg_x_eval_device = task.dden_zx; + const auto* den_neg_y_eval_device = task.dden_zy; + const auto* den_neg_z_eval_device = task.dden_zz; + + const double* vgamma_aa_device = task.vgamma_pp; + const double* vgamma_ab_device = task.vgamma_pm; + const double* vgamma_bb_device = task.vgamma_mm; + const double* v2rho2_a_a_device = task.v2rho2_a_a; + const double* v2rho2_a_b_device = task.v2rho2_a_b; + const double* v2rho2_b_b_device = task.v2rho2_b_b; + const double* v2rhogamma_a_aa_device = task.v2rhogamma_a_aa; + const double* v2rhogamma_a_ab_device = task.v2rhogamma_a_ab; + const double* v2rhogamma_a_bb_device = task.v2rhogamma_a_bb; + const double* v2rhogamma_b_aa_device = task.v2rhogamma_b_aa; 
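+    // The second functional derivatives gathered above and below (v2rho2, v2rhogamma,
+    // v2gamma2, v2rhotau, v2gammatau, v2tau2, in the alpha/beta spin basis) are
+    // contracted with the trial densities, gradients, and taus to form per-point
+    // coefficients, e.g. for the alpha density channel
+    //   A_a = v2rho2_a_a*trho_a + 2*v2rhogamma_a_aa*tgamma_pp + v2rhogamma_a_ab*tgamma_pm
+    //       + 2*v2rhogamma_a_bb*tgamma_mm + v2rho2_a_b*trho_b
+    //       + v2rhotau_a_a*tau_a + v2rhotau_a_b*tau_b
+    // The resulting A/B/C terms are quadrature-weighted and stored as s/z
+    // (sum/difference) combinations in the FXC_* arrays consumed by the
+    // downstream zmat/mmat FXC contraction kernels.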
+ const double* v2rhogamma_b_ab_device = task.v2rhogamma_b_ab; + const double* v2rhogamma_b_bb_device = task.v2rhogamma_b_bb; + const double* v2gamma2_aa_aa_device = task.v2gamma2_aa_aa; + const double* v2gamma2_aa_ab_device = task.v2gamma2_aa_ab; + const double* v2gamma2_aa_bb_device = task.v2gamma2_aa_bb; + const double* v2gamma2_ab_ab_device = task.v2gamma2_ab_ab; + const double* v2gamma2_ab_bb_device = task.v2gamma2_ab_bb; + const double* v2gamma2_bb_bb_device = task.v2gamma2_bb_bb; + const double* v2rhotau_a_a_device = task.v2rhotau_a_a; + const double* v2rhotau_a_b_device = task.v2rhotau_a_b; + const double* v2rhotau_b_a_device = task.v2rhotau_b_a; + const double* v2rhotau_b_b_device = task.v2rhotau_b_b; + const double* v2gammatau_aa_a_device= task.v2gammatau_aa_a; + const double* v2gammatau_aa_b_device= task.v2gammatau_aa_b; + const double* v2gammatau_ab_a_device= task.v2gammatau_ab_a; + const double* v2gammatau_ab_b_device= task.v2gammatau_ab_b; + const double* v2gammatau_bb_a_device= task.v2gammatau_bb_a; + const double* v2gammatau_bb_b_device= task.v2gammatau_bb_b; + const double* v2tau2_a_a_device = task.v2tau2_a_a; + const double* v2tau2_a_b_device = task.v2tau2_a_b; + const double* v2tau2_b_b_device = task.v2tau2_b_b; + + auto* FXC_A_s_device = task.FXC_A_s; + auto* FXC_A_z_device = task.FXC_A_z; + auto* FXC_Bx_s_device = task.FXC_Bx_s; + auto* FXC_Bx_z_device = task.FXC_Bx_z; + auto* FXC_By_s_device = task.FXC_By_s; + auto* FXC_By_z_device = task.FXC_By_z; + auto* FXC_Bz_s_device = task.FXC_Bz_s; + auto* FXC_Bz_z_device = task.FXC_Bz_z; + auto* FXC_C_s_device = task.FXC_C_s; + auto* FXC_C_z_device = task.FXC_C_z; + + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if( tid < npts ) { + const auto ps = tden_s_device[ tid ]; + const auto pz = tden_z_device[ tid ]; + const auto trho_a_device = 0.5*(ps + pz); + const auto trho_b_device = 0.5*(ps - pz); + const auto ts = ttau_s_device[ tid ]; + const auto tz = ttau_z_device[ tid ]; + const auto tau_a = 0.5*(ts + tz); + const auto tau_b = 0.5*(ts - tz); + + const auto tdndx = tden_pos_x_eval_device[ tid ]; + const auto tdndy = tden_pos_y_eval_device[ tid ]; + const auto tdndz = tden_pos_z_eval_device[ tid ]; + const auto tdMzdx = tden_neg_x_eval_device[ tid ]; + const auto tdMzdy = tden_neg_y_eval_device[ tid ]; + const auto tdMzdz = tden_neg_z_eval_device[ tid ]; + const auto tdden_a_x = 0.5*(tdndx + tdMzdx); + const auto tdden_a_y = 0.5*(tdndy + tdMzdy); + const auto tdden_a_z = 0.5*(tdndz + tdMzdz); + const auto tdden_b_x = 0.5*(tdndx - tdMzdx); + const auto tdden_b_y = 0.5*(tdndy - tdMzdy); + const auto tdden_b_z = 0.5*(tdndz - tdMzdz); + + const auto dndx = den_pos_x_eval_device[ tid ]; + const auto dndy = den_pos_y_eval_device[ tid ]; + const auto dndz = den_pos_z_eval_device[ tid ]; + const auto dMzdx = den_neg_x_eval_device[ tid ]; + const auto dMzdy = den_neg_y_eval_device[ tid ]; + const auto dMzdz = den_neg_z_eval_device[ tid ]; + const auto dden_a_x = 0.5*(dndx + dMzdx); + const auto dden_a_y = 0.5*(dndy + dMzdy); + const auto dden_a_z = 0.5*(dndz + dMzdz); + const auto dden_b_x = 0.5*(dndx - dMzdx); + const auto dden_b_y = 0.5*(dndy - dMzdy); + const auto dden_b_z = 0.5*(dndz - dMzdz); + + const auto tgamma_pp = tdden_a_x * dden_a_x + tdden_a_y * dden_a_y + tdden_a_z * dden_a_z; + const auto tgamma_pm = tdden_a_x * dden_b_x + tdden_a_y * dden_b_y + tdden_a_z * dden_b_z + + tdden_b_x * dden_a_x + tdden_b_y * dden_a_y + tdden_b_z * dden_a_z; + const auto tgamma_mm = tdden_b_x * dden_b_x + tdden_b_y * dden_b_y + 
tdden_b_z * dden_b_z; + + + const auto A_a = v2rho2_a_a_device[tid] * trho_a_device + 2.0 * v2rhogamma_a_aa_device[tid] * tgamma_pp + + v2rhogamma_a_ab_device[tid] * tgamma_pm + 2.0 * v2rhogamma_a_bb_device[tid] * tgamma_mm + + v2rho2_a_b_device[tid] * trho_b_device + v2rhotau_a_a_device[tid] * tau_a + + v2rhotau_a_b_device[tid] * tau_b; + const auto A_b = v2rho2_b_b_device[tid] * trho_b_device + 2.0 * v2rhogamma_b_bb_device[tid] * tgamma_mm + + v2rhogamma_b_ab_device[tid] * tgamma_pm + 2.0 * v2rhogamma_b_aa_device[tid] * tgamma_pp + + v2rho2_a_b_device[tid] * trho_a_device + v2rhotau_b_b_device[tid] * tau_b + + v2rhotau_b_a_device[tid] * tau_a; + FXC_A_s_device[ tid ] = 0.5 * weight_device[ tid ] * (A_a + A_b); + FXC_A_z_device[ tid ] = 0.5 * weight_device[ tid ] * (A_a - A_b); + + // Compute C coefficients for alpha and beta spin + const auto C_a = v2rhotau_a_a_device[tid] * trho_a_device + v2rhotau_b_a_device[tid] * trho_b_device + + 2.0 * v2gammatau_aa_a_device[tid] * tgamma_pp + v2gammatau_ab_a_device[tid] * tgamma_pm + + 2.0 * v2gammatau_bb_a_device[tid] * tgamma_mm + + v2tau2_a_a_device[tid] * tau_a + v2tau2_a_b_device[tid] * tau_b; + + const auto C_b = v2rhotau_a_b_device[tid] * trho_a_device + v2rhotau_b_b_device[tid] * trho_b_device + + 2.0 * v2gammatau_aa_b_device[tid] * tgamma_pp + v2gammatau_ab_b_device[tid] * tgamma_pm + + 2.0 * v2gammatau_bb_b_device[tid] * tgamma_mm + + v2tau2_a_b_device[tid] * tau_a + v2tau2_b_b_device[tid] * tau_b; + + FXC_C_s_device[tid] = 0.5 * weight_device[tid] * (C_a + C_b); + FXC_C_z_device[tid] = 0.5 * weight_device[tid] * (C_a - C_b); + + // Calculate B coefficients for alpha spin + const double B_coef1_a = v2rhogamma_a_aa_device[tid] * trho_a_device + 2.0 * v2gamma2_aa_aa_device[tid] * tgamma_pp + + v2gamma2_aa_ab_device[tid] * tgamma_pm + 2.0 * v2gamma2_aa_bb_device[tid] * tgamma_mm + + v2rhogamma_b_aa_device[tid] * trho_b_device + v2gammatau_aa_a_device[tid] * tau_a + + v2gammatau_aa_b_device[tid] * tau_b; + + const double B_coef2_a = v2rhogamma_a_ab_device[tid] * trho_a_device + 2.0 * v2gamma2_aa_ab_device[tid] * tgamma_pp + + v2gamma2_ab_ab_device[tid] * tgamma_pm + 2.0 * v2gamma2_ab_bb_device[tid] * tgamma_mm + + v2rhogamma_b_ab_device[tid] * trho_b_device + v2gammatau_ab_a_device[tid] * tau_a + + v2gammatau_ab_b_device[tid] * tau_b; + + // Calculate gradient components for alpha spin + const double Bx_a = 2.0 * B_coef1_a * dden_a_x + B_coef2_a * dden_b_x + + 2.0 * vgamma_aa_device[tid] * tdden_a_x + vgamma_ab_device[tid] * tdden_b_x; + + const double By_a = 2.0 * B_coef1_a * dden_a_y + B_coef2_a * dden_b_y + + 2.0 * vgamma_aa_device[tid] * tdden_a_y + vgamma_ab_device[tid] * tdden_b_y; + + const double Bz_a = 2.0 * B_coef1_a * dden_a_z + B_coef2_a * dden_b_z + + 2.0 * vgamma_aa_device[tid] * tdden_a_z + vgamma_ab_device[tid] * tdden_b_z; + + // Calculate B coefficients for beta spin + const double B_coef1_b = v2rhogamma_b_bb_device[tid] * trho_b_device + 2.0 * v2gamma2_bb_bb_device[tid] * tgamma_mm + + v2gamma2_ab_bb_device[tid] * tgamma_pm + 2.0 * v2gamma2_aa_bb_device[tid] * tgamma_pp + + v2rhogamma_a_bb_device[tid] * trho_a_device + v2gammatau_bb_b_device[tid] * tau_b + + v2gammatau_bb_a_device[tid] * tau_a; + + const double B_coef2_b = v2rhogamma_b_ab_device[tid] * trho_b_device + 2.0 * v2gamma2_ab_bb_device[tid] * tgamma_mm + + v2gamma2_ab_ab_device[tid] * tgamma_pm + 2.0 * v2gamma2_aa_ab_device[tid] * tgamma_pp + + v2rhogamma_a_ab_device[tid] * trho_a_device + v2gammatau_ab_b_device[tid] * tau_b + + v2gammatau_ab_a_device[tid] * 
tau_a; + + const double Bx_b = 2.0 * B_coef1_b * dden_b_x + B_coef2_b * dden_a_x + + 2.0 * vgamma_bb_device[tid] * tdden_b_x + vgamma_ab_device[tid] * tdden_a_x; + + const double By_b = 2.0 * B_coef1_b * dden_b_y + B_coef2_b * dden_a_y + + 2.0 * vgamma_bb_device[tid] * tdden_b_y + vgamma_ab_device[tid] * tdden_a_y; + + const double Bz_b = 2.0 * B_coef1_b * dden_b_z + B_coef2_b * dden_a_z + + 2.0 * vgamma_bb_device[tid] * tdden_b_z + vgamma_ab_device[tid] * tdden_a_z; + + // Store weighted values in output arrays + FXC_Bx_s_device[tid] = 0.5 * weight_device[tid] * (Bx_a + Bx_b); + FXC_By_s_device[tid] = 0.5 * weight_device[tid] * (By_a + By_b); + FXC_Bz_s_device[tid] = 0.5 * weight_device[tid] * (Bz_a + Bz_b); + FXC_Bx_z_device[tid] = 0.5 * weight_device[tid] * (Bx_a - Bx_b); + FXC_By_z_device[tid] = 0.5 * weight_device[tid] * (By_a - By_b); + FXC_Bz_z_device[tid] = 0.5 * weight_device[tid] * (Bz_a - Bz_b); + + } + +} + + +} diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/zmat_fxc.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/zmat_fxc.cu new file mode 100644 index 00000000..36ba9a16 --- /dev/null +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/zmat_fxc.cu @@ -0,0 +1,238 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#include "device/common/zmat_fxc.hpp" +#include +#include "device_specific/cuda_util.hpp" +#include "device_specific/cuda_device_constants.hpp" + +namespace GauXC { + + +template +__global__ void zmat_lda_fxc_kernel( size_t ntasks, + XCDeviceTask* tasks_device ) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + const auto nbf = task.bfn_screening.nbe; + const auto* FXC_A_device = task.FXC_A_s; + if constexpr ( den_selector == DEN_Z ) FXC_A_device = task.FXC_A_z; + + const auto* basis_eval_device = task.bf; + auto* z_matrix_device = task.zmat; + + const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + + if( tid_x < npts and tid_y < nbf ) { + + const size_t ibfoff = tid_y * npts + tid_x; + const double fact = 0.5 * FXC_A_device[tid_x]; + + z_matrix_device[ ibfoff ] = fact * basis_eval_device[ ibfoff ]; + } + +} + + + + + +template +__global__ void zmat_gga_fxc_kernel( size_t ntasks, + XCDeviceTask* tasks_device ) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + const auto nbf = task.bfn_screening.nbe; + + const auto* basis_eval_device = task.bf; + const auto* dbasis_x_eval_device = task.dbfx; + const auto* dbasis_y_eval_device = task.dbfy; + const auto* dbasis_z_eval_device = task.dbfz; + const auto* FXC_A_device = task.FXC_A_s; + const auto* FXC_Bx_device = task.FXC_Bx_s; + const auto* FXC_By_device = task.FXC_By_s; + const auto* FXC_Bz_device = task.FXC_Bz_s; + if constexpr ( den_selector == DEN_Z ) { + FXC_A_device = task.FXC_A_z; + FXC_Bx_device = task.FXC_Bx_z; + FXC_By_device = task.FXC_By_z; + FXC_Bz_device = task.FXC_Bz_z; + } + + auto* z_matrix_device = task.zmat; + + const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + const int tid_y = blockIdx.y * blockDim.y + 
threadIdx.y; + + if( tid_x < npts and tid_y < nbf ) { + + const size_t ibfoff = tid_y * npts + tid_x; + + const double dx = FXC_Bx_device[tid_x] * dbasis_x_eval_device[ ibfoff ]; + const double dy = FXC_By_device[tid_x] * dbasis_y_eval_device[ ibfoff ]; + const double dz = FXC_Bz_device[tid_x] * dbasis_z_eval_device[ ibfoff ]; + + z_matrix_device[ ibfoff ] = + (0.5 * FXC_A_device[tid_x] * basis_eval_device[ ibfoff ] + dx + dy + dz ); + } +} + + + +#define ZMAT_FXC_KERN(xc_approx) \ + cudaStream_t stream = queue.queue_as(); \ + dim3 threads(cuda::warp_size,cuda::max_warps_per_thread_block,1); \ + dim3 blocks( util::div_ceil( max_npts, threads.x ), \ + util::div_ceil( max_nbf, threads.y ), \ + ntasks ); \ + if ( sel == DEN_S ) zmat_##xc_approx##_fxc_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); \ + else if ( sel == DEN_Z ) zmat_##xc_approx##_fxc_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); \ + + + +void zmat_lda_fxc( size_t ntasks, + int32_t max_nbf, + int32_t max_npts, + XCDeviceTask* tasks_device, + density_id sel, + device_queue queue ) { +ZMAT_FXC_KERN(lda) +} + + + +void zmat_gga_fxc( size_t ntasks, + int32_t max_nbf, + int32_t max_npts, + XCDeviceTask* tasks_device, + density_id sel, + device_queue queue ) { +ZMAT_FXC_KERN(gga) +} + + + +void zmat_mgga_fxc( size_t ntasks, + int32_t max_nbf, + int32_t max_npts, + XCDeviceTask* tasks_device, + bool do_lapl, + density_id sel, + device_queue queue ) { + + cudaStream_t stream = queue.queue_as() ; + + + dim3 threads(cuda::warp_size,cuda::max_warps_per_thread_block,1); + dim3 blocks( util::div_ceil( max_npts, threads.x ), + util::div_ceil( max_nbf, threads.y ), + ntasks ); + + if(do_lapl) + GAUXC_GENERIC_EXCEPTION("Fxc contraction + do_lapl NYI"); + + switch(sel) { + case DEN_S: + zmat_gga_fxc_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + break; + case DEN_Z: + zmat_gga_fxc_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + break; + } + +} + + + + + + + + + + +template +__global__ void mmat_mgga_fxc_kernel( size_t ntasks, + XCDeviceTask* tasks_device ) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + const auto nbf = task.bfn_screening.nbe; + auto* FXC_C_s_device = task.FXC_C_s; + if constexpr ( id == DEN_Z ) FXC_C_s_device = task.FXC_C_z; + + const auto* dbasis_x_eval_device = task.dbfx; + const auto* dbasis_y_eval_device = task.dbfy; + const auto* dbasis_z_eval_device = task.dbfz; + + auto* mmat_x = task.xmat_x; + auto* mmat_y = task.xmat_y; + auto* mmat_z = task.xmat_z; + + const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + + if( tid_x < npts and tid_y < nbf ) { + + const size_t ibfoff = tid_y * npts + tid_x; + + const double fact = 0.25 * FXC_C_s_device[tid_x]; + + mmat_x[ ibfoff ] = fact * dbasis_x_eval_device[ ibfoff ]; + mmat_y[ ibfoff ] = fact * dbasis_y_eval_device[ ibfoff ]; + mmat_z[ ibfoff ] = fact * dbasis_z_eval_device[ ibfoff ]; + } +} + +void mmat_mgga_fxc( size_t ntasks, + int32_t max_nbf, + int32_t max_npts, + XCDeviceTask* tasks_device, + bool do_lapl, + density_id sel, + device_queue queue ) { + + cudaStream_t stream = queue.queue_as() ; + + + dim3 threads(cuda::warp_size,cuda::max_warps_per_thread_block,1); + dim3 blocks( util::div_ceil( max_npts, threads.x ), + util::div_ceil( max_nbf, threads.y ), + ntasks ); + + if(do_lapl) + GAUXC_GENERIC_EXCEPTION("Fxc contraction + 
do_lapl NYI"); + + switch(sel) { + case DEN_S: + mmat_mgga_fxc_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + break; + case DEN_Z: + mmat_mgga_fxc_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + break; + } + +} + +} + diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/zmat_vxc.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/zmat_vxc.cu index 616e0bd5..2e695785 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/zmat_vxc.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/zmat_vxc.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -393,6 +397,9 @@ __global__ void zmat_gga_vxc_gks_kernel( size_t ntasks, } } + + + template __global__ void zmat_mgga_vxc_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device ) { @@ -444,6 +451,91 @@ __global__ void zmat_mgga_vxc_rks_kernel( size_t ntasks, } } +template +__global__ void zmat_mgga_vxc_uks_kernel( size_t ntasks, + XCDeviceTask* tasks_device ) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + const auto nbf = task.bfn_screening.nbe; + + const double* vrho_pos_device = task.vrho_pos; + const double* vrho_neg_device = task.vrho_neg; + const double* vlapl_pos_device = task.vlapl_pos; + const double* vlapl_neg_device = task.vlapl_neg; + const double* vgamma_pp_device = task.vgamma_pp; + const double* vgamma_pm_device = task.vgamma_pm; + const double* vgamma_mm_device = task.vgamma_mm; + + const auto* den_pos_x_eval_device = task.dden_sx; + const auto* den_pos_y_eval_device = task.dden_sy; + const auto* den_pos_z_eval_device = task.dden_sz; + const auto* den_neg_x_eval_device = task.dden_zx; + const auto* den_neg_y_eval_device = task.dden_zy; + const auto* den_neg_z_eval_device = task.dden_zz; + + + const auto* basis_eval_device = task.bf; + const auto* dbasis_x_eval_device = task.dbfx; + const auto* dbasis_y_eval_device = task.dbfy; + const auto* dbasis_z_eval_device = task.dbfz; + const auto* d2basis_lapl_eval_device = task.d2bflapl; + + auto* z_matrix_device = task.zmat; + + const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + + if( tid_x < npts and tid_y < nbf ) { + + const size_t ibfoff = tid_y * npts + tid_x; + + const double factp = 0.25 * vrho_pos_device[tid_x]; + const double factm = 0.25 * vrho_neg_device[tid_x]; + + const auto gga_fact_pp = vgamma_pp_device[tid_x]; + const auto gga_fact_pm = vgamma_pm_device[tid_x]; + const auto gga_fact_mm = vgamma_mm_device[tid_x]; + + const auto gga_fact_1 = 0.5*(gga_fact_pp + gga_fact_pm + gga_fact_mm); + const auto gga_fact_2 = 0.5*(gga_fact_pp - gga_fact_mm); + const auto gga_fact_3 = 0.5*(gga_fact_pp - gga_fact_pm + gga_fact_mm); + + double sign = 1.0; + + double x_fact, y_fact, z_fact; + + if constexpr ( den_selector == DEN_S ) { + x_fact = gga_fact_1 * den_pos_x_eval_device[ tid_x ] + gga_fact_2 * den_neg_x_eval_device[ tid_x ]; + y_fact = gga_fact_1 * den_pos_y_eval_device[ tid_x ] + gga_fact_2 * den_neg_y_eval_device[ tid_x ]; + z_fact = gga_fact_1 * 
den_pos_z_eval_device[ tid_x ] + gga_fact_2 * den_neg_z_eval_device[ tid_x ]; + } + if constexpr ( den_selector == DEN_Z ) { + sign = -1.0; + x_fact = gga_fact_3 * den_neg_x_eval_device[ tid_x ] + gga_fact_2 * den_pos_x_eval_device[ tid_x ]; + y_fact = gga_fact_3 * den_neg_y_eval_device[ tid_x ] + gga_fact_2 * den_pos_y_eval_device[ tid_x ]; + z_fact = gga_fact_3 * den_neg_z_eval_device[ tid_x ] + gga_fact_2 * den_pos_z_eval_device[ tid_x ]; + } + + auto val = x_fact * dbasis_x_eval_device[ ibfoff ] + + y_fact * dbasis_y_eval_device[ ibfoff ] + + z_fact * dbasis_z_eval_device[ ibfoff ] + + (factp + sign * factm) * basis_eval_device[ ibfoff ]; + + if constexpr (need_lapl) { + const double lfactp = vlapl_pos_device[tid_x]; + const double lfactm = vlapl_neg_device[tid_x]; + + val += 0.5 * (lfactp + sign * lfactm) * d2basis_lapl_eval_device[ ibfoff ]; + } + + z_matrix_device[ ibfoff ] = val; + } +} + #define ZMAT_VXC_KERN(xc_approx) \ @@ -503,6 +595,8 @@ void zmat_mgga_vxc( size_t ntasks, int32_t max_npts, XCDeviceTask* tasks_device, bool do_lapl, + integrator_ks_scheme scheme, + density_id sel, device_queue queue ) { cudaStream_t stream = queue.queue_as() ; @@ -513,10 +607,29 @@ void zmat_mgga_vxc( size_t ntasks, util::div_ceil( max_nbf, threads.y ), ntasks ); - if(do_lapl) - zmat_mgga_vxc_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); - else - zmat_mgga_vxc_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + if(scheme == RKS) { + if(do_lapl) + zmat_mgga_vxc_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + else + zmat_mgga_vxc_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + } else if(scheme == UKS) { + switch(sel) { + case DEN_S: + if(do_lapl) + zmat_mgga_vxc_uks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + else + zmat_mgga_vxc_uks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + break; + case DEN_Z: + if(do_lapl) + zmat_mgga_vxc_uks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + else + zmat_mgga_vxc_uks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + break; + } + } else { + GAUXC_GENERIC_EXCEPTION("MGGA + DEVICE + GKS NYI"); + } } @@ -571,6 +684,55 @@ __global__ void mmat_mgga_vxc_rks_kernel( size_t ntasks, } } +template +__global__ void mmat_mgga_vxc_uks_kernel( size_t ntasks, + XCDeviceTask* tasks_device ) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + const auto nbf = task.bfn_screening.nbe; + const auto* vtau_pos_device = task.vtau_pos; + const auto* vtau_neg_device = task.vtau_neg; + const double* vlapl_pos_device = need_lapl ? task.vlapl_pos : nullptr; + const double* vlapl_neg_device = need_lapl ? 
task.vlapl_neg : nullptr; + + const auto* dbasis_x_eval_device = task.dbfx; + const auto* dbasis_y_eval_device = task.dbfy; + const auto* dbasis_z_eval_device = task.dbfz; + + auto* mmat_x = task.xmat_x; + auto* mmat_y = task.xmat_y; + auto* mmat_z = task.xmat_z; + + const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + + if( tid_x < npts and tid_y < nbf ) { + + double sign = 1.0; + if(id == DEN_Z) sign = -1; + + const size_t ibfoff = tid_y * npts + tid_x; + const auto tfactp = 0.25 * vtau_pos_device[tid_x]; + const auto tfactm = 0.25 * vtau_neg_device[tid_x]; + const double fact_tau = 0.5 * (tfactp + sign * tfactm); + double fact_lapl = 0.0; + if(need_lapl) { + const auto lfactp = vlapl_pos_device[tid_x]; + const auto lfactm = vlapl_neg_device[tid_x]; + fact_lapl = 0.5 * (lfactp + sign * lfactm); + } + const double fact_1 = fact_tau + fact_lapl; + + mmat_x[ ibfoff ] = fact_1 * dbasis_x_eval_device[ ibfoff ]; + mmat_y[ ibfoff ] = fact_1 * dbasis_y_eval_device[ ibfoff ]; + mmat_z[ ibfoff ] = fact_1 * dbasis_z_eval_device[ ibfoff ]; + } +} + //__global__ void print_zmat_stats( size_t ntasks, // XCDeviceTask* tasks_device) { // @@ -597,7 +759,7 @@ __global__ void mmat_mgga_vxc_rks_kernel( size_t ntasks, // const auto* vrho = task.vrho; // const auto* gamma = task.gamma; // const auto* tau = task.tau; -// const auto* lapl = task.denlapl; +// const auto* lapl = task.lapl; // const auto* rho = task.den; // double enrm = 0.0, gnrm = 0.0, tnrm = 0.0, rnrm = 0.0, lnrm = 0.0; // double vgnrm = 0.0, vtnrm = 0.0, vrnrm = 0.0, vlnrm = 0.0; @@ -625,6 +787,8 @@ void mmat_mgga_vxc( size_t ntasks, int32_t max_npts, XCDeviceTask* tasks_device, bool do_lapl, + integrator_ks_scheme scheme, + density_id sel, device_queue queue ) { cudaStream_t stream = queue.queue_as() ; @@ -635,10 +799,30 @@ void mmat_mgga_vxc( size_t ntasks, util::div_ceil( max_nbf, threads.y ), ntasks ); - if(do_lapl) - mmat_mgga_vxc_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); - else - mmat_mgga_vxc_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + if(scheme == RKS) { + if(do_lapl) + mmat_mgga_vxc_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + else + mmat_mgga_vxc_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + } else if(scheme == UKS) { + switch(sel) { + case DEN_S: + if(do_lapl) + mmat_mgga_vxc_uks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + else + mmat_mgga_vxc_uks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + break; + case DEN_Z: + if(do_lapl) + mmat_mgga_vxc_uks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + else + mmat_mgga_vxc_uks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + break; + } + } else { + GAUXC_GENERIC_EXCEPTION("MGGA + DEVICE + GKS NYI"); + } + //print_zmat_stats<<<1,1,0,stream>>>(ntasks,tasks_device); } diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/CMakeLists.txt b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/CMakeLists.txt index 5f18e92f..6bdf66a7 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/CMakeLists.txt +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. 
of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/include/gpu/chebyshev_boys_computation.hpp b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/include/gpu/chebyshev_boys_computation.hpp index 8726eba1..110f76d0 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/include/gpu/chebyshev_boys_computation.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/include/gpu/chebyshev_boys_computation.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/include/gpu/integral_data_types.hpp b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/include/gpu/integral_data_types.hpp index 552656ea..3a2ca466 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/include/gpu/integral_data_types.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/include/gpu/integral_data_types.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/include/gpu/obara_saika_integrals.hpp b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/include/gpu/obara_saika_integrals.hpp index 22c554b7..14ea3134 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/include/gpu/obara_saika_integrals.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/include/gpu/obara_saika_integrals.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/chebyshev_boys_computation.cu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/chebyshev_boys_computation.cu index b6385468..e607d086 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/chebyshev_boys_computation.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/chebyshev_boys_computation.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. 
of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/config_obara_saika.hpp b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/config_obara_saika.hpp index 98f8543b..d8a472f3 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/config_obara_saika.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/config_obara_saika.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_0.cu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_0.cu index 17a74fb1..23eb95ce 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_0.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_0.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_0.hu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_0.hu index a5db5b61..6779a4ea 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_0.hu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_0.hu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_0_0.cu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_0_0.cu index 248977a8..ec51003c 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_0_0.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_0_0.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -355,7 +359,7 @@ struct DeviceTask00 { static constexpr bool use_shared = (primpair_shared_limit > 0) && (primpair_shared_limit <= max_primpair_shared_limit); - static constexpr int num_warps = points_per_subtask / cuda::warp_size; + static constexpr int num_warps = points_per_subtask / GauXC::cuda::warp_size; // Cannot declare shared memory array with length 0 static constexpr int prim_buffer_size = (use_shared) ? num_warps * primpair_shared_limit : 1; @@ -382,8 +386,8 @@ struct DeviceTask00 { double *Gi = param.Gi; double *Gj = param.Gj; - const int laneId = threadIdx.x % cuda::warp_size; - const int warpId __attribute__((unused)) = threadIdx.x / cuda::warp_size; + const int laneId = threadIdx.x % GauXC::cuda::warp_size; + const int warpId __attribute__((unused)) = threadIdx.x / GauXC::cuda::warp_size; __shared__ GauXC::PrimitivePair s_prim_pairs[prim_buffer_size] __attribute__((unused)); @@ -397,7 +401,7 @@ struct DeviceTask00 { for (int i = 0; i < num_warps; i++) { double temp = SCALAR_ZERO(); - const int pointIndex = i * cuda::warp_size + laneId; + const int pointIndex = i * GauXC::cuda::warp_size + laneId; if (pointIndex < npts) { diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_0_0.hu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_0_0.hu index 02d4d40b..62cd3d53 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_0_0.hu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_0_0.hu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1.cu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1.cu index 334fa3bb..667e851b 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1.hu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1.hu index bf9a4841..16f8324e 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1.hu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1.hu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
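// Illustrative sketch (not from the patch): the integral_1_0.cu / integral_1_1.cu
// hunks below thread pure_bra / pure_ket flags through the DeviceTask structs so
// that p (l = 1) shells stored in real solid-harmonic order can feed kernels that
// work internally in Cartesian (x, y, z) order. For p shells the two layouts differ
// only by a permutation: assuming the common (m = -1, 0, +1) <-> (y, z, x) ordering,
// which is consistent with the offsets used in the diff, Cartesian x/y/z are read
// from spherical slots 2/0/1 and results are scattered back the same way. The tiny
// host-side helpers below (names made up for illustration) show just that permutation.
#include <array>
#include <cstdio>

// Spherical storage (p_-1, p_0, p_+1) -> Cartesian working order (x, y, z);
// mirrors the pure_bra load pattern Xik_0/1/2 <- offsets 2/0/1.
static std::array<double,3> p_sph_to_cart(const std::array<double,3>& s) {
  return { s[2], s[0], s[1] };
}

// Cartesian results -> spherical storage; mirrors the pure_bra atomicAdd
// pattern that writes components 0/1/2 to Gik offsets 2/0/1.
static std::array<double,3> p_cart_to_sph(const std::array<double,3>& c) {
  return { c[1], c[2], c[0] };
}

int main() {
  const std::array<double,3> sph{0.1, 0.2, 0.3};   // (p_-1, p_0, p_+1)
  const auto cart = p_sph_to_cart(sph);            // (x, y, z) = (0.3, 0.1, 0.2)
  const auto back = p_cart_to_sph(cart);           // round-trips to the input
  std::printf("%g %g %g\n", back[0], back[1], back[2]);
  return 0;
}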
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1_0.cu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1_0.cu index 9dfcad3f..71313b04 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1_0.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1_0.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -352,7 +356,8 @@ using namespace GauXC; } -template +template struct DeviceTask10 { static constexpr int max_primpair_shared_limit = 32; @@ -365,7 +370,7 @@ struct DeviceTask10 { static constexpr bool use_shared = (primpair_shared_limit > 0) && (primpair_shared_limit <= max_primpair_shared_limit); - static constexpr int num_warps = points_per_subtask / cuda::warp_size; + static constexpr int num_warps = points_per_subtask / GauXC::cuda::warp_size; // Cannot declare shared memory array with length 0 static constexpr int prim_buffer_size = (use_shared) ? num_warps * primpair_shared_limit : 1; @@ -393,12 +398,12 @@ struct DeviceTask10 { double *Gj = param.Gj; static constexpr bool use_shared = (primpair_shared_limit > 0); - static constexpr int num_warps = points_per_subtask / cuda::warp_size; + static constexpr int num_warps = points_per_subtask / GauXC::cuda::warp_size; // Cannot declare shared memory array with length 0 static constexpr int prim_buffer_size = (use_shared) ? 
num_warps * primpair_shared_limit : 1; - const int laneId = threadIdx.x % cuda::warp_size; - const int warpId __attribute__((unused)) = threadIdx.x / cuda::warp_size; + const int laneId = threadIdx.x % GauXC::cuda::warp_size; + const int warpId __attribute__((unused)) = threadIdx.x / GauXC::cuda::warp_size; __shared__ GauXC::PrimitivePair s_prim_pairs[prim_buffer_size] __attribute__((unused)); @@ -414,7 +419,7 @@ struct DeviceTask10 { double temp_1 = SCALAR_ZERO(); double temp_2 = SCALAR_ZERO(); - const int pointIndex = i * cuda::warp_size + laneId; + const int pointIndex = i * GauXC::cuda::warp_size + laneId; if (pointIndex < npts) { const double point_x = s_task_data[pointIndex].x; @@ -490,31 +495,46 @@ struct DeviceTask10 { SCALAR_TYPE const_value_w; SCALAR_TYPE tx, ty, tz, tw, t0, t1, t2; + SCALAR_TYPE Xik_0, Xik_1, Xik_2; + + if constexpr (pure_bra) { + Xik_0 = SCALAR_LOAD((Xik + 2*ldX)); + Xik_1 = SCALAR_LOAD((Xik + 0*ldX)); + Xik_2 = SCALAR_LOAD((Xik + 1*ldX)); + } else { + Xik_0 = SCALAR_LOAD((Xik + 0*ldX)); + Xik_1 = SCALAR_LOAD((Xik + 1*ldX)); + Xik_2 = SCALAR_LOAD((Xik + 2*ldX)); + } + X_ABp = 1.0; comb_m_i = 1.0; Y_ABp = 1.0; comb_n_j = 1.0; Z_ABp = 1.0; comb_p_k = 1.0; const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; ty = SCALAR_LOAD((Xjk + 0 * ldX)); t0 = SCALAR_MUL(temp_0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_MUL(tx, t0); - atomicAdd((Gik + 0 * ldG), tz); + if constexpr (pure_bra) atomicAdd((Gik + 2 * ldG), tz); + else atomicAdd((Gik + 0 * ldG), tz); - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_MUL(temp_1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - atomicAdd((Gik + 1 * ldG), tz); + if constexpr (pure_bra) atomicAdd((Gik + 0 * ldG), tz); + else atomicAdd((Gik + 1 * ldG), tz); - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_MUL(temp_2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - atomicAdd((Gik + 2 * ldG), tz); + if constexpr (pure_bra) atomicAdd((Gik + 1 * ldG), tz); + else atomicAdd((Gik + 2 * ldG), tz); atomicAdd((Gjk + 0 * ldG), tw); } @@ -525,15 +545,28 @@ struct DeviceTask10 { }; template -using AM10_swap = DeviceTask10; +using AM10_swap_cart = DeviceTask10; + +template +using AM10_cart = DeviceTask10; + +template +using AM10_swap_sph = DeviceTask10; template -using AM10 = DeviceTask10; +using AM10_sph = DeviceTask10; void integral_1_0_task_batched( bool swap, + bool sph, size_t ntasks, size_t nsubtask, int max_primpair, size_t max_nsp, GauXC::XCDeviceTask* device_tasks, @@ -554,21 +587,39 @@ using AM10 = DeviceTask10( - nblocks, nthreads, max_primpair, stream, - ntasks, nsubtask, - device_tasks, task2sp, - (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, - sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, - boys_table ); + if(sph) + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); + else + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); } else { - dev_integral_task_map_dispatcher( - nblocks, nthreads, max_primpair, stream, - ntasks, 
nsubtask, - device_tasks, task2sp, - (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, - sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, - boys_table ); + if(sph) + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); + else + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); } } } diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1_0.hu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1_0.hu index 3851a865..21273e23 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1_0.hu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1_0.hu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -33,6 +37,7 @@ namespace XGPU { void integral_1_0_task_batched( bool swap, + bool sph, size_t ntasks, size_t nsubtasks, int max_primpairs, size_t max_nsp, diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1_1.cu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1_1.cu index f60591c4..fae49afc 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1_1.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1_1.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -780,7 +784,8 @@ using namespace GauXC; } -template +template struct DeviceTask11 { static constexpr int max_primpair_shared_limit = 32; @@ -794,7 +799,7 @@ struct DeviceTask11 { static constexpr bool use_shared = (primpair_shared_limit > 0) && (primpair_shared_limit <= max_primpair_shared_limit); - static constexpr int num_warps = points_per_subtask / cuda::warp_size; + static constexpr int num_warps = points_per_subtask / GauXC::cuda::warp_size; // Cannot declare shared memory array with length 0 static constexpr int prim_buffer_size = (use_shared) ? num_warps * primpair_shared_limit : 1; @@ -825,12 +830,12 @@ struct DeviceTask11 { const double Z_AB = param.Z_AB; static constexpr bool use_shared = (primpair_shared_limit > 0); - static constexpr int num_warps = points_per_subtask / cuda::warp_size; + static constexpr int num_warps = points_per_subtask / GauXC::cuda::warp_size; // Cannot declare shared memory array with length 0 static constexpr int prim_buffer_size = (use_shared) ? 
num_warps * primpair_shared_limit : 1; - const int laneId = threadIdx.x % cuda::warp_size; - const int warpId __attribute__((unused)) = threadIdx.x / cuda::warp_size; + const int laneId = threadIdx.x % GauXC::cuda::warp_size; + const int warpId __attribute__((unused)) = threadIdx.x / GauXC::cuda::warp_size; __shared__ GauXC::PrimitivePair s_prim_pairs[prim_buffer_size] __attribute__((unused)); @@ -859,7 +864,7 @@ struct DeviceTask11 { temp_7 = SCALAR_ZERO(); temp_8 = SCALAR_ZERO(); - const int pointIndex = i * cuda::warp_size + laneId; + const int pointIndex = i * GauXC::cuda::warp_size + laneId; if (pointIndex < npts) { const double point_x = s_task_data[pointIndex].x; @@ -990,6 +995,34 @@ struct DeviceTask11 { SCALAR_TYPE const_value_w; SCALAR_TYPE tx, ty, tz, tw, t0, t1, t2; + SCALAR_TYPE Xik_0, Xik_1, Xik_2; + SCALAR_TYPE Xjk_0, Xjk_1, Xjk_2; + SCALAR_TYPE Gjk_0, Gjk_1, Gjk_2; + + if constexpr (pure_bra) { + Xik_0 = SCALAR_LOAD((Xik + 2*ldX)); + Xik_1 = SCALAR_LOAD((Xik + 0*ldX)); + Xik_2 = SCALAR_LOAD((Xik + 1*ldX)); + } else { + Xik_0 = SCALAR_LOAD((Xik + 0*ldX)); + Xik_1 = SCALAR_LOAD((Xik + 1*ldX)); + Xik_2 = SCALAR_LOAD((Xik + 2*ldX)); + } + + if constexpr (pure_ket) { + Xjk_0 = SCALAR_LOAD((Xjk + 2*ldX)); + Xjk_1 = SCALAR_LOAD((Xjk + 0*ldX)); + Xjk_2 = SCALAR_LOAD((Xjk + 1*ldX)); + } else { + Xjk_0 = SCALAR_LOAD((Xjk + 0*ldX)); + Xjk_1 = SCALAR_LOAD((Xjk + 1*ldX)); + Xjk_2 = SCALAR_LOAD((Xjk + 2*ldX)); + } + + Gjk_0 = 0; + Gjk_1 = 0; + Gjk_2 = 0; + /**** j = 0 ****/ X_ABp = 1.0; comb_m_i = 1.0; Y_ABp = 1.0; comb_n_j = 1.0; @@ -997,20 +1030,20 @@ struct DeviceTask11 { const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); - ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tx = Xik_0; + ty = Xjk_0; t0 = SCALAR_MUL(temp_3, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_MUL(tx, t0); outBuffer[threadIdx.x][0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_MUL(temp_4, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); outBuffer[threadIdx.x][1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_MUL(temp_5, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); @@ -1022,24 +1055,24 @@ struct DeviceTask11 { const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_MUL(temp_0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); outBuffer[threadIdx.x][0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_MUL(temp_1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); outBuffer[threadIdx.x][1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_MUL(temp_2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); outBuffer[threadIdx.x][2] += tz; - if constexpr (!diag) atomicAdd((Gjk + 0 * ldG), tw); + if constexpr (!diag) Gjk_0 = tw; /**** j = 1 ****/ X_ABp = 1.0; comb_m_i = 1.0; @@ -1048,20 +1081,20 @@ struct DeviceTask11 { const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); - ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tx = Xik_0; + ty = Xjk_1; t0 = SCALAR_MUL(temp_4, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_MUL(tx, t0); outBuffer[threadIdx.x][0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); 
+ tx = Xik_1; t1 = SCALAR_MUL(temp_6, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); outBuffer[threadIdx.x][1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_MUL(temp_7, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); @@ -1072,24 +1105,24 @@ struct DeviceTask11 { const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_MUL(temp_0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); outBuffer[threadIdx.x][0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_MUL(temp_1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); outBuffer[threadIdx.x][1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_MUL(temp_2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); outBuffer[threadIdx.x][2] += tz; - if constexpr (!diag) atomicAdd((Gjk + 1 * ldG), tw); + if constexpr (!diag) Gjk_1 = tw; /**** j = 2 ****/ X_ABp = 1.0; comb_m_i = 1.0; @@ -1098,20 +1131,20 @@ struct DeviceTask11 { const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); - ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tx = Xik_0; + ty = Xjk_2; t0 = SCALAR_MUL(temp_5, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_MUL(tx, t0); outBuffer[threadIdx.x][0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_MUL(temp_7, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); outBuffer[threadIdx.x][1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_MUL(temp_8, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); @@ -1121,28 +1154,46 @@ struct DeviceTask11 { const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_MUL(temp_0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); outBuffer[threadIdx.x][0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_MUL(temp_1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); outBuffer[threadIdx.x][1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_MUL(temp_2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); outBuffer[threadIdx.x][2] += tz; - if constexpr (!diag) atomicAdd((Gjk + 2 * ldG), tw); - - atomicAdd((Gik + 0 * ldG), outBuffer[threadIdx.x][0]); - atomicAdd((Gik + 1 * ldG), outBuffer[threadIdx.x][1]); - atomicAdd((Gik + 2 * ldG), outBuffer[threadIdx.x][2]); + if constexpr (!diag) Gjk_2 = tw; + + if constexpr (!diag) { + if constexpr (pure_ket) { + atomicAdd((Gjk + 2 * ldG), Gjk_0); + atomicAdd((Gjk + 0 * ldG), Gjk_1); + atomicAdd((Gjk + 1 * ldG), Gjk_2); + } else { + atomicAdd((Gjk + 0 * ldG), Gjk_0); + atomicAdd((Gjk + 1 * ldG), Gjk_1); + atomicAdd((Gjk + 2 * ldG), Gjk_2); + } + } + + if constexpr (pure_bra) { + atomicAdd((Gik + 2 * ldG), outBuffer[threadIdx.x][0]); + atomicAdd((Gik + 0 * ldG), outBuffer[threadIdx.x][1]); + atomicAdd((Gik + 1 * ldG), outBuffer[threadIdx.x][2]); + } else { + atomicAdd((Gik + 0 * ldG), outBuffer[threadIdx.x][0]); + atomicAdd((Gik + 1 * ldG), outBuffer[threadIdx.x][1]); + atomicAdd((Gik + 2 * ldG), outBuffer[threadIdx.x][2]); + } } } @@ -1152,14 +1203,26 @@ struct DeviceTask11 { }; 
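// Illustrative sketch (not from the patch): the alias templates and *_task_batched
// wrappers that follow take the new runtime `bool sph` flag (alongside the existing
// `bool swap`) and fan out to DeviceTask instantiations whose pure_bra / pure_ket
// flags are compile-time template parameters, so the per-point branches inside the
// kernels stay `if constexpr` and carry no runtime cost. A minimal version of that
// runtime-flag-to-template dispatch, with made-up names (Task, launch_task) standing
// in for the GauXC machinery:
#include <cstdio>

template <bool PureBra, bool PureKet>
struct Task {
  static void run() {
    // In the real kernels this selects between Cartesian and solid-harmonic
    // load/store orderings via `if constexpr (PureBra) ...`.
    std::printf("PureBra=%d PureKet=%d\n", int(PureBra), int(PureKet));
  }
};

// Host-side dispatcher: converts the runtime flags into template arguments once,
// mirroring the nested if(swap)/if(sph) blocks added in integral_*_task_batched.
inline void launch_task(bool pure_bra, bool pure_ket) {
  if (pure_bra) {
    if (pure_ket) Task<true , true >::run();
    else          Task<true , false>::run();
  } else {
    if (pure_ket) Task<false, true >::run();
    else          Task<false, false>::run();
  }
}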
template -using AM11 = DeviceTask11; - +using AM11_cart = DeviceTask11; template -using AM1 = DeviceTask11; +using AM1_cart = DeviceTask11; +template +using AM11_sph = DeviceTask11; +template +using AM1_sph = DeviceTask11; + + void integral_1_1_task_batched( + bool sph, size_t ntasks, size_t nsubtask, int max_primpair, size_t max_nsp, GauXC::XCDeviceTask* device_tasks, @@ -1179,16 +1242,26 @@ using AM1 = DeviceTask11( - nblocks, nthreads, max_primpair, stream, - ntasks, nsubtask, - device_tasks, task2sp, - (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, - sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, - boys_table ); + if(sph) + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); + else + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); } void integral_1_task_batched( + bool sph, size_t ntasks, size_t nsubtask, int max_primpair, size_t max_nsp, GauXC::XCDeviceTask* device_tasks, @@ -1208,12 +1281,21 @@ using AM1 = DeviceTask11( - nblocks, nthreads, max_primpair, stream, - ntasks, nsubtask, - device_tasks, task2sp, - (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, - sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, - boys_table ); + if(sph) + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); + else + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); } } diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1_1.hu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1_1.hu index f19342e4..222765fd 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1_1.hu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1_1.hu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -38,6 +42,7 @@ namespace XGPU { cudaStream_t stream); void integral_1_1_task_batched( + bool sph, size_t ntasks, size_t nsubtasks, int max_primpairs, size_t max_nsp, @@ -53,6 +58,7 @@ namespace XGPU { cudaStream_t stream); void integral_1_task_batched( + bool sph, size_t ntasks, size_t nsubtask, int max_primpairs, size_t max_nsp, GauXC::XCDeviceTask* device_tasks, diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2.cu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2.cu index 7c555bd6..e8318b2a 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2.hu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2.hu index 5c250e4b..09e63bd4 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2.hu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2.hu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_0.cu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_0.cu index 70631f8c..ecbfa6e3 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_0.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_0.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -450,7 +454,8 @@ using namespace GauXC; nsp, sp2task, device_tasks, boys_table ); } -template +template struct DeviceTask20 { static constexpr int max_primpair_shared_limit = 32; @@ -463,7 +468,7 @@ struct DeviceTask20 { static constexpr bool use_shared = (primpair_shared_limit > 0) && (primpair_shared_limit <= max_primpair_shared_limit); - static constexpr int num_warps = points_per_subtask / cuda::warp_size; + static constexpr int num_warps = points_per_subtask / GauXC::cuda::warp_size; // Cannot declare shared memory array with length 0 static constexpr int prim_buffer_size = (use_shared) ? 
num_warps * primpair_shared_limit : 1; @@ -491,12 +496,12 @@ struct DeviceTask20 { double *Gj = param.Gj; static constexpr bool use_shared = (primpair_shared_limit > 0); - static constexpr int num_warps = points_per_subtask / cuda::warp_size; + static constexpr int num_warps = points_per_subtask / GauXC::cuda::warp_size; // Cannot declare shared memory array with length 0 static constexpr int prim_buffer_size = (use_shared) ? num_warps * primpair_shared_limit : 1; - const int laneId = threadIdx.x % cuda::warp_size; - const int warpId __attribute__((unused)) = threadIdx.x / cuda::warp_size; + const int laneId = threadIdx.x % GauXC::cuda::warp_size; + const int warpId __attribute__((unused)) = threadIdx.x / GauXC::cuda::warp_size; __shared__ GauXC::PrimitivePair s_prim_pairs[prim_buffer_size] __attribute__((unused)); @@ -516,7 +521,7 @@ struct DeviceTask20 { double temp_4 = SCALAR_ZERO(); double temp_5 = SCALAR_ZERO(); - const int pointIndex = i * cuda::warp_size + laneId; + const int pointIndex = i * GauXC::cuda::warp_size + laneId; if (pointIndex < npts) { @@ -649,47 +654,90 @@ struct DeviceTask20 { SCALAR_TYPE const_value_w; SCALAR_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5; + SCALAR_TYPE Xik_0, Xik_1, Xik_2, Xik_3, Xik_4, Xik_5; + SCALAR_TYPE Xjk_0; + SCALAR_TYPE Gik_0, Gik_1, Gik_2, Gik_3, Gik_4, Gik_5; + + if constexpr (pure_bra) { + SCALAR_TYPE Xik_m2 = SCALAR_LOAD((Xik + 0*ldX)); + SCALAR_TYPE Xik_m1 = SCALAR_LOAD((Xik + 1*ldX)); + SCALAR_TYPE Xik_z0 = SCALAR_LOAD((Xik + 2*ldX)); + SCALAR_TYPE Xik_p1 = SCALAR_LOAD((Xik + 3*ldX)); + SCALAR_TYPE Xik_p2 = SCALAR_LOAD((Xik + 4*ldX)); + + ::cuda::std::tie(Xik_0, Xik_1, Xik_2, Xik_3, Xik_4, Xik_5) = + sph::itform_l2(Xik_m2, Xik_m1, Xik_z0, Xik_p1, Xik_p2); + } else { + Xik_0 = SCALAR_LOAD((Xik + 0*ldX)); + Xik_1 = SCALAR_LOAD((Xik + 1*ldX)); + Xik_2 = SCALAR_LOAD((Xik + 2*ldX)); + Xik_3 = SCALAR_LOAD((Xik + 3*ldX)); + Xik_4 = SCALAR_LOAD((Xik + 4*ldX)); + Xik_5 = SCALAR_LOAD((Xik + 5*ldX)); + } + + Xjk_0 = SCALAR_LOAD((Xjk + 0*ldX)); + X_ABp = 1.0; comb_m_i = 1.0; Y_ABp = 1.0; comb_n_j = 1.0; Z_ABp = 1.0; comb_p_k = 1.0; const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); - ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tx = Xik_0; + ty = Xjk_0; t0 = SCALAR_MUL(temp_0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_MUL(tx, t0); - atomicAdd((Gik + 0 * ldG), tz); + Gik_0 = tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_MUL(temp_1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - atomicAdd((Gik + 1 * ldG), tz); + Gik_1 = tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_MUL(temp_2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - atomicAdd((Gik + 2 * ldG), tz); + Gik_2 = tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_MUL(temp_3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - atomicAdd((Gik + 3 * ldG), tz); + Gik_3 = tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_MUL(temp_4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - atomicAdd((Gik + 4 * ldG), tz); + Gik_4 = tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_MUL(temp_5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - atomicAdd((Gik + 5 * ldG), tz); + Gik_5 = tz; + + if constexpr (pure_bra) { + SCALAR_TYPE Gik_m2, Gik_m1, Gik_z0, Gik_p1, Gik_p2; + + ::cuda::std::tie(Gik_m2, 
Gik_m1, Gik_z0, Gik_p1, Gik_p2) = + sph::tform_l2(Gik_0, Gik_1, Gik_2, Gik_3, Gik_4, Gik_5); + atomicAdd((Gik + 0 * ldG), Gik_m2); + atomicAdd((Gik + 1 * ldG), Gik_m1); + atomicAdd((Gik + 2 * ldG), Gik_z0); + atomicAdd((Gik + 3 * ldG), Gik_p1); + atomicAdd((Gik + 4 * ldG), Gik_p2); + } else { + atomicAdd((Gik + 0 * ldG), Gik_0); + atomicAdd((Gik + 1 * ldG), Gik_1); + atomicAdd((Gik + 2 * ldG), Gik_2); + atomicAdd((Gik + 3 * ldG), Gik_3); + atomicAdd((Gik + 4 * ldG), Gik_4); + atomicAdd((Gik + 5 * ldG), Gik_5); + } atomicAdd((Gjk + 0 * ldG), tw); } @@ -700,16 +748,28 @@ struct DeviceTask20 { }; template -using AM20_swap = DeviceTask20; +using AM20_swap_cart = DeviceTask20; template -using AM20 = DeviceTask20; +using AM20_cart = DeviceTask20; +template +using AM20_swap_sph = DeviceTask20; + +template +using AM20_sph = DeviceTask20; void integral_2_0_task_batched( bool swap, + bool sph, size_t ntasks, size_t nsubtask, int max_primpair, size_t max_nsp, GauXC::XCDeviceTask* device_tasks, @@ -731,21 +791,39 @@ using AM20 = DeviceTask20( - nblocks, nthreads, max_primpair, stream, - ntasks, nsubtask, - device_tasks, task2sp, - (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, - sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, - boys_table ); + if(sph) + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); + else + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); } else { - dev_integral_task_map_dispatcher( - nblocks, nthreads, max_primpair, stream, - ntasks, nsubtask, - device_tasks, task2sp, - (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, - sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, - boys_table ); + if(sph) + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); + else + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); } } } diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_0.hu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_0.hu index 7b261336..47b6cec0 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_0.hu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_0.hu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
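// Illustrative sketch (not from the patch): the d-shell (l = 2) paths above and
// below route solid-harmonic data through sph::itform_l2 / sph::tform_l2 so the
// Obara-Saika kernels can keep working on the six Cartesian components
// (xx, xy, xz, yy, yz, zz). Conceptually this is a 5x6 linear map and a matching
// 6x5 map back. The coefficients below follow one common real-solid-harmonic
// convention and sketch the reverse map as the plain transpose; the actual
// ordering/normalization baked into GauXC's sph:: helpers may differ, and the
// function names here are made up, so treat this purely as an illustration.
#include <array>
#include <cmath>

using Cart6 = std::array<double, 6>; // (xx, xy, xz, yy, yz, zz)
using Sph5  = std::array<double, 5>; // (m = -2, -1, 0, +1, +2)

// Cartesian -> spherical (the "tform"-like direction used when writing Gik back).
inline Sph5 d_cart_to_sph(const Cart6& c) {
  const double s3 = std::sqrt(3.0);
  return { s3 * c[1],                      // d_{-2} ~ sqrt(3) * xy
           s3 * c[4],                      // d_{-1} ~ sqrt(3) * yz
           c[5] - 0.5 * (c[0] + c[3]),     // d_{0}  ~ zz - (xx + yy)/2
           s3 * c[2],                      // d_{+1} ~ sqrt(3) * xz
           0.5 * s3 * (c[0] - c[3]) };     // d_{+2} ~ (sqrt(3)/2)(xx - yy)
}

// Spherical -> Cartesian via the transpose of the same matrix (the "itform"-like
// direction, used here to feed a spherical X block into a Cartesian kernel).
inline Cart6 d_sph_to_cart(const Sph5& s) {
  const double s3 = std::sqrt(3.0);
  return { -0.5 * s[2] + 0.5 * s3 * s[4],  // xx
            s3 * s[0],                     // xy
            s3 * s[3],                     // xz
           -0.5 * s[2] - 0.5 * s3 * s[4],  // yy
            s3 * s[1],                     // yz
            s[2] };                        // zz
}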
* * See LICENSE.txt for details */ @@ -33,6 +37,7 @@ namespace XGPU { void integral_2_0_task_batched( bool swap, + bool sph, size_t ntasks, size_t nsubtasks, int max_primpairs, size_t max_nsp, diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_1.cu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_1.cu index e5e84a01..153bcf7f 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_1.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_1.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -1197,7 +1201,8 @@ using namespace GauXC; } -template +template struct DeviceTask21 { static constexpr int max_primpair_shared_limit = 8; @@ -1210,7 +1215,7 @@ struct DeviceTask21 { static constexpr bool use_shared = (primpair_shared_limit > 0) && (primpair_shared_limit <= max_primpair_shared_limit); - static constexpr int num_warps = points_per_subtask / cuda::warp_size; + static constexpr int num_warps = points_per_subtask / GauXC::cuda::warp_size; // Cannot declare shared memory array with length 0 static constexpr int prim_buffer_size = (use_shared) ? num_warps * primpair_shared_limit : 1; @@ -1240,8 +1245,8 @@ struct DeviceTask21 { const double Y_AB = param.Y_AB; const double Z_AB = param.Z_AB; - const int laneId = threadIdx.x % cuda::warp_size; - const int warpId __attribute__((unused)) = threadIdx.x / cuda::warp_size; + const int laneId = threadIdx.x % GauXC::cuda::warp_size; + const int warpId __attribute__((unused)) = threadIdx.x / GauXC::cuda::warp_size; __shared__ GauXC::PrimitivePair s_prim_pairs[prim_buffer_size] __attribute__((unused)); @@ -1263,7 +1268,7 @@ struct DeviceTask21 { } for(int j = 0; j < 16; ++j) SCALAR_STORE((temp + j * blockDim.x + threadIdx.x), SCALAR_ZERO()); - const int pointIndex = i * cuda::warp_size + laneId; + const int pointIndex = i * GauXC::cuda::warp_size + laneId; if (pointIndex < npts) { const double point_x = s_task_data[pointIndex].x; @@ -1491,6 +1496,43 @@ struct DeviceTask21 { SCALAR_TYPE const_value_w; SCALAR_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5; + SCALAR_TYPE Xik_0, Xik_1, Xik_2, Xik_3, Xik_4, Xik_5; + SCALAR_TYPE Xjk_0, Xjk_1, Xjk_2; + SCALAR_TYPE Gjk_0, Gjk_1, Gjk_2; + + if constexpr (pure_bra) { + SCALAR_TYPE Xik_m2 = SCALAR_LOAD((Xik + 0*ldX)); + SCALAR_TYPE Xik_m1 = SCALAR_LOAD((Xik + 1*ldX)); + SCALAR_TYPE Xik_z0 = SCALAR_LOAD((Xik + 2*ldX)); + SCALAR_TYPE Xik_p1 = SCALAR_LOAD((Xik + 3*ldX)); + SCALAR_TYPE Xik_p2 = SCALAR_LOAD((Xik + 4*ldX)); + + ::cuda::std::tie(Xik_0, Xik_1, Xik_2, Xik_3, Xik_4, Xik_5) = + sph::itform_l2(Xik_m2, Xik_m1, Xik_z0, Xik_p1, Xik_p2); + } else { + Xik_0 = SCALAR_LOAD((Xik + 0*ldX)); + Xik_1 = SCALAR_LOAD((Xik + 1*ldX)); + Xik_2 = SCALAR_LOAD((Xik + 2*ldX)); + Xik_3 = SCALAR_LOAD((Xik + 3*ldX)); + Xik_4 = SCALAR_LOAD((Xik + 4*ldX)); + Xik_5 = SCALAR_LOAD((Xik + 5*ldX)); + } + + + if constexpr (pure_ket) { + Xjk_0 = SCALAR_LOAD((Xjk + 2*ldX)); + Xjk_1 = SCALAR_LOAD((Xjk + 0*ldX)); + Xjk_2 = SCALAR_LOAD((Xjk + 1*ldX)); + } else { + Xjk_0 = SCALAR_LOAD((Xjk + 0*ldX)); + Xjk_1 = SCALAR_LOAD((Xjk + 1*ldX)); + Xjk_2 = SCALAR_LOAD((Xjk 
+ 2*ldX)); + } + + Gjk_0 = 0; + Gjk_1 = 0; + Gjk_2 = 0; + X_ABp = 1.0; comb_m_i = 1.0; Y_ABp = 1.0; comb_n_j = 1.0; Z_ABp = 1.0; comb_p_k = 1.0; @@ -1499,53 +1541,47 @@ struct DeviceTask21 { /*** j = 0 ***/ - tx = SCALAR_LOAD((Xik + 0 * ldX)); - ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tx = Xik_0; + ty = Xjk_0; t0 = SCALAR_LOAD((temp + 6 * blockDim.x + threadIdx.x)); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_MUL(tx, t0); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 7 * blockDim.x + threadIdx.x)); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 8 * blockDim.x + threadIdx.x)); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 9 * blockDim.x + threadIdx.x)); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 10 * blockDim.x + threadIdx.x)); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 11 * blockDim.x + threadIdx.x)); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); @@ -1554,54 +1590,48 @@ struct DeviceTask21 { const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_LOAD((temp + 0 * blockDim.x + threadIdx.x)); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 1 * blockDim.x + threadIdx.x)); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 2 * blockDim.x + threadIdx.x)); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 3 * blockDim.x + threadIdx.x)); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 4 * blockDim.x + threadIdx.x)); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 5 * blockDim.x + threadIdx.x)); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); 
outBuffer[5] += tz; - atomicAdd((Gjk + 0 * ldG), tw); + Gjk_0 += tw; /*** j = 1 ***/ @@ -1610,53 +1640,47 @@ struct DeviceTask21 { Z_ABp = 1.0; comb_p_k = 1.0; const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); - ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tx = Xik_0; + ty = Xjk_1; t0 = SCALAR_LOAD((temp + 7 * blockDim.x + threadIdx.x)); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_MUL(tx, t0); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 9 * blockDim.x + threadIdx.x)); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 10 * blockDim.x + threadIdx.x)); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 12 * blockDim.x + threadIdx.x)); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 13 * blockDim.x + threadIdx.x)); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 14 * blockDim.x + threadIdx.x)); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); @@ -1664,54 +1688,48 @@ struct DeviceTask21 { const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_LOAD((temp + 0 * blockDim.x + threadIdx.x)); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 1 * blockDim.x + threadIdx.x)); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 2 * blockDim.x + threadIdx.x)); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 3 * blockDim.x + threadIdx.x)); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 4 * blockDim.x + threadIdx.x)); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 5 * blockDim.x + threadIdx.x)); t5 = SCALAR_MUL(t5, const_value_w); tz = 
SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; - atomicAdd((Gjk + 1 * ldG), tw); + Gjk_1 += tw; /*** j = 2 ***/ X_ABp = 1.0; comb_m_i = 1.0; @@ -1720,114 +1738,126 @@ struct DeviceTask21 { const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); - ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tx = Xik_0; + ty = Xjk_2; t0 = SCALAR_LOAD((temp + 8 * blockDim.x + threadIdx.x)); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_MUL(tx, t0); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 10 * blockDim.x + threadIdx.x)); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 11 * blockDim.x + threadIdx.x)); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 13 * blockDim.x + threadIdx.x)); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 14 * blockDim.x + threadIdx.x)); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 15 * blockDim.x + threadIdx.x)); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_LOAD((temp + 0 * blockDim.x + threadIdx.x)); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 1 * blockDim.x + threadIdx.x)); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 2 * blockDim.x + threadIdx.x)); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 3 * blockDim.x + threadIdx.x)); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 4 * blockDim.x + threadIdx.x)); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 5 * blockDim.x + threadIdx.x)); t5 = 
SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; - atomicAdd((Gjk + 2 * ldG), tw); - - atomicAdd((Gik + 0 * ldG), outBuffer[0]); - atomicAdd((Gik + 1 * ldG), outBuffer[1]); - atomicAdd((Gik + 2 * ldG), outBuffer[2]); - atomicAdd((Gik + 3 * ldG), outBuffer[3]); - atomicAdd((Gik + 4 * ldG), outBuffer[4]); - atomicAdd((Gik + 5 * ldG), outBuffer[5]); + //atomicAdd((Gjk + 2 * ldG), tw); + Gjk_2 += tw; + + if constexpr (pure_ket) { + atomicAdd((Gjk + 2 * ldG), Gjk_0); + atomicAdd((Gjk + 0 * ldG), Gjk_1); + atomicAdd((Gjk + 1 * ldG), Gjk_2); + } else { + atomicAdd((Gjk + 0 * ldG), Gjk_0); + atomicAdd((Gjk + 1 * ldG), Gjk_1); + atomicAdd((Gjk + 2 * ldG), Gjk_2); + } + + if constexpr (pure_bra) { + SCALAR_TYPE Gik_m2, Gik_m1, Gik_z0, Gik_p1, Gik_p2; + + ::cuda::std::tie(Gik_m2, Gik_m1, Gik_z0, Gik_p1, Gik_p2) = + sph::tform_l2(outBuffer[0], outBuffer[1], outBuffer[2], + outBuffer[3], outBuffer[4], outBuffer[5]); + atomicAdd((Gik + 0 * ldG), Gik_m2); + atomicAdd((Gik + 1 * ldG), Gik_m1); + atomicAdd((Gik + 2 * ldG), Gik_z0); + atomicAdd((Gik + 3 * ldG), Gik_p1); + atomicAdd((Gik + 4 * ldG), Gik_p2); + } else { + atomicAdd((Gik + 0 * ldG), outBuffer[0]); + atomicAdd((Gik + 1 * ldG), outBuffer[1]); + atomicAdd((Gik + 2 * ldG), outBuffer[2]); + atomicAdd((Gik + 3 * ldG), outBuffer[3]); + atomicAdd((Gik + 4 * ldG), outBuffer[4]); + atomicAdd((Gik + 5 * ldG), outBuffer[5]); + } } } } @@ -1836,15 +1866,38 @@ struct DeviceTask21 { }; template -using AM21_swap = DeviceTask21; +using AM21_swap_cart = DeviceTask21; + +template +using AM21_cart = DeviceTask21; + +template +using AM21_swap_sc = DeviceTask21; + +template +using AM21_sc = DeviceTask21; + +template +using AM21_swap_sph = DeviceTask21; template -using AM21 = DeviceTask21; +using AM21_sph = DeviceTask21; void integral_2_1_task_batched( bool swap, + bool sph_2, bool sph_1, size_t ntasks, size_t nsubtask, int max_primpair, size_t max_nsp, GauXC::XCDeviceTask* device_tasks, @@ -1865,21 +1918,55 @@ using AM21 = DeviceTask21( - nblocks, nthreads, max_primpair, stream, - ntasks, nsubtask, - device_tasks, task2sp, - (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, - sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, - boys_table ); + if(sph_2 and sph_1) + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); + else if(sph_2) + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); + else + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); } else { - dev_integral_task_map_dispatcher( - nblocks, nthreads, max_primpair, stream, - ntasks, nsubtask, - device_tasks, task2sp, - (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, - sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, - boys_table ); + if(sph_2 and sph_1) + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, 
prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); + else if(sph_2) + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); + else + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); } } } diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_1.hu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_1.hu index e501329c..e0038e32 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_1.hu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_1.hu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -39,6 +43,7 @@ namespace XGPU { void integral_2_1_task_batched( bool swap, + bool sph_2, bool sph_1, size_t ntasks, size_t nsubtasks, int max_primpairs, size_t max_nsp, diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_2.cu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_2.cu index 2216ede1..88c18b71 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_2.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_2.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -3106,7 +3110,8 @@ using namespace GauXC; } -template +template struct DeviceTask22 { static constexpr int max_primpair_shared_limit = 8; @@ -3120,7 +3125,7 @@ struct DeviceTask22 { static constexpr bool use_shared = (primpair_shared_limit > 0) && (primpair_shared_limit <= max_primpair_shared_limit); - static constexpr int num_warps = points_per_subtask / cuda::warp_size; + static constexpr int num_warps = points_per_subtask / GauXC::cuda::warp_size; // Cannot declare shared memory array with length 0 static constexpr int prim_buffer_size = (use_shared) ? 
num_warps * primpair_shared_limit : 1; @@ -3150,8 +3155,8 @@ struct DeviceTask22 { const double Y_AB = param.Y_AB; const double Z_AB = param.Z_AB; - const int laneId = threadIdx.x % cuda::warp_size; - const int warpId __attribute__((unused)) = threadIdx.x / cuda::warp_size; + const int laneId = threadIdx.x % GauXC::cuda::warp_size; + const int warpId __attribute__((unused)) = threadIdx.x / GauXC::cuda::warp_size; __shared__ GauXC::PrimitivePair s_prim_pairs[prim_buffer_size] __attribute__((unused)); @@ -3173,7 +3178,7 @@ struct DeviceTask22 { for(int j = 0; j < 31; ++j) SCALAR_STORE((temp + j), SCALAR_ZERO()); - const int pointIndex = i * cuda::warp_size + laneId; + const int pointIndex = i * GauXC::cuda::warp_size + laneId; if (pointIndex < npts) { const double point_x = s_task_data[pointIndex].x; @@ -3592,58 +3597,99 @@ struct DeviceTask22 { SCALAR_TYPE const_value_w; SCALAR_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5; + SCALAR_TYPE Xik_0, Xik_1, Xik_2, Xik_3, Xik_4, Xik_5; + SCALAR_TYPE Xjk_0, Xjk_1, Xjk_2, Xjk_3, Xjk_4, Xjk_5; + SCALAR_TYPE Gjk_0, Gjk_1, Gjk_2, Gjk_3, Gjk_4, Gjk_5; + + if constexpr (pure_bra) { + SCALAR_TYPE Xik_m2 = SCALAR_LOAD((Xik + 0*ldX)); + SCALAR_TYPE Xik_m1 = SCALAR_LOAD((Xik + 1*ldX)); + SCALAR_TYPE Xik_z0 = SCALAR_LOAD((Xik + 2*ldX)); + SCALAR_TYPE Xik_p1 = SCALAR_LOAD((Xik + 3*ldX)); + SCALAR_TYPE Xik_p2 = SCALAR_LOAD((Xik + 4*ldX)); + + ::cuda::std::tie(Xik_0, Xik_1, Xik_2, Xik_3, Xik_4, Xik_5) = + sph::itform_l2(Xik_m2, Xik_m1, Xik_z0, Xik_p1, Xik_p2); + } else { + Xik_0 = SCALAR_LOAD((Xik + 0*ldX)); + Xik_1 = SCALAR_LOAD((Xik + 1*ldX)); + Xik_2 = SCALAR_LOAD((Xik + 2*ldX)); + Xik_3 = SCALAR_LOAD((Xik + 3*ldX)); + Xik_4 = SCALAR_LOAD((Xik + 4*ldX)); + Xik_5 = SCALAR_LOAD((Xik + 5*ldX)); + } + + if constexpr (pure_ket) { + SCALAR_TYPE Xjk_m2 = SCALAR_LOAD((Xjk + 0*ldX)); + SCALAR_TYPE Xjk_m1 = SCALAR_LOAD((Xjk + 1*ldX)); + SCALAR_TYPE Xjk_z0 = SCALAR_LOAD((Xjk + 2*ldX)); + SCALAR_TYPE Xjk_p1 = SCALAR_LOAD((Xjk + 3*ldX)); + SCALAR_TYPE Xjk_p2 = SCALAR_LOAD((Xjk + 4*ldX)); + + ::cuda::std::tie(Xjk_0, Xjk_1, Xjk_2, Xjk_3, Xjk_4, Xjk_5) = + sph::itform_l2(Xjk_m2, Xjk_m1, Xjk_z0, Xjk_p1, Xjk_p2); + } else { + Xjk_0 = SCALAR_LOAD((Xjk + 0*ldX)); + Xjk_1 = SCALAR_LOAD((Xjk + 1*ldX)); + Xjk_2 = SCALAR_LOAD((Xjk + 2*ldX)); + Xjk_3 = SCALAR_LOAD((Xjk + 3*ldX)); + Xjk_4 = SCALAR_LOAD((Xjk + 4*ldX)); + Xjk_5 = SCALAR_LOAD((Xjk + 5*ldX)); + } + + Gjk_0 = 0; + Gjk_1 = 0; + Gjk_2 = 0; + Gjk_3 = 0; + Gjk_4 = 0; + Gjk_5 = 0; + X_ABp = 1.0; comb_m_i = 1.0; Y_ABp = 1.0; comb_n_j = 1.0; Z_ABp = 1.0; comb_p_k = 1.0; const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); - ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tx = Xik_0; + ty = Xjk_0; t0 = SCALAR_LOAD((temp + 16 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_MUL(tx, t0); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 17 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 18 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 19 )); t3 = SCALAR_MUL(t3, const_value_w); 
tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 20 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 21 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); @@ -3651,52 +3697,46 @@ struct DeviceTask22 { Z_ABp = 1.0; comb_p_k = 1.0; const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_LOAD((temp + 6 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 7 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 8 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 9 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 10 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 11 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); @@ -3704,54 +3744,49 @@ struct DeviceTask22 { Z_ABp = 1.0; comb_p_k = 1.0; const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_LOAD((temp + 0 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 1 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 2 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 3 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 4 )); t4 = SCALAR_MUL(t4, const_value_w); 
tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 5 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; - if constexpr (!diag) atomicAdd((Gjk + 0 * ldG), tw); + //if constexpr (!diag) atomicAdd((Gjk + 0 * ldG), tw); + if constexpr (!diag) Gjk_0 += tw; @@ -3760,105 +3795,93 @@ struct DeviceTask22 { Z_ABp = 1.0; comb_p_k = 1.0; const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); - ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tx = Xik_0; + ty = Xjk_1; t0 = SCALAR_LOAD((temp + 17 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_MUL(tx, t0); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 19 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 20 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 22 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 23 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 24 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); Z_ABp = 1.0; comb_p_k = 1.0; const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_LOAD((temp + 6 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 7 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 8 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 9 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 10 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = 
SCALAR_LOAD((temp + 11 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); @@ -3866,106 +3889,95 @@ struct DeviceTask22 { Z_ABp = 1.0; comb_p_k = 1.0; const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_LOAD((temp + 7 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 9 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 10 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 12 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 13 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 14 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); Z_ABp = 1.0; comb_p_k = 1.0; const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_LOAD((temp + 0 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 1 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 2 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 3 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 4 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 5 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; - if constexpr (!diag) atomicAdd((Gjk + 1 * ldG), tw); + //if constexpr (!diag) atomicAdd((Gjk + 1 * 
ldG), tw); + if constexpr (!diag) Gjk_1 += tw; @@ -3975,104 +3987,92 @@ struct DeviceTask22 { Z_ABp = 1.0; comb_p_k = 1.0; const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); - ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tx = Xik_0; + ty = Xjk_2; t0 = SCALAR_LOAD((temp + 18 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_MUL(tx, t0); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 20 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 21 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 23 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 24 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 25 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_LOAD((temp + 6 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 7 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 8 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 9 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 10 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 11 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); @@ -4080,105 +4080,94 @@ struct DeviceTask22 { Z_ABp = 1.0; comb_p_k = 1.0; const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - 
tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_LOAD((temp + 8 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 10 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 11 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 13 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 14 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 15 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_LOAD((temp + 0 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 1 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 2 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 3 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 4 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 5 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; - if constexpr (!diag) atomicAdd((Gjk + 2 * ldG), tw); + //if constexpr (!diag) atomicAdd((Gjk + 2 * ldG), tw); + if constexpr (!diag) Gjk_2 += tw; @@ -4189,159 +4178,142 @@ struct DeviceTask22 { Z_ABp = 1.0; comb_p_k = 1.0; const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); - ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tx = Xik_0; + ty = Xjk_3; t0 = SCALAR_LOAD((temp + 19 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_MUL(tx, t0); - //atomicAdd((Gik + 0 * 
ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 22 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 23 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 26 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 27 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 28 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); Z_ABp = 1.0; comb_p_k = 1.0; const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_LOAD((temp + 7 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 9 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 10 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 12 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 13 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 14 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); Z_ABp = 1.0; comb_p_k = 1.0; const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_LOAD((temp + 0 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 1 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = 
SCALAR_LOAD((temp + 2 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 3 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 4 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 5 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; - if constexpr (!diag) atomicAdd((Gjk + 3 * ldG), tw); + //if constexpr (!diag) atomicAdd((Gjk + 3 * ldG), tw); + if constexpr (!diag) Gjk_3 += tw; @@ -4350,209 +4322,186 @@ struct DeviceTask22 { Z_ABp = 1.0; comb_p_k = 1.0; const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); - ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tx = Xik_0; + ty = Xjk_4; t0 = SCALAR_LOAD((temp + 20 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_MUL(tx, t0); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 23 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 24 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 27 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 28 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 29 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_LOAD((temp + 7 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 9 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 10 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik 
+ 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 12 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 13 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 14 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); Z_ABp = 1.0; comb_p_k = 1.0; const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_LOAD((temp + 8 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 10 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 11 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 13 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 14 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 15 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_LOAD((temp + 0 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 1 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 2 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 3 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 4 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, 
t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 5 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; - if constexpr (!diag) atomicAdd((Gjk + 4 * ldG), tw); + //if constexpr (!diag) atomicAdd((Gjk + 4 * ldG), tw); + if constexpr (!diag) Gjk_4 += tw; @@ -4562,164 +4511,181 @@ struct DeviceTask22 { Z_ABp = 1.0; comb_p_k = 1.0; const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); - ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tx = Xik_0; + ty = Xjk_5; t0 = SCALAR_LOAD((temp + 21 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_MUL(tx, t0); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 24 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 25 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 28 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 29 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 30 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_LOAD((temp + 8 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 10 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 11 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 13 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 14 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 15 )); t5 = SCALAR_MUL(t5, 
const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_LOAD((temp + 0 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 1 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 2 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 3 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 4 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 5 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; - if constexpr (!diag) atomicAdd((Gjk + 5 * ldG), tw); - - atomicAdd((Gik + 0 * ldG), outBuffer[0]); - atomicAdd((Gik + 1 * ldG), outBuffer[1]); - atomicAdd((Gik + 2 * ldG), outBuffer[2]); - atomicAdd((Gik + 3 * ldG), outBuffer[3]); - atomicAdd((Gik + 4 * ldG), outBuffer[4]); - atomicAdd((Gik + 5 * ldG), outBuffer[5]); + //if constexpr (!diag) atomicAdd((Gjk + 5 * ldG), tw); + if constexpr (!diag) Gjk_5 += tw; + + if constexpr (!diag) { + if constexpr (pure_ket) { + SCALAR_TYPE Gjk_m2, Gjk_m1, Gjk_z0, Gjk_p1, Gjk_p2; + + ::cuda::std::tie(Gjk_m2, Gjk_m1, Gjk_z0, Gjk_p1, Gjk_p2) = + sph::tform_l2(Gjk_0, Gjk_1, Gjk_2, Gjk_3, Gjk_4, Gjk_5); + atomicAdd((Gjk + 0 * ldG), Gjk_m2); + atomicAdd((Gjk + 1 * ldG), Gjk_m1); + atomicAdd((Gjk + 2 * ldG), Gjk_z0); + atomicAdd((Gjk + 3 * ldG), Gjk_p1); + atomicAdd((Gjk + 4 * ldG), Gjk_p2); + } else { + atomicAdd((Gjk + 0 * ldG), Gjk_0); + atomicAdd((Gjk + 1 * ldG), Gjk_1); + atomicAdd((Gjk + 2 * ldG), Gjk_2); + atomicAdd((Gjk + 3 * ldG), Gjk_3); + atomicAdd((Gjk + 4 * ldG), Gjk_4); + atomicAdd((Gjk + 5 * ldG), Gjk_5); + } + } + + if constexpr (pure_bra) { + SCALAR_TYPE Gik_m2, Gik_m1, Gik_z0, Gik_p1, Gik_p2; + + ::cuda::std::tie(Gik_m2, Gik_m1, Gik_z0, Gik_p1, Gik_p2) = + sph::tform_l2(outBuffer[0], outBuffer[1], outBuffer[2], + outBuffer[3], outBuffer[4], outBuffer[5]); + atomicAdd((Gik + 0 * ldG), Gik_m2); + atomicAdd((Gik + 1 * ldG), Gik_m1); + atomicAdd((Gik + 2 * ldG), Gik_z0); + atomicAdd((Gik + 3 * ldG), Gik_p1); + atomicAdd((Gik + 4 * ldG), Gik_p2); + } else { + atomicAdd((Gik + 0 * ldG), outBuffer[0]); + atomicAdd((Gik + 1 * ldG), outBuffer[1]); + atomicAdd((Gik + 2 * ldG), outBuffer[2]); + atomicAdd((Gik + 3 * ldG), outBuffer[3]); + atomicAdd((Gik + 4 * ldG), outBuffer[4]); + atomicAdd((Gik + 5 * ldG), outBuffer[5]); + } } } } @@ -4728,14 +4694,27 @@ struct 
DeviceTask22 { }; template -using AM22 = DeviceTask22; +using AM22_cart = DeviceTask22; + +template +using AM2_cart = DeviceTask22; + +template +using AM22_sph = DeviceTask22; template -using AM2 = DeviceTask22; +using AM2_sph = DeviceTask22; void integral_2_2_task_batched( + bool sph, size_t ntasks, size_t nsubtask, int max_primpair, size_t max_nsp, GauXC::XCDeviceTask* device_tasks, @@ -4755,16 +4734,26 @@ using AM2 = DeviceTask22( - nblocks, nthreads, max_primpair, stream, - ntasks, nsubtask, - device_tasks, task2sp, - (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, - sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, - boys_table ); + if(sph) + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); + else + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); } void integral_2_task_batched( + bool sph, size_t ntasks, size_t nsubtask, int max_primpair, size_t max_nsp, GauXC::XCDeviceTask* device_tasks, @@ -4784,13 +4773,22 @@ using AM2 = DeviceTask22( - nblocks, nthreads, max_primpair, stream, - ntasks, nsubtask, - device_tasks, task2sp, - (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, - sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, - boys_table ); + if(sph) + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); + else + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); } } diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_2.hu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_2.hu index 4c0c42a6..12fe23e3 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_2.hu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_2.hu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -38,6 +42,7 @@ namespace XGPU { cudaStream_t stream); void integral_2_2_task_batched( + bool sph, size_t ntasks, size_t nsubtasks, int max_primpairs, size_t max_nsp, @@ -53,6 +58,7 @@ namespace XGPU { cudaStream_t stream); void integral_2_task_batched( + bool sph, size_t ntasks, size_t nsubtask, int max_primpairs, size_t max_nsp, GauXC::XCDeviceTask* device_tasks, diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/obara_saika_integrals.cu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/obara_saika_integrals.cu index dc975f55..267c195a 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/obara_saika_integrals.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/obara_saika_integrals.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/task_map_base.hu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/task_map_base.hu index ff88fb9f..df85fa52 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/task_map_base.hu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/task_map_base.hu @@ -1,8 +1,47 @@ #include "device_specific/cuda_device_constants.hpp" #include "../../cuda_aos_scheme1.hpp" +#include +#include namespace XGPU { +namespace constants { + constexpr double sqrt_3 = 1.7320508075688772; +} + +namespace sph { + +__inline__ __device__ auto tform_l2( + double xx, double xy, double xz, double yy, double yz, double zz +) { + + double m2 = constants::sqrt_3 * xy; + double m1 = constants::sqrt_3 * yz; + double z0 = zz - 0.5 * (xx + yy); + double p1 = constants::sqrt_3 * xz; + double p2 = constants::sqrt_3 * 0.5 * (xx - yy); + + return cuda::std::make_tuple(m2, m1, z0, p1, p2); + +} + +__inline__ __device__ auto itform_l2( + double m2, double m1, double z0, double p1, double p2 +) { + + double xx = 0.5 * (-z0 + constants::sqrt_3 * p2); + double xy = constants::sqrt_3 * m2; + double xz = constants::sqrt_3 * p1; + double yy = -0.5 * (z0 + constants::sqrt_3 * p2); + double yz = constants::sqrt_3 * m1; + double zz = z0; + + return cuda::std::make_tuple(xx,xy,xz,yy,yz,zz); + +} + +} + using namespace GauXC; @@ -15,7 +54,7 @@ __inline__ __device__ void load_primpair_shared( int32_t* dst = (int32_t*) dst_t; const int num_transfers = n * sizeof(GauXC::PrimitivePair) / sizeof(int32_t); - for (int i = laneId; i < num_transfers; i += cuda::warp_size) { + for (int i = laneId; i < num_transfers; i += GauXC::cuda::warp_size) { dst[i] = src[i]; } } @@ -110,7 +149,7 @@ void task_map_kernel( __shared__ double4 s_task_data[points_per_subtask]; - const int warpId = threadIdx.x / cuda::warp_size; + const int warpId = threadIdx.x / GauXC::cuda::warp_size; const int i_subtask = blockIdx.x; const int i_task = subtasks[i_subtask].x; diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/test/CMakeLists.txt b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/test/CMakeLists.txt index bd78bb84..93b1b589 100644 --- 
a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/test/CMakeLists.txt +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/test/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_base.cxx b/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_base.cxx index 3c11a5fe..a51e6bcd 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_base.cxx +++ b/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_base.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -16,12 +20,10 @@ namespace GauXC { -void AoSScheme1CUTLASSBase::eval_xmat(double fac, XCDeviceData* _data, bool do_grad, density_id den_id ){ - - if( do_grad ) GAUXC_GENERIC_EXCEPTION("CUTLASS + X Gradient NYI"); - if( den_id != DEN_S ) GAUXC_GENERIC_EXCEPTION("CUTLASS + U/GKS NYI"); - - auto* data = dynamic_cast(_data); +// Common implementation for eval_xmat and eval_xmat_trial +template +void AoSScheme1CUTLASSBase::eval_xmat_impl(double fac, XCDeviceData* _data, bool do_grad, density_id den_id) { + auto* data = dynamic_cast(_data); if( !data ) GAUXC_BAD_LWD_DATA_CAST(); if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); @@ -34,33 +36,91 @@ void AoSScheme1CUTLASSBase::eval_xmat(double fac, XCDeviceData* _data, bool do_g const auto submat_block_size = data->get_submat_chunk_size( nbf, 0 ); auto static_stack = data->static_stack; auto aos_stack = data->aos_stack; - sym_pack_submat( ntasks, aos_stack.device_tasks, static_stack.dmat_s_device, + + double* dmat_ptr; + if constexpr (is_trial) { + dmat_ptr = static_stack.tden_selector(den_id); + // now screened trial density matrix is stored in aos_stack.device_tasks[itask].nbe_scr + } else { + dmat_ptr = static_stack.den_selector(den_id); + } + + sym_pack_submat( ntasks, aos_stack.device_tasks, dmat_ptr, nbf, submat_block_size, data->device_backend_->queue() ); auto cutlass_stack = data->cutlass_stack; + double** dmat_array; + if constexpr (is_trial) { + dmat_array = cutlass_stack.tdmat_array(den_id); + } else { + dmat_array = cutlass_stack.dmat_array(den_id); + } cutlass_gemm( cutlass_stack.problem_sizes_device, data->problem_sizes_host.data(), ntasks, - cutlass_stack.bf_array_device, cutlass_stack.dmat_array_device, + cutlass_stack.bf_array_device, dmat_array, cutlass_stack.zmat_array_device, cutlass_stack.zmat_array_device, cutlass_stack.ld64_bf_array_device, cutlass_stack.ld64_dmat_array_device, cutlass_stack.ld64_zmat_array_device, cutlass_stack.ld64_zmat_array_device, fac, 0.0, data->device_backend_->queue() ); + + if(do_grad) { + cutlass_gemm( + cutlass_stack.problem_sizes_device, + data->problem_sizes_host.data(), + ntasks, + cutlass_stack.bfx_array_device, dmat_array, + 
cutlass_stack.xmat_x_array_device, cutlass_stack.xmat_x_array_device, + cutlass_stack.ld64_bf_array_device, cutlass_stack.ld64_dmat_array_device, + cutlass_stack.ld64_zmat_array_device, cutlass_stack.ld64_zmat_array_device, + fac, 0.0, + data->device_backend_->queue() + ); + cutlass_gemm( + cutlass_stack.problem_sizes_device, + data->problem_sizes_host.data(), + ntasks, + cutlass_stack.bfy_array_device, dmat_array, + cutlass_stack.xmat_y_array_device, cutlass_stack.xmat_y_array_device, + cutlass_stack.ld64_bf_array_device, cutlass_stack.ld64_dmat_array_device, + cutlass_stack.ld64_zmat_array_device, cutlass_stack.ld64_zmat_array_device, + fac, 0.0, + data->device_backend_->queue() + ); + cutlass_gemm( + cutlass_stack.problem_sizes_device, + data->problem_sizes_host.data(), + ntasks, + cutlass_stack.bfz_array_device, dmat_array, + cutlass_stack.xmat_z_array_device, cutlass_stack.xmat_z_array_device, + cutlass_stack.ld64_bf_array_device, cutlass_stack.ld64_dmat_array_device, + cutlass_stack.ld64_zmat_array_device, cutlass_stack.ld64_zmat_array_device, + fac, 0.0, + data->device_backend_->queue() + ); + } +} + +void AoSScheme1CUTLASSBase::eval_xmat(double fac, XCDeviceData* _data, bool do_grad, density_id den_id ) { + eval_xmat_impl(fac, _data, do_grad, den_id); } -void AoSScheme1CUTLASSBase::inc_vxc( XCDeviceData* _data, density_id den_id, bool do_m){ +void AoSScheme1CUTLASSBase::eval_xmat_trial(double fac, XCDeviceData* _data, bool do_grad, density_id den_id ) { + eval_xmat_impl(fac, _data, do_grad, den_id); +} - auto* data = dynamic_cast(_data); + +// Common implementation for inc_vxc and inc_fxc +template +void AoSScheme1CUTLASSBase::inc_potential_impl(XCDeviceData* _data, density_id den_id, bool do_m) { + auto* data = dynamic_cast(_data); if( !data ) GAUXC_BAD_LWD_DATA_CAST(); if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); - if(do_m) GAUXC_GENERIC_EXCEPTION("CUTLASS + MGGA NYI"); - if( den_id != DEN_S ) GAUXC_GENERIC_EXCEPTION("CUTLASS + U/GKS NYI"); - auto& tasks = data->host_device_tasks; const auto ntasks = tasks.size(); @@ -76,15 +136,66 @@ void AoSScheme1CUTLASSBase::inc_vxc( XCDeviceData* _data, density_id den_id, bo 1.0, 0.0, data->device_backend_->queue() ); - - // Increment global VXC + if(do_m) { + cutlass_syr2k( + cutlass_stack.syr2k_sizes_device, + data->syr2k_sizes_host.data(), + ntasks, + cutlass_stack.bfx_array_device, cutlass_stack.xmat_x_array_device, + cutlass_stack.vmat_array_device, cutlass_stack.vmat_array_device, + cutlass_stack.ld64_bf_array_device, cutlass_stack.ld64_zmat_array_device, + cutlass_stack.ld64_vmat_array_device, cutlass_stack.ld64_vmat_array_device, + 1.0, 1.0, + data->device_backend_->queue() + ); + cutlass_syr2k( + cutlass_stack.syr2k_sizes_device, + data->syr2k_sizes_host.data(), + ntasks, + cutlass_stack.bfy_array_device, cutlass_stack.xmat_y_array_device, + cutlass_stack.vmat_array_device, cutlass_stack.vmat_array_device, + cutlass_stack.ld64_bf_array_device, cutlass_stack.ld64_zmat_array_device, + cutlass_stack.ld64_vmat_array_device, cutlass_stack.ld64_vmat_array_device, + 1.0, 1.0, + data->device_backend_->queue() + ); + cutlass_syr2k( + cutlass_stack.syr2k_sizes_device, + data->syr2k_sizes_host.data(), + ntasks, + cutlass_stack.bfz_array_device, cutlass_stack.xmat_z_array_device, + cutlass_stack.vmat_array_device, cutlass_stack.vmat_array_device, + cutlass_stack.ld64_bf_array_device, cutlass_stack.ld64_zmat_array_device, + cutlass_stack.ld64_vmat_array_device, cutlass_stack.ld64_vmat_array_device, + 1.0, 1.0, + 
data->device_backend_->queue() + ); + } + + // Increment global VXC/FXC const auto nbf = data->global_dims.nbf; const auto submat_block_size = data->get_submat_chunk_size( nbf, 0 ); auto static_stack = data->static_stack; auto aos_stack = data->aos_stack; - sym_task_inc_potential( ntasks, aos_stack.device_tasks, - static_stack.vxc_s_device, nbf, submat_block_size, - data->device_backend_->queue() ); + + double* potential_ptr; + if constexpr (is_fxc) { + potential_ptr = static_stack.fxc_selector(den_id); + // cutlass_stack.vmat_array_device points to aos_stack.device_tasks[itask].nbe_scr + } else { + potential_ptr = static_stack.vxc_selector(den_id); + } + + sym_task_inc_potential( ntasks, aos_stack.device_tasks, potential_ptr, nbf, + submat_block_size, data->device_backend_->queue() ); +} + +void AoSScheme1CUTLASSBase::inc_vxc( XCDeviceData* _data, density_id den_id, bool do_m ) { + inc_potential_impl(_data, den_id, do_m); +} + +void AoSScheme1CUTLASSBase::inc_fxc( XCDeviceData* _data, density_id den_id, bool do_m ) { + inc_potential_impl(_data, den_id, do_m); } } diff --git a/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_base.hpp b/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_base.hpp index 0f3ec69e..80b99116 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_base.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_base.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
 *
 * See LICENSE.txt for details
 */
@@ -17,8 +21,16 @@ namespace GauXC {
 
 struct AoSScheme1CUTLASSBase : public AoSScheme1Base {
 
+  template <bool is_trial>
+  void eval_xmat_impl(double fac, XCDeviceData*, bool do_grad, density_id );
+  template <bool is_fxc>
+  void inc_potential_impl(XCDeviceData*, density_id, bool do_m);
+
+
   void eval_xmat(double fac, XCDeviceData*, bool do_grad, density_id ) override final;
+  void eval_xmat_trial(double fac, XCDeviceData*, bool do_grad, density_id ) override final;
   void inc_vxc( XCDeviceData*, density_id, bool ) override final;
+  void inc_fxc( XCDeviceData*, density_id, bool ) override final;
 
   struct Data;
 
@@ -32,11 +44,44 @@ struct AoSScheme1CUTLASSBase::Data : public AoSScheme1Base::Data {
   using base_type::device_buffer_t;
 
   struct cutlass_data {
-    double** dmat_array_device = nullptr;
+    double** dmat_s_array_device = nullptr;
+    double** dmat_z_array_device = nullptr;
+    double** dmat_y_array_device = nullptr;
+    double** dmat_x_array_device = nullptr;
     double** vmat_array_device = nullptr;
     double** zmat_array_device = nullptr;
     double** bf_array_device = nullptr;
-
+    double** bfx_array_device = nullptr;
+    double** bfy_array_device = nullptr;
+    double** bfz_array_device = nullptr;
+    double** xmat_x_array_device = nullptr;
+    double** xmat_y_array_device = nullptr;
+    double** xmat_z_array_device = nullptr;
+
+    double** tdmat_s_array_device = nullptr;
+    double** tdmat_z_array_device = nullptr;
+    double** tdmat_y_array_device = nullptr;
+    double** tdmat_x_array_device = nullptr;
+
+    inline double** dmat_array(density_id id) {
+      switch(id) {
+        case DEN_S: return dmat_s_array_device;
+        case DEN_Z: return dmat_z_array_device;
+        case DEN_Y: return dmat_y_array_device;
+        case DEN_X: return dmat_x_array_device;
+        default: GAUXC_GENERIC_EXCEPTION("dmat_array: density_id not recognized");
+      }
+    }
+
+    inline double** tdmat_array(density_id id) {
+      switch(id) {
+        case DEN_S: return tdmat_s_array_device;
+        case DEN_Z: return tdmat_z_array_device;
+        case DEN_Y: return tdmat_y_array_device;
+        case DEN_X: return tdmat_x_array_device;
+        default: GAUXC_GENERIC_EXCEPTION("tdmat_array: density_id not recognized");
+      }
+    }
 
     cutlass::gemm::GemmCoord* problem_sizes_device = nullptr;
     cutlass::gemm::GemmCoord* syr2k_sizes_device = nullptr;
diff --git a/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_data_base.cxx b/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_data_base.cxx
index 6bf35c75..3e5ee555 100644
--- a/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_data_base.cxx
+++ b/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_data_base.cxx
@@ -1,7 +1,11 @@
 /**
  * GauXC Copyright (c) 2020-2024, The Regents of the University of California,
  * through Lawrence Berkeley National Laboratory (subject to receipt of
- * any required approvals from the U.S. Dept. of Energy). All rights reserved.
+ * any required approvals from the U.S. Dept. of Energy).
+ *
+ * (c) 2024-2025, Microsoft Corporation
+ *
+ * All rights reserved.
* * See LICENSE.txt for details */ @@ -28,6 +32,8 @@ size_t AoSScheme1CUTLASSBase::Data::get_static_mem_requirement() { size_t AoSScheme1CUTLASSBase::Data::get_mem_req( integrator_term_tracker terms, const host_task_type& task ) { + auto is_uks = terms.ks_scheme == UKS; + auto is_gks = terms.ks_scheme == GKS; size_t base_size = base_type::get_mem_req(terms, task); @@ -35,10 +41,30 @@ size_t AoSScheme1CUTLASSBase::Data::get_mem_req( integrator_term_tracker terms, required_term_storage reqt(terms); if( reqt.task_nbe_scr ) { base_size += - 4*sizeof(double*) + // batch device pointers + 4*sizeof(double*) + // batch device pointers (containg trial ones) 4*sizeof(int64_t) + 2*sizeof(cutlass::gemm::GemmCoord); // Dimensions + leading dimensions // (extra handled by get_static_mem_requirement) + if(reqt.task_xmat_grad) { + base_size += 6 * sizeof(double*); + } + + if(is_uks or is_gks) { + base_size += sizeof(double*); // z dmat + } + if(is_gks) { + base_size += 2*sizeof(double*); // x/y dmat + } + + if(terms.fxc_contraction) { + base_size += sizeof(double*); // s tdmat + if(is_uks or is_gks) + base_size += sizeof(double*); // z tdmat + if(is_gks) { + base_size += 2*sizeof(double*); // x/y tdmat + } + } + } return base_size; @@ -57,23 +83,53 @@ AoSScheme1CUTLASSBase::Data::device_buffer_t required_term_storage reqt(terms); if( not reqt.task_nbe_scr ) return buf; + auto is_uks = terms.ks_scheme == UKS; + auto is_gks = terms.ks_scheme == GKS; + // Allocate additional device memory auto [ ptr, sz ] = buf; buffer_adaptor mem( ptr, sz ); const auto ntask = std::distance( task_begin, task_end ); - cutlass_stack.dmat_array_device = mem.aligned_alloc( ntask, csl ); - cutlass_stack.vmat_array_device = mem.aligned_alloc( ntask, csl ); - cutlass_stack.zmat_array_device = mem.aligned_alloc( ntask, csl ); - cutlass_stack.bf_array_device = mem.aligned_alloc( ntask, csl ); + cutlass_stack.dmat_s_array_device = mem.aligned_alloc( ntask, csl ); + cutlass_stack.vmat_array_device = mem.aligned_alloc( ntask, csl ); + cutlass_stack.zmat_array_device = mem.aligned_alloc( ntask, csl ); + cutlass_stack.bf_array_device = mem.aligned_alloc( ntask, csl ); + if(reqt.task_xmat_grad) { + cutlass_stack.bfx_array_device = mem.aligned_alloc( ntask, csl ); + cutlass_stack.bfy_array_device = mem.aligned_alloc( ntask, csl ); + cutlass_stack.bfz_array_device = mem.aligned_alloc( ntask, csl ); + cutlass_stack.xmat_x_array_device = mem.aligned_alloc( ntask, csl ); + cutlass_stack.xmat_y_array_device = mem.aligned_alloc( ntask, csl ); + cutlass_stack.xmat_z_array_device = mem.aligned_alloc( ntask, csl ); + } + + if(is_uks or is_gks) { + cutlass_stack.dmat_z_array_device = mem.aligned_alloc( ntask, csl ); + } + + if(is_gks) { + cutlass_stack.dmat_y_array_device = mem.aligned_alloc( ntask, csl ); + cutlass_stack.dmat_x_array_device = mem.aligned_alloc( ntask, csl ); + } + + if(terms.fxc_contraction) { + cutlass_stack.tdmat_s_array_device = mem.aligned_alloc( ntask, csl ); + if(is_uks or is_gks) + cutlass_stack.tdmat_z_array_device = mem.aligned_alloc( ntask, csl ); + if(is_gks){ + cutlass_stack.tdmat_y_array_device = mem.aligned_alloc( ntask, csl ); + cutlass_stack.tdmat_x_array_device = mem.aligned_alloc( ntask, csl ); + } + } cutlass_stack.ld64_dmat_array_device = mem.aligned_alloc( ntask + 1, csl ); cutlass_stack.ld64_zmat_array_device = mem.aligned_alloc( ntask + 1, csl ); cutlass_stack.ld64_vmat_array_device = mem.aligned_alloc( ntask + 1, csl ); cutlass_stack.ld64_bf_array_device = mem.aligned_alloc( ntask + 1, csl ); - 
cutlass_stack.problem_sizes_device = mem.aligned_alloc( ntask + 1, csl ); - cutlass_stack.syr2k_sizes_device = mem.aligned_alloc( ntask + 1, csl ); + cutlass_stack.problem_sizes_device = mem.aligned_alloc( ntask + 1, csl ); + cutlass_stack.syr2k_sizes_device = mem.aligned_alloc( ntask + 1, csl ); // Update dynmem data for derived impls return device_buffer_t{ mem.stack(), mem.nleft() }; @@ -88,15 +144,17 @@ void AoSScheme1CUTLASSBase::Data::pack_and_send( required_term_storage reqt(terms); if( not reqt.task_nbe_scr ) return; + auto is_uks = terms.ks_scheme == UKS; + auto is_gks = terms.ks_scheme == GKS; + const auto ntask = std::distance( task_begin, task_end ); std::vector dmat_host( ntask ), zmat_host( ntask ), bf_host( ntask ), - vmat_host( ntask ); + vmat_host( ntask ), tdmat_host( ntask ); problem_sizes_host.resize(ntask); syr2k_sizes_host.resize(ntask); std::vector ld64_dmat_host( ntask ), ld64_zmat_host( ntask ), ld64_vmat_host( ntask ), ld64_bf_host( ntask ); - double* static_dmat = static_stack.dmat_s_device; const auto nbf = global_dims.nbf; // host_device_tasks should be populated by parent impl called at top @@ -109,7 +167,7 @@ void AoSScheme1CUTLASSBase::Data::pack_and_send( dmat_host[i] = task.nbe_scr; ld64_dmat_host[i] = task.bfn_screening.nbe; } else { - dmat_host[i] = static_dmat + task.bfn_screening.ibf_begin*(nbf+1); + dmat_host[i] = static_stack.dmat_s_device + task.bfn_screening.ibf_begin*(nbf+1); ld64_dmat_host[i] = nbf; } @@ -118,12 +176,11 @@ void AoSScheme1CUTLASSBase::Data::pack_and_send( cutlass::gemm::GemmCoord problem2(task.bfn_screening.nbe, task.bfn_screening.nbe, task.npts); syr2k_sizes_host[i] = problem2; - } // Send to device device_backend_->copy_async( ntask, dmat_host.data(), - cutlass_stack.dmat_array_device, "send dmat array" ); + cutlass_stack.dmat_s_array_device, "send dmat_s array" ); device_backend_->copy_async( ntask, zmat_host.data(), cutlass_stack.zmat_array_device, "send zmat array" ); device_backend_->copy_async( ntask, vmat_host.data(), @@ -144,6 +201,109 @@ void AoSScheme1CUTLASSBase::Data::pack_and_send( device_backend_->copy_async( ntask, ld64_bf_host.data(), cutlass_stack.ld64_bf_array_device, "send ld bf array" ); + if(is_uks or is_gks) { + std::vector dmat_z_host( ntask ); + for( auto i = 0; i < ntask; ++i ) { + auto& task = host_device_tasks[i]; + if( task.bfn_screening.ncut > 1 ) { + dmat_z_host[i] = task.nbe_scr; + } else { + dmat_z_host[i] = static_stack.dmat_z_device + task.bfn_screening.ibf_begin*(nbf+1); + } + } + device_backend_->copy_async( ntask, dmat_z_host.data(), + cutlass_stack.dmat_z_array_device, "send dmat_z array" ); + } + + if(is_gks) { + std::vector dmat_y_host( ntask ); + std::vector dmat_x_host( ntask ); + for( auto i = 0; i < ntask; ++i ) { + auto& task = host_device_tasks[i]; + if( task.bfn_screening.ncut > 1 ) { + dmat_y_host[i] = task.nbe_scr; + dmat_x_host[i] = task.nbe_scr; + } else { + dmat_y_host[i] = static_stack.dmat_y_device + task.bfn_screening.ibf_begin*(nbf+1); + dmat_x_host[i] = static_stack.dmat_x_device + task.bfn_screening.ibf_begin*(nbf+1); + } + } + device_backend_->copy_async( ntask, dmat_x_host.data(), + cutlass_stack.dmat_x_array_device, "send dmat_x array" ); + device_backend_->copy_async( ntask, dmat_y_host.data(), + cutlass_stack.dmat_y_array_device, "send dmat_y array" ); + } + + if(reqt.task_xmat_grad) { + std::vector xmat_x_host( ntask ), bfx_host( ntask ); + std::vector xmat_y_host( ntask ), bfy_host( ntask ); + std::vector xmat_z_host( ntask ), bfz_host( ntask ); + for( auto i = 0; i 
< ntask; ++i ) { + auto& task = host_device_tasks[i]; + xmat_x_host[i] = task.xmat_x; + xmat_y_host[i] = task.xmat_y; + xmat_z_host[i] = task.xmat_z; + bfx_host[i] = task.dbfx; + bfy_host[i] = task.dbfy; + bfz_host[i] = task.dbfz; + } + device_backend_->copy_async( ntask, xmat_x_host.data(), + cutlass_stack.xmat_x_array_device, "send xmat_x array" ); + device_backend_->copy_async( ntask, xmat_y_host.data(), + cutlass_stack.xmat_y_array_device, "send xmat_y array" ); + device_backend_->copy_async( ntask, xmat_z_host.data(), + cutlass_stack.xmat_z_array_device, "send xmat_z array" ); + device_backend_->copy_async( ntask, bfx_host.data(), + cutlass_stack.bfx_array_device, "send bfx array" ); + device_backend_->copy_async( ntask, bfy_host.data(), + cutlass_stack.bfy_array_device, "send bfy array" ); + device_backend_->copy_async( ntask, bfz_host.data(), + cutlass_stack.bfz_array_device, "send bfz array" ); + } + + if(terms.fxc_contraction) { + std::vector tdmat_host( ntask ); + for( auto i = 0; i < ntask; ++i ) { + auto& task = host_device_tasks[i]; + if( task.bfn_screening.ncut > 1 ) + tdmat_host[i] = task.nbe_scr; + else + tdmat_host[i] = static_stack.tdmat_s_device + task.bfn_screening.ibf_begin*(nbf+1); + } + device_backend_->copy_async( ntask, tdmat_host.data(), + cutlass_stack.tdmat_s_array_device, "send tdmat_s array" ); + if(is_uks or is_gks) { + std::vector tdmat_z_host( ntask ); + for( auto i = 0; i < ntask; ++i ) { + auto& task = host_device_tasks[i]; + if( task.bfn_screening.ncut > 1 ) + tdmat_z_host[i] = task.nbe_scr; + else + tdmat_z_host[i] = static_stack.tdmat_z_device + task.bfn_screening.ibf_begin*(nbf+1); + } + device_backend_->copy_async( ntask, tdmat_z_host.data(), + cutlass_stack.tdmat_z_array_device, "send tdmat_z array" ); + } + if(is_gks) { + std::vector tdmat_y_host( ntask ); + std::vector tdmat_x_host( ntask ); + for( auto i = 0; i < ntask; ++i ) { + auto& task = host_device_tasks[i]; + if( task.bfn_screening.ncut > 1 ) { + tdmat_y_host[i] = task.nbe_scr; + tdmat_x_host[i] = task.nbe_scr; + } else { + tdmat_y_host[i] = static_stack.tdmat_y_device + task.bfn_screening.ibf_begin*(nbf+1); + tdmat_x_host[i] = static_stack.tdmat_x_device + task.bfn_screening.ibf_begin*(nbf+1); + } + } + device_backend_->copy_async( ntask, tdmat_x_host.data(), + cutlass_stack.tdmat_x_array_device, "send tdmat_x array" ); + device_backend_->copy_async( ntask, tdmat_y_host.data(), + cutlass_stack.tdmat_y_array_device, "send tdmat_y array" ); + } + } + device_backend_->master_queue_synchronize(); } diff --git a/src/xc_integrator/local_work_driver/device/cuda/xc_functional_eval_wrapper.cxx b/src/xc_integrator/local_work_driver/device/cuda/xc_functional_eval_wrapper.cxx index a77f134a..838078cf 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/xc_functional_eval_wrapper.cxx +++ b/src/xc_integrator/local_work_driver/device/cuda/xc_functional_eval_wrapper.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -37,4 +41,35 @@ void eval_kern_exc_vxc_mgga( const functional_type& func, size_t npts, } + + +void eval_kern_vxc_fxc_lda( const functional_type& func, size_t npts, + const double* rho, double* vrho, double* v2rho2, device_queue queue ) { + + cudaStream_t stream = queue.queue_as(); + func.eval_vxc_fxc_device( npts, rho, vrho, v2rho2, stream ); +} + +void eval_kern_vxc_fxc_gga( const functional_type& func, size_t npts, + const double* rho, const double* gamma, double* vrho, double* vgamma, + double* v2rho2, double* v2rhogamma, double* v2gamma2, device_queue queue ) { + + cudaStream_t stream = queue.queue_as(); + func.eval_vxc_fxc_device( npts, rho, gamma, vrho, vgamma, v2rho2, v2rhogamma, v2gamma2, stream ); +} + +void eval_kern_vxc_fxc_mgga( const functional_type& func, size_t npts, + const double* rho, const double* gamma, const double* lapl, const double* tau, + double* vrho, double* vgamma, double* vlapl, double* vtau, + double* v2rho2, double* v2rhogamma, double* v2rholapl, double* v2rhotau, + double* v2gamma2, double* v2gammalapl, double* v2gammatau, double* v2lapl2, + double* v2lapltau, double* v2tau2, device_queue queue ){ + + cudaStream_t stream = queue.queue_as(); + func.eval_vxc_fxc_device( npts, rho, gamma, lapl, tau, vrho, vgamma, vlapl, vtau, + v2rho2, v2rhogamma, v2rholapl, v2rhotau, v2gamma2, v2gammalapl, v2gammatau, + v2lapl2, v2lapltau, v2tau2, stream ); +} + + } diff --git a/src/xc_integrator/local_work_driver/device/hip/CMakeLists.txt b/src/xc_integrator/local_work_driver/device/hip/CMakeLists.txt index 3c6bddbd..a7b14ce4 100644 --- a/src/xc_integrator/local_work_driver/device/hip/CMakeLists.txt +++ b/src/xc_integrator/local_work_driver/device/hip/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/xc_integrator/local_work_driver/device/hip/hip_aos_scheme1.cxx b/src/xc_integrator/local_work_driver/device/hip/hip_aos_scheme1.cxx index 5a1d4c78..b5d6f499 100644 --- a/src/xc_integrator/local_work_driver/device/hip/hip_aos_scheme1.cxx +++ b/src/xc_integrator/local_work_driver/device/hip/hip_aos_scheme1.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/hip_aos_scheme1.hpp b/src/xc_integrator/local_work_driver/device/hip/hip_aos_scheme1.hpp index a6551d5c..00da2e16 100644 --- a/src/xc_integrator/local_work_driver/device/hip/hip_aos_scheme1.hpp +++ b/src/xc_integrator/local_work_driver/device/hip/hip_aos_scheme1.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). 
+ * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/hip_aos_scheme1_data.cxx b/src/xc_integrator/local_work_driver/device/hip/hip_aos_scheme1_data.cxx index 3fb77bcc..c80c9a61 100644 --- a/src/xc_integrator/local_work_driver/device/hip/hip_aos_scheme1_data.cxx +++ b/src/xc_integrator/local_work_driver/device/hip/hip_aos_scheme1_data.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_angular_cartesian.hpp b/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_angular_cartesian.hpp index 3bcb5b57..70008f8d 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_angular_cartesian.hpp +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_angular_cartesian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_angular_spherical_unnorm.hpp b/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_angular_spherical_unnorm.hpp index 5a5e78a5..987a13df 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_angular_spherical_unnorm.hpp +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_angular_spherical_unnorm.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_device_constants.hpp b/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_device_constants.hpp index c7405df7..ae8c43e7 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_device_constants.hpp +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_device_constants.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_spherical_unnorm.hpp b/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_spherical_unnorm.hpp index 47b8ef55..102fb8b8 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_spherical_unnorm.hpp +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_spherical_unnorm.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/collocation_device.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/collocation_device.hip index dc2cffea..4af37bbd 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/collocation_device.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/collocation_device.hip @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/collocation_masked_combined_kernels.hpp b/src/xc_integrator/local_work_driver/device/hip/kernels/collocation_masked_combined_kernels.hpp index c8020819..fa24862b 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/collocation_masked_combined_kernels.hpp +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/collocation_masked_combined_kernels.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/collocation_masked_kernels.hpp b/src/xc_integrator/local_work_driver/device/hip/kernels/collocation_masked_kernels.hpp index 527eda7b..cf14c269 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/collocation_masked_kernels.hpp +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/collocation_masked_kernels.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/grid_to_center.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/grid_to_center.hip index 953bc00f..f830596c 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/grid_to_center.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/grid_to_center.hip @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/grid_to_center.hpp b/src/xc_integrator/local_work_driver/device/hip/kernels/grid_to_center.hpp index e2a3d579..efbb9ad3 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/grid_to_center.hpp +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/grid_to_center.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_extensions.hpp b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_extensions.hpp index 7ba02d5a..2d3e537c 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_extensions.hpp +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_extensions.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_inc_potential.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_inc_potential.hip index 953bbc34..1e9044a7 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_inc_potential.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_inc_potential.hip @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssf_1d.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssf_1d.hip index 67af3a41..4c6d5874 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssf_1d.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssf_1d.hip @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssf_1d.hpp b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssf_1d.hpp index ed31ed12..66e91b8a 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssf_1d.hpp +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssf_1d.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hip index d4f6eda9..385e0160 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hip @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hpp b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hpp index 09371726..788f94d1 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hpp +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/hipblas_extensions.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/hipblas_extensions.hip index 3aa45dee..8848ed38 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/hipblas_extensions.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/hipblas_extensions.hip @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/pack_submat.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/pack_submat.hip index a2a69d24..d415139b 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/pack_submat.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/pack_submat.hip @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/symmetrize_mat.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/symmetrize_mat.hip index 946097cb..c418d0a5 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/symmetrize_mat.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/symmetrize_mat.hip @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip index 8e93d33f..0d8f2d04 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/zmat_vxc.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/zmat_vxc.hip index d188e9e6..673d5a5f 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/zmat_vxc.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/zmat_vxc.hip @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/xc_functional_eval_wrapper.cxx b/src/xc_integrator/local_work_driver/device/hip/xc_functional_eval_wrapper.cxx index da8544ce..dccc9bf9 100644 --- a/src/xc_integrator/local_work_driver/device/hip/xc_functional_eval_wrapper.cxx +++ b/src/xc_integrator/local_work_driver/device/hip/xc_functional_eval_wrapper.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/local_device_work_driver.cxx b/src/xc_integrator/local_work_driver/device/local_device_work_driver.cxx index 2a83e76c..89626b46 100644 --- a/src/xc_integrator/local_work_driver/device/local_device_work_driver.cxx +++ b/src/xc_integrator/local_work_driver/device/local_device_work_driver.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -48,16 +52,37 @@ void LocalDeviceWorkDriver::NAME( XCDeviceData* device_data, density_id den, boo pimpl_->NAME(device_data, den, b); \ } +#define FWD_TO_PIMPL_BOOL_DEN_ID(NAME) \ +void LocalDeviceWorkDriver::NAME( XCDeviceData* device_data, bool b, density_id den ) { \ + throw_if_invalid_pimpl(pimpl_); \ + pimpl_->NAME(device_data, b, den); \ +} + #define FWD_TO_PIMPL_KS_SCHEME(NAME) \ void LocalDeviceWorkDriver::NAME( XCDeviceData* device_data, integrator_ks_scheme track ) { \ throw_if_invalid_pimpl(pimpl_); \ pimpl_->NAME(device_data, track); \ } +#define FWD_TO_PIMPL_KS_SCHEME_BOOL(NAME) \ +void LocalDeviceWorkDriver::NAME( XCDeviceData* device_data, integrator_ks_scheme track, bool b ) { \ + throw_if_invalid_pimpl(pimpl_); \ + pimpl_->NAME(device_data, track, b); \ +} +#define FWD_TO_PIMPL_KS_SCHEME_BOOL_BOOL(NAME) \ +void LocalDeviceWorkDriver::NAME( XCDeviceData* device_data, integrator_ks_scheme track, bool b1, bool b2 ) { \ + throw_if_invalid_pimpl(pimpl_); \ + pimpl_->NAME(device_data, track, b1, b2); \ +} #define FWD_TO_PIMPL_KS_SCHEME_DEN_ID(NAME) \ void LocalDeviceWorkDriver::NAME( XCDeviceData* device_data, integrator_ks_scheme track, density_id den ) { \ throw_if_invalid_pimpl(pimpl_); \ pimpl_->NAME(device_data, track, den); \ } +#define FWD_TO_PIMPL_KS_SCHEME_BOOL_DEN_ID(NAME) \ +void LocalDeviceWorkDriver::NAME( XCDeviceData* device_data, integrator_ks_scheme track, bool b, density_id den ) { \ + throw_if_invalid_pimpl(pimpl_); \ + pimpl_->NAME(device_data, track, b, den); \ +} FWD_TO_PIMPL(partition_weights) // Partition weights @@ -65,31 +90,50 @@ FWD_TO_PIMPL(eval_collocation) // Collocation FWD_TO_PIMPL(eval_collocation_gradient) // Collocation Gradient FWD_TO_PIMPL(eval_collocation_hessian) // Collocation Hessian FWD_TO_PIMPL(eval_collocation_laplacian) // Collocation Laplacian +FWD_TO_PIMPL(eval_collocation_lapgrad) // Collocation Laplacian gradient FWD_TO_PIMPL_KS_SCHEME(eval_uvars_lda) // U variables LDA (rho) FWD_TO_PIMPL_KS_SCHEME(eval_uvars_gga) // U variables GGA (gamma) -FWD_TO_PIMPL_BOOL(eval_uvars_mgga) // U variables MGGA (tau, lapl) -FWD_TO_PIMPL_DEN_ID_BOOL(eval_vvar) // V variable (density + grad) +FWD_TO_PIMPL_KS_SCHEME_BOOL(eval_uvars_mgga) // U variables MGGA (tau, lapl) +FWD_TO_PIMPL_DEN_ID(eval_vvars_lda) // V variables LDA (density) +FWD_TO_PIMPL_DEN_ID(eval_vvars_gga) // V variables GGA (density + grad) +FWD_TO_PIMPL_DEN_ID_BOOL(eval_vvars_mgga) // V variables MGGA (density + grad + tau + lapl) + +FWD_TO_PIMPL_KS_SCHEME(eval_tmat_lda) // T variables LDA (trho) +FWD_TO_PIMPL_KS_SCHEME(eval_tmat_gga) // T variables GGA (tgamma) +FWD_TO_PIMPL_KS_SCHEME_BOOL(eval_tmat_mgga) // T variables MGGA (ttau, tlapl) +FWD_TO_PIMPL_DEN_ID(eval_vvars_lda_trial) // V variables LDA (trial density) +FWD_TO_PIMPL_DEN_ID(eval_vvars_gga_trial) // V variables GGA (trial density + grad) +FWD_TO_PIMPL_DEN_ID_BOOL(eval_vvars_mgga_trial) // V variables MGGA (trial density + grad + tau + lapl) FWD_TO_PIMPL_KS_SCHEME_DEN_ID(eval_zmat_lda_vxc) // Eval Z Matrix LDA VXC FWD_TO_PIMPL_KS_SCHEME_DEN_ID(eval_zmat_gga_vxc) // Eval Z Matrix GGA VXC -FWD_TO_PIMPL_BOOL(eval_zmat_mgga_vxc) // Eval Z Matrix mGGA VXC -FWD_TO_PIMPL_BOOL(eval_mmat_mgga_vxc) // Eval M Matrix mGGA VXC +FWD_TO_PIMPL_KS_SCHEME_BOOL_DEN_ID(eval_zmat_mgga_vxc) // Eval Z Matrix mGGA VXC +FWD_TO_PIMPL_KS_SCHEME_BOOL_DEN_ID(eval_mmat_mgga_vxc) // Eval M Matrix mGGA VXC + +FWD_TO_PIMPL_DEN_ID(eval_zmat_lda_fxc) // Eval Z Matrix LDA FXC +FWD_TO_PIMPL_DEN_ID(eval_zmat_gga_fxc) // 
Eval Z Matrix GGA FXC +FWD_TO_PIMPL_BOOL_DEN_ID(eval_zmat_mgga_fxc) // Eval Z Matrix mGGA FXC +FWD_TO_PIMPL_BOOL_DEN_ID(eval_mmat_mgga_fxc) // Eval M Matrix mGGA FXC + FWD_TO_PIMPL(eval_exx_fmat) // Eval EXX F Matrix -//FWD_TO_PIMPL(eval_exx_gmat) // Eval EXX G Matrix +//FWD_TO_PIMPL(eval_exx_gmat) // Eval EXX G Matrix FWD_TO_PIMPL(inc_exc) FWD_TO_PIMPL(inc_nel) FWD_TO_PIMPL_DEN_ID_BOOL(inc_vxc) // Increment VXC_I by Z +FWD_TO_PIMPL_DEN_ID_BOOL(inc_fxc) // Increment FXC_I by Z FWD_TO_PIMPL(inc_exx_k) -FWD_TO_PIMPL(inc_exc_grad_lda) -FWD_TO_PIMPL(inc_exc_grad_gga) +FWD_TO_PIMPL_KS_SCHEME_BOOL(inc_exc_grad_lda) +FWD_TO_PIMPL_KS_SCHEME_BOOL(inc_exc_grad_gga) +FWD_TO_PIMPL_KS_SCHEME_BOOL_BOOL(inc_exc_grad_mgga) FWD_TO_PIMPL_DEN_ID(symmetrize_vxc) +FWD_TO_PIMPL_DEN_ID(symmetrize_fxc) // Added FXC function FWD_TO_PIMPL(symmetrize_exx_k) FWD_TO_PIMPL(eval_exx_ek_screening_bfn_stats) @@ -100,7 +144,14 @@ void LocalDeviceWorkDriver::eval_xmat( double fac, XCDeviceData* device_data, bo throw_if_invalid_pimpl(pimpl_); pimpl_->eval_xmat(fac, device_data, do_grad, den); } - +void LocalDeviceWorkDriver::save_xmat( XCDeviceData* device_data, bool do_grad, density_id den ) { + throw_if_invalid_pimpl(pimpl_); + pimpl_->save_xmat(device_data, do_grad, den); +} +void LocalDeviceWorkDriver::eval_xmat_trial( double fac, XCDeviceData* device_data, bool do_grad, density_id den ) { + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_xmat_trial(fac, device_data, do_grad, den); +} void LocalDeviceWorkDriver::eval_exx_gmat( XCDeviceData* device_data, const BasisSetMap& basis_map) { @@ -126,6 +177,23 @@ void LocalDeviceWorkDriver::eval_kern_exc_vxc_mgga( const functional_type& func, pimpl_->eval_kern_exc_vxc_mgga(func,data); } +void LocalDeviceWorkDriver::eval_kern_vxc_fxc_lda( const functional_type& func, + XCDeviceData* data) { + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_kern_vxc_fxc_lda(func,data); +} + +void LocalDeviceWorkDriver::eval_kern_vxc_fxc_gga( const functional_type& func, + XCDeviceData* data) { + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_kern_vxc_fxc_gga(func,data); +} + +void LocalDeviceWorkDriver::eval_kern_vxc_fxc_mgga( const functional_type& func, + XCDeviceData* data) { + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_kern_vxc_fxc_mgga(func,data); +} std::unique_ptr LocalDeviceWorkDriver::create_device_data(const DeviceRuntimeEnvironment& rt) { throw_if_invalid_pimpl(pimpl_); @@ -139,4 +207,9 @@ void LocalDeviceWorkDriver::exx_ek_shellpair_collision( double eps_E, double eps pimpl_->exx_ek_shellpair_collision( eps_E, eps_K, device_data, tb, te, shpairs ); } +void LocalDeviceWorkDriver::eval_weight_1st_deriv_contracted( XCDeviceData* device_data, XCWeightAlg alg ) { + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_weight_1st_deriv_contracted(device_data, alg); +} + } diff --git a/src/xc_integrator/local_work_driver/device/local_device_work_driver.hpp b/src/xc_integrator/local_work_driver/device/local_device_work_driver.hpp index 604f0739..8c65c075 100644 --- a/src/xc_integrator/local_work_driver/device/local_device_work_driver.hpp +++ b/src/xc_integrator/local_work_driver/device/local_device_work_driver.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -56,28 +60,49 @@ class LocalDeviceWorkDriver : public LocalWorkDriver { // Public APIs void partition_weights( XCDeviceData* ); + void eval_weight_1st_deriv_contracted( XCDeviceData*, XCWeightAlg); void eval_collocation( XCDeviceData* ); void eval_collocation_gradient( XCDeviceData* ); void eval_collocation_hessian( XCDeviceData* ); void eval_collocation_laplacian( XCDeviceData* ); + void eval_collocation_lapgrad( XCDeviceData* ); void eval_xmat( double fac, XCDeviceData*, bool do_grad, density_id den ); + void eval_xmat_trial( double fac, XCDeviceData*, bool do_grad, density_id den ); + void save_xmat( XCDeviceData*, bool grad, density_id den ); - void eval_uvars_lda( XCDeviceData*, integrator_ks_scheme ) ; - void eval_uvars_gga( XCDeviceData*, integrator_ks_scheme ) ; - void eval_uvars_mgga( XCDeviceData*, bool ) ; - void eval_vvar( XCDeviceData*, density_id, bool ) ; + void eval_uvars_lda ( XCDeviceData*, integrator_ks_scheme ) ; + void eval_uvars_gga ( XCDeviceData*, integrator_ks_scheme ) ; + void eval_uvars_mgga( XCDeviceData*, integrator_ks_scheme, bool ) ; + void eval_vvars_lda ( XCDeviceData*, density_id ) ; + void eval_vvars_gga ( XCDeviceData*, density_id ) ; + void eval_vvars_mgga( XCDeviceData*, density_id, bool ) ; + + void eval_tmat_lda ( XCDeviceData*, integrator_ks_scheme ) ; + void eval_tmat_gga ( XCDeviceData*, integrator_ks_scheme ) ; + void eval_tmat_mgga( XCDeviceData*, integrator_ks_scheme, bool ) ; + void eval_vvars_lda_trial ( XCDeviceData*, density_id ) ; + void eval_vvars_gga_trial ( XCDeviceData*, density_id ) ; + void eval_vvars_mgga_trial( XCDeviceData*, density_id, bool ) ; void eval_kern_exc_vxc_lda( const functional_type&, XCDeviceData* ); void eval_kern_exc_vxc_gga( const functional_type&, XCDeviceData* ); void eval_kern_exc_vxc_mgga( const functional_type&, XCDeviceData* ); + void eval_kern_vxc_fxc_lda( const functional_type&, XCDeviceData* ); + void eval_kern_vxc_fxc_gga( const functional_type&, XCDeviceData* ); + void eval_kern_vxc_fxc_mgga( const functional_type&, XCDeviceData* ); void eval_zmat_lda_vxc( XCDeviceData*, integrator_ks_scheme, density_id ) ; void eval_zmat_gga_vxc( XCDeviceData*, integrator_ks_scheme, density_id ) ; - void eval_zmat_mgga_vxc( XCDeviceData*, bool ) ; - void eval_mmat_mgga_vxc( XCDeviceData*, bool ); + void eval_zmat_mgga_vxc( XCDeviceData*, integrator_ks_scheme, bool, density_id ) ; + void eval_mmat_mgga_vxc( XCDeviceData*, integrator_ks_scheme, bool, density_id ); + + void eval_zmat_lda_fxc( XCDeviceData*, density_id ) ; + void eval_zmat_gga_fxc( XCDeviceData*, density_id ) ; + void eval_zmat_mgga_fxc( XCDeviceData*, bool, density_id ) ; + void eval_mmat_mgga_fxc( XCDeviceData*, bool, density_id ); void eval_exx_fmat( XCDeviceData* ); void eval_exx_gmat( XCDeviceData*, const BasisSetMap& ); @@ -85,8 +110,10 @@ class LocalDeviceWorkDriver : public LocalWorkDriver { void inc_exc( XCDeviceData* ); void inc_nel( XCDeviceData* ); void inc_vxc( XCDeviceData*, density_id, bool do_m = false ); - void inc_exc_grad_lda( XCDeviceData* ); - void inc_exc_grad_gga( XCDeviceData* ); + void inc_fxc( XCDeviceData*, density_id, bool do_m = false ); + void inc_exc_grad_lda( XCDeviceData*, integrator_ks_scheme, bool ); + void inc_exc_grad_gga( XCDeviceData*, integrator_ks_scheme, bool ); + void inc_exc_grad_mgga( XCDeviceData*, integrator_ks_scheme , bool, bool ); void inc_exx_k( XCDeviceData* ); void eval_exx_ek_screening_bfn_stats( XCDeviceData* ); @@ -94,6 +121,7 @@ class LocalDeviceWorkDriver : 
public LocalWorkDriver { host_task_iterator, host_task_iterator, const ShellPairCollection& ); void symmetrize_vxc( XCDeviceData*, density_id ); + void symmetrize_fxc( XCDeviceData*, density_id ); void symmetrize_exx_k( XCDeviceData* ); std::unique_ptr create_device_data(const DeviceRuntimeEnvironment&); diff --git a/src/xc_integrator/local_work_driver/device/local_device_work_driver_pimpl.cxx b/src/xc_integrator/local_work_driver/device/local_device_work_driver_pimpl.cxx index dc5c0e04..26620277 100644 --- a/src/xc_integrator/local_work_driver/device/local_device_work_driver_pimpl.cxx +++ b/src/xc_integrator/local_work_driver/device/local_device_work_driver_pimpl.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/local_device_work_driver_pimpl.hpp b/src/xc_integrator/local_work_driver/device/local_device_work_driver_pimpl.hpp index f43dd12c..f7178a8f 100644 --- a/src/xc_integrator/local_work_driver/device/local_device_work_driver_pimpl.hpp +++ b/src/xc_integrator/local_work_driver/device/local_device_work_driver_pimpl.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -30,34 +34,58 @@ struct LocalDeviceWorkDriverPIMPL { // Public APIs virtual void partition_weights( XCDeviceData* ) = 0; + virtual void eval_weight_1st_deriv_contracted( XCDeviceData*, XCWeightAlg ) = 0; virtual void eval_collocation( XCDeviceData* ) = 0; virtual void eval_collocation_gradient( XCDeviceData* ) = 0; virtual void eval_collocation_hessian( XCDeviceData* ) = 0; virtual void eval_collocation_laplacian( XCDeviceData* ) = 0; + virtual void eval_collocation_lapgrad( XCDeviceData* ) = 0; virtual void eval_xmat( double fac, XCDeviceData*, bool do_grad, density_id den ) = 0; + virtual void save_xmat( XCDeviceData*, bool do_grad, density_id den ) = 0; virtual void eval_exx_fmat( XCDeviceData* ) = 0; //virtual void eval_exx_gmat( XCDeviceData* ) = 0; virtual void eval_exx_gmat( XCDeviceData*, const BasisSetMap& ) = 0; virtual void eval_uvars_lda( XCDeviceData*, integrator_ks_scheme ) = 0; virtual void eval_uvars_gga( XCDeviceData*, integrator_ks_scheme ) = 0; - virtual void eval_uvars_mgga( XCDeviceData*, bool ) = 0; - virtual void eval_vvar( XCDeviceData*, density_id, bool ) = 0; + virtual void eval_uvars_mgga( XCDeviceData*, integrator_ks_scheme, bool ) = 0; + virtual void eval_vvars_lda ( XCDeviceData*, density_id ) = 0; + virtual void eval_vvars_gga ( XCDeviceData*, density_id ) = 0; + virtual void eval_vvars_mgga( XCDeviceData*, density_id, bool ) = 0; virtual void eval_kern_exc_vxc_lda( const functional_type&, XCDeviceData* ) = 0; virtual void eval_kern_exc_vxc_gga( const functional_type&, XCDeviceData* ) = 0; virtual void eval_kern_exc_vxc_mgga( const functional_type&, XCDeviceData* ) = 0; + virtual void eval_kern_vxc_fxc_lda( const functional_type&, XCDeviceData* ) = 0; + virtual void eval_kern_vxc_fxc_gga( const functional_type&, XCDeviceData* ) = 0; + virtual void eval_kern_vxc_fxc_mgga( const functional_type&, XCDeviceData* ) = 0; virtual void eval_zmat_lda_vxc( XCDeviceData*, integrator_ks_scheme, density_id ) = 0; virtual void eval_zmat_gga_vxc( XCDeviceData*, integrator_ks_scheme, density_id ) = 0; - virtual void eval_zmat_mgga_vxc( XCDeviceData*, bool ) = 0; - virtual void eval_mmat_mgga_vxc( XCDeviceData*, bool ) = 0; + virtual void eval_zmat_mgga_vxc( XCDeviceData*, integrator_ks_scheme, bool, density_id ) = 0; + virtual void eval_mmat_mgga_vxc( XCDeviceData*, integrator_ks_scheme, bool, density_id ) = 0; + virtual void eval_zmat_lda_fxc( XCDeviceData*, density_id ) = 0; + virtual void eval_zmat_gga_fxc( XCDeviceData*, density_id ) = 0; + virtual void eval_zmat_mgga_fxc( XCDeviceData*, bool, density_id ) = 0; + virtual void eval_mmat_mgga_fxc( XCDeviceData*, bool, density_id ) = 0; virtual void inc_exc( XCDeviceData* ) = 0; virtual void inc_nel( XCDeviceData* ) = 0; virtual void inc_vxc( XCDeviceData* , density_id, bool) = 0; - virtual void inc_exc_grad_lda( XCDeviceData* ) = 0; - virtual void inc_exc_grad_gga( XCDeviceData* ) = 0; + virtual void inc_fxc( XCDeviceData* , density_id, bool) = 0; + virtual void inc_exc_grad_lda( XCDeviceData*, integrator_ks_scheme, bool ) = 0; + virtual void inc_exc_grad_gga( XCDeviceData*, integrator_ks_scheme, bool ) = 0; + virtual void inc_exc_grad_mgga( XCDeviceData*, integrator_ks_scheme , bool, bool ) = 0; virtual void inc_exx_k( XCDeviceData* ) = 0; virtual void symmetrize_vxc( XCDeviceData*, density_id ) = 0; + virtual void symmetrize_fxc( XCDeviceData*, density_id ) = 0; virtual void symmetrize_exx_k( XCDeviceData* ) = 0; + //second derivative + virtual void eval_xmat_trial( double fac, 
XCDeviceData*, bool do_grad, density_id den ) = 0; + virtual void eval_tmat_lda( XCDeviceData*, integrator_ks_scheme ) = 0; + virtual void eval_tmat_gga( XCDeviceData*, integrator_ks_scheme ) = 0; + virtual void eval_tmat_mgga( XCDeviceData*, integrator_ks_scheme, bool ) = 0; + virtual void eval_vvars_lda_trial ( XCDeviceData*, density_id ) = 0; + virtual void eval_vvars_gga_trial ( XCDeviceData*, density_id ) = 0; + virtual void eval_vvars_mgga_trial( XCDeviceData*, density_id, bool ) = 0; + virtual void eval_exx_ek_screening_bfn_stats( XCDeviceData* ) = 0; virtual void exx_ek_shellpair_collision( double eps_E, double eps_K, XCDeviceData*, host_task_iterator, host_task_iterator, diff --git a/src/xc_integrator/local_work_driver/device/scheme1_base.cxx b/src/xc_integrator/local_work_driver/device/scheme1_base.cxx index 5ffa8443..d2801307 100644 --- a/src/xc_integrator/local_work_driver/device/scheme1_base.cxx +++ b/src/xc_integrator/local_work_driver/device/scheme1_base.cxx @@ -1,12 +1,17 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ #include "scheme1_base.hpp" #include "device/common/zmat_vxc.hpp" +#include "device/common/zmat_fxc.hpp" #include "device/common/collocation_device.hpp" #include "device/common/device_blas.hpp" #include "device/common/xc_functional_eval_wrapper.hpp" @@ -46,6 +51,7 @@ namespace XGPU { cudaStream_t stream); void integral_1_task_batched( + bool sph, size_t ntasks, size_t nsubtask, int max_primpairs, size_t max_nsp, GauXC::XCDeviceTask* device_tasks, @@ -60,6 +66,7 @@ namespace XGPU { cudaStream_t stream); void integral_2_task_batched( + bool sph, size_t ntasks, size_t nsubtask, int max_primpairs, size_t max_nsp, GauXC::XCDeviceTask* device_tasks, @@ -98,6 +105,7 @@ namespace XGPU { cudaStream_t stream); void integral_1_1_task_batched( + bool sph, size_t ntasks, size_t nsubtasks, int max_primpairs, size_t max_nsp, @@ -121,6 +129,7 @@ namespace XGPU { cudaStream_t stream); void integral_2_2_task_batched( + bool sph, size_t ntasks, size_t nsubtasks, int max_primpairs, size_t max_nsp, @@ -145,6 +154,7 @@ namespace XGPU { void integral_1_0_task_batched( bool swap, + bool sph, size_t ntasks, size_t nsubtasks, int max_primpairs, size_t max_nsp, @@ -170,6 +180,7 @@ namespace XGPU { void integral_2_0_task_batched( bool swap, + bool sph, size_t ntasks, size_t nsubtasks, int max_primpairs, size_t max_nsp, @@ -195,6 +206,7 @@ namespace XGPU { void integral_2_1_task_batched( bool swap, + bool sph_2, bool sph_1, size_t ntasks, size_t nsubtasks, int max_primpairs, size_t max_nsp, @@ -279,7 +291,7 @@ void AoSScheme1Base::eval_zmat_gga_vxc( XCDeviceData* _data, integrator_ks_schem data->device_backend_->check_error("zmat_gga" __FILE__ ": " + std::to_string(__LINE__)); } -void AoSScheme1Base::eval_zmat_mgga_vxc( XCDeviceData* _data, bool do_lapl){ +void AoSScheme1Base::eval_zmat_mgga_vxc( XCDeviceData* _data, integrator_ks_scheme scheme, bool do_lapl, density_id id){ auto* data = dynamic_cast(_data); if( !data ) GAUXC_BAD_LWD_DATA_CAST(); @@ -296,14 +308,80 @@ void AoSScheme1Base::eval_zmat_mgga_vxc( XCDeviceData* _data, bool do_lapl){ auto aos_stack = data->aos_stack; zmat_mgga_vxc( ntasks, nbe_max, npts_max, 
aos_stack.device_tasks, - do_lapl, data->device_backend_->queue() ); + do_lapl, scheme, id, data->device_backend_->queue() ); data->device_backend_->check_error("zmat_mgga" __FILE__ ": " + std::to_string(__LINE__)); } +void AoSScheme1Base::eval_zmat_lda_fxc( XCDeviceData* _data, density_id den ) { + + auto* data = dynamic_cast(_data); + if( !data ) GAUXC_BAD_LWD_DATA_CAST(); + + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); + + auto& tasks = data->host_device_tasks; + const auto ntasks = tasks.size(); + size_t nbe_max = 0, npts_max = 0; + for( auto& task : tasks ) { + nbe_max = std::max( nbe_max, task.bfn_screening.nbe ); + npts_max = std::max( npts_max, task.npts ); + } + + auto aos_stack = data->aos_stack; + zmat_lda_fxc( ntasks, nbe_max, npts_max, aos_stack.device_tasks, den, + data->device_backend_->queue() ); + + data->device_backend_->check_error("zmat_lda_fxc" __FILE__ ": " + std::to_string(__LINE__)); +} + +void AoSScheme1Base::eval_zmat_gga_fxc( XCDeviceData* _data, density_id den ) { + + auto* data = dynamic_cast(_data); + if( !data ) GAUXC_BAD_LWD_DATA_CAST(); + + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); + + auto& tasks = data->host_device_tasks; + const auto ntasks = tasks.size(); + size_t nbe_max = 0, npts_max = 0; + for( auto& task : tasks ) { + nbe_max = std::max( nbe_max, task.bfn_screening.nbe ); + npts_max = std::max( npts_max, task.npts ); + } + + auto aos_stack = data->aos_stack; + zmat_gga_fxc( ntasks, nbe_max, npts_max, aos_stack.device_tasks, den, + data->device_backend_->queue() ); + + data->device_backend_->check_error("zmat_gga_fxc" __FILE__ ": " + std::to_string(__LINE__)); +} + +void AoSScheme1Base::eval_zmat_mgga_fxc( XCDeviceData* _data, bool do_lapl, density_id id){ + + auto* data = dynamic_cast(_data); + if( !data ) GAUXC_BAD_LWD_DATA_CAST(); + + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); + + auto& tasks = data->host_device_tasks; + const auto ntasks = tasks.size(); + size_t nbe_max = 0, npts_max = 0; + for( auto& task : tasks ) { + nbe_max = std::max( nbe_max, task.bfn_screening.nbe ); + npts_max = std::max( npts_max, task.npts ); + } + + auto aos_stack = data->aos_stack; + zmat_mgga_fxc( ntasks, nbe_max, npts_max, aos_stack.device_tasks, + do_lapl, id, data->device_backend_->queue() ); + -void AoSScheme1Base::eval_mmat_mgga_vxc( XCDeviceData* _data, bool do_lapl){ + data->device_backend_->check_error("zmat_mgga_fxc" __FILE__ ": " + std::to_string(__LINE__)); +} + +void AoSScheme1Base::eval_mmat_mgga_vxc( XCDeviceData* _data, integrator_ks_scheme scheme, bool do_lapl, density_id id){ auto* data = dynamic_cast(_data); if( !data ) GAUXC_BAD_LWD_DATA_CAST(); @@ -320,12 +398,35 @@ void AoSScheme1Base::eval_mmat_mgga_vxc( XCDeviceData* _data, bool do_lapl){ auto aos_stack = data->aos_stack; mmat_mgga_vxc( ntasks, nbe_max, npts_max, aos_stack.device_tasks, - do_lapl, data->device_backend_->queue() ); + do_lapl, scheme, id, data->device_backend_->queue() ); data->device_backend_->check_error("mmat_mgga" __FILE__ ": " + std::to_string(__LINE__)); } +void AoSScheme1Base::eval_mmat_mgga_fxc( XCDeviceData* _data, bool do_lapl, density_id id){ + + auto* data = dynamic_cast(_data); + if( !data ) GAUXC_BAD_LWD_DATA_CAST(); + + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); + + auto& tasks = data->host_device_tasks; + const auto ntasks = tasks.size(); + size_t nbe_max = 0, npts_max = 0; + for( auto& task : tasks ) { + nbe_max = std::max( nbe_max, task.bfn_screening.nbe 
); + npts_max = std::max( npts_max, task.npts ); + } + + auto aos_stack = data->aos_stack; + mmat_mgga_fxc( ntasks, nbe_max, npts_max, aos_stack.device_tasks, + do_lapl, id, data->device_backend_->queue() ); + + + data->device_backend_->check_error("mmat_mgga_fxc" __FILE__ ": " + std::to_string(__LINE__)); +} + void AoSScheme1Base::eval_collocation( XCDeviceData* _data ) { auto* data = dynamic_cast(_data); @@ -387,7 +488,7 @@ void AoSScheme1Base::eval_collocation_gradient( XCDeviceData* _data ) { data->device_backend_->queue() ); #endif - data->device_backend_->check_error("collocation grad" __FILE__ ": " + std::to_string(__LINE__)); + data->device_backend_->check_error("collocation grad " __FILE__ ": " + std::to_string(__LINE__)); } void AoSScheme1Base::eval_collocation_hessian( XCDeviceData* _data ) { @@ -430,6 +531,26 @@ void AoSScheme1Base::eval_collocation_laplacian( XCDeviceData* _data ) { data->device_backend_->check_error("collocation lapl" __FILE__ ": " + std::to_string(__LINE__)); } +void AoSScheme1Base::eval_collocation_lapgrad( XCDeviceData* _data ) { +#ifdef GAUXC_HAS_HIP + GAUXC_GENERIC_EXCEPTION("Laplacian Gradient NYI for HIP Backends"); +#else + auto* data = dynamic_cast(_data); + if( !data ) GAUXC_BAD_LWD_DATA_CAST(); + + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); + + auto aos_stack = data->aos_stack; + + auto max_l = data->l_batched_shell_to_task.size() - 1; + eval_collocation_shell_to_task_lapgrad( max_l, + data->l_batched_shell_to_task.data(), aos_stack.device_tasks, + data->device_backend_->queue() ); +#endif + + data->device_backend_->check_error("collocation lap grad " __FILE__ ": " + std::to_string(__LINE__)); +} + @@ -537,7 +658,7 @@ void AoSScheme1Base::eval_uvars_gga( XCDeviceData* _data, integrator_ks_scheme k data->device_backend_->check_error("uvvar gga" __FILE__ ": " + std::to_string(__LINE__)); } -void AoSScheme1Base::eval_uvars_mgga( XCDeviceData* _data, bool do_lapl ){ +void AoSScheme1Base::eval_uvars_mgga( XCDeviceData* _data, integrator_ks_scheme scheme, bool do_lapl ){ auto* data = dynamic_cast(_data); if( !data ) GAUXC_BAD_LWD_DATA_CAST(); @@ -548,29 +669,21 @@ void AoSScheme1Base::eval_uvars_mgga( XCDeviceData* _data, bool do_lapl ){ const auto ntasks = tasks.size(); size_t nbe_max = 0, npts_max = 0; for( auto& task : tasks ) { - nbe_max = std::max( nbe_max, task.bfn_screening.nbe ); npts_max = std::max( npts_max, task.npts ); } - // Zero tau auto base_stack = data->base_stack; - data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, base_stack.tau_eval_device, "Tau Zero" ); - if(do_lapl) { - data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, base_stack.den_lapl_eval_device, "DenLapl Zero" ); - } - - - // Evaluate U variables + + // Evaluate U variable auto aos_stack = data->aos_stack; - GauXC::eval_uvars_mgga( ntasks, data->total_npts_task_batch, nbe_max, npts_max, do_lapl, + GauXC::eval_uvars_mgga( ntasks, npts_max, scheme, do_lapl, aos_stack.device_tasks, data->device_backend_->queue() ); data->device_backend_->check_error("uvvar mgga" __FILE__ ": " + std::to_string(__LINE__)); } - -void AoSScheme1Base::eval_vvar( XCDeviceData* _data, density_id den_select, bool do_grad){ +void AoSScheme1Base::eval_vvars_lda( XCDeviceData* _data, density_id den_select){ auto* data = dynamic_cast(_data); if ( !data ) GAUXC_BAD_LWD_DATA_CAST(); @@ -587,254 +700,908 @@ void AoSScheme1Base::eval_vvar( XCDeviceData* _data, density_id den_select, bool // Zero density auto base_stack = 
data->base_stack; double* den_eval_ptr = nullptr; - double* den_x_eval_ptr = nullptr; - double* den_y_eval_ptr = nullptr; - double* den_z_eval_ptr = nullptr; switch ( den_select ) { case DEN_S: den_eval_ptr = base_stack.den_s_eval_device; - if (do_grad) { den_x_eval_ptr = base_stack.dden_sx_eval_device; - den_y_eval_ptr = base_stack.dden_sy_eval_device; - den_z_eval_ptr = base_stack.dden_sz_eval_device; } break; case DEN_Z: den_eval_ptr = base_stack.den_z_eval_device; - if (do_grad) { den_x_eval_ptr = base_stack.dden_zx_eval_device; - den_y_eval_ptr = base_stack.dden_zy_eval_device; - den_z_eval_ptr = base_stack.dden_zz_eval_device; } break; case DEN_Y: den_eval_ptr = base_stack.den_y_eval_device; - if (do_grad) { den_x_eval_ptr = base_stack.dden_yx_eval_device; - den_y_eval_ptr = base_stack.dden_yy_eval_device; - den_z_eval_ptr = base_stack.dden_yz_eval_device; } break; case DEN_X: den_eval_ptr = base_stack.den_x_eval_device; - if (do_grad) { den_x_eval_ptr = base_stack.dden_xx_eval_device; - den_y_eval_ptr = base_stack.dden_xy_eval_device; - den_z_eval_ptr = base_stack.dden_xz_eval_device; } break; default: - GAUXC_GENERIC_EXCEPTION( "eval_vvar called with invalid density selected!" ); + GAUXC_GENERIC_EXCEPTION( "eval_vvars_lda called with invalid density selected!" ); } data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_eval_ptr, "Den Zero" ); - if (do_grad) { - data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_x_eval_ptr, "Den Zero" ); - data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_y_eval_ptr, "Den Zero" ); - data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_z_eval_ptr, "Den Zero" ); - } - // Evaluate V variable auto aos_stack = data->aos_stack; - GauXC::eval_vvar( ntasks, nbe_max, npts_max, do_grad, den_select, + GauXC::eval_vvars_lda( ntasks, nbe_max, npts_max, den_select, aos_stack.device_tasks, data->device_backend_->queue() ); } - -void AoSScheme1Base::eval_kern_exc_vxc_lda( const functional_type& func, - XCDeviceData* _data ) { - +void AoSScheme1Base::eval_vvars_gga( XCDeviceData* _data, density_id den_select){ auto* data = dynamic_cast(_data); - if( !data ) GAUXC_BAD_LWD_DATA_CAST(); + if ( !data ) GAUXC_BAD_LWD_DATA_CAST(); if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); - if( !func.is_lda() ) GAUXC_GENERIC_EXCEPTION("XC Kernel not LDA!"); + auto& tasks = data->host_device_tasks; + const auto ntasks = tasks.size(); + size_t nbe_max = 0, npts_max = 0; + for( auto& task : tasks ) { + nbe_max = std::max( nbe_max, task.bfn_screening.nbe ); + npts_max = std::max( npts_max, task.npts ); + } + // Zero density auto base_stack = data->base_stack; + double* den_eval_ptr = nullptr; + double* den_x_eval_ptr = nullptr; + double* den_y_eval_ptr = nullptr; + double* den_z_eval_ptr = nullptr; + switch ( den_select ) { + case DEN_S: + den_eval_ptr = base_stack.den_s_eval_device; + den_x_eval_ptr = base_stack.dden_sx_eval_device; + den_y_eval_ptr = base_stack.dden_sy_eval_device; + den_z_eval_ptr = base_stack.dden_sz_eval_device; + break; + case DEN_Z: + den_eval_ptr = base_stack.den_z_eval_device; + den_x_eval_ptr = base_stack.dden_zx_eval_device; + den_y_eval_ptr = base_stack.dden_zy_eval_device; + den_z_eval_ptr = base_stack.dden_zz_eval_device; + break; + case DEN_Y: + den_eval_ptr = base_stack.den_y_eval_device; + den_x_eval_ptr = base_stack.dden_yx_eval_device; + den_y_eval_ptr = base_stack.dden_yy_eval_device; + 
den_z_eval_ptr = base_stack.dden_yz_eval_device; + break; + case DEN_X: + den_eval_ptr = base_stack.den_x_eval_device; + den_x_eval_ptr = base_stack.dden_xx_eval_device; + den_y_eval_ptr = base_stack.dden_xy_eval_device; + den_z_eval_ptr = base_stack.dden_xz_eval_device; + break; + default: + GAUXC_GENERIC_EXCEPTION( "eval_vvars_gga called with invalid density selected!" ); + } - const bool is_RKS = data->allocated_terms.ks_scheme == RKS; - const bool is_UKS = data->allocated_terms.ks_scheme == UKS; - const bool is_GKS = data->allocated_terms.ks_scheme == GKS; - const bool is_pol = is_UKS or is_GKS; - const bool is_excgrad = data->allocated_terms.exc_grad; - - const size_t npts = data->total_npts_task_batch ; + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_eval_ptr, "Den Zero" ); + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_x_eval_ptr, "Den Zero" ); + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_y_eval_ptr, "Den Zero" ); + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_z_eval_ptr, "Den Zero" ); - auto* dep = base_stack.den_s_eval_device; + // Evaluate V variable + auto aos_stack = data->aos_stack; + GauXC::eval_vvars_gga( ntasks, nbe_max, npts_max, den_select, + aos_stack.device_tasks, data->device_backend_->queue() ); - if ( is_pol ) { - dep = base_stack.den_eval_device; - // Interleave pos/neg densities before passing it to ExchCXX - data->device_backend_-> - copy_async_2d( 1, npts, base_stack.den_s_eval_device, 1, base_stack.den_eval_device , 2, "den_s -> den_eval" ); - data->device_backend_-> - copy_async_2d( 1, npts, base_stack.den_z_eval_device, 1, base_stack.den_eval_device+1, 2, "den_z -> den_eval" ); - } +} - GauXC::eval_kern_exc_vxc_lda( func, npts, - dep, base_stack.eps_eval_device, - base_stack.vrho_eval_device, data->device_backend_->queue() ); +void AoSScheme1Base::eval_vvars_mgga( XCDeviceData* _data, density_id den_select, bool need_lapl){ + auto* data = dynamic_cast(_data); + if ( !data ) GAUXC_BAD_LWD_DATA_CAST(); - hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, - base_stack.weights_device, 1, base_stack.eps_eval_device, 1 ); + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); - if( not is_pol ) { - hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, - base_stack.weights_device, 1, base_stack.vrho_eval_device, 1 ); + auto& tasks = data->host_device_tasks; + const auto ntasks = tasks.size(); + size_t nbe_max = 0, npts_max = 0; + for( auto& task : tasks ) { + nbe_max = std::max( nbe_max, task.bfn_screening.nbe ); + npts_max = std::max( npts_max, task.npts ); } - else if( is_pol ) { - // De-interleave pos/neg densities - data->device_backend_-> - copy_async_2d( 1, npts, base_stack.vrho_eval_device , 2, base_stack.vrho_pos_eval_device, 1, "vrho->vrho_pos" ); - data->device_backend_-> - copy_async_2d( 1, npts, base_stack.vrho_eval_device+1, 2, base_stack.vrho_neg_eval_device, 1, "vrho->vrho_pos" ); - - // Weight results point-by-point - hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, - base_stack.weights_device, 1, base_stack.vrho_pos_eval_device, 1 ); - hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, - base_stack.weights_device, 1, base_stack.vrho_neg_eval_device, 1 ); + + // Zero density + auto base_stack = 
data->base_stack; + double* den_eval_ptr = nullptr; + double* den_x_eval_ptr = nullptr; + double* den_y_eval_ptr = nullptr; + double* den_z_eval_ptr = nullptr; + double* tau_eval_ptr = nullptr; + double* lapl_eval_ptr = nullptr; + switch ( den_select ) { + case DEN_S: + den_eval_ptr = base_stack.den_s_eval_device; + den_x_eval_ptr = base_stack.dden_sx_eval_device; + den_y_eval_ptr = base_stack.dden_sy_eval_device; + den_z_eval_ptr = base_stack.dden_sz_eval_device; + tau_eval_ptr = base_stack.tau_s_eval_device; + lapl_eval_ptr = base_stack.lapl_s_eval_device; + break; + case DEN_Z: + den_eval_ptr = base_stack.den_z_eval_device; + den_x_eval_ptr = base_stack.dden_zx_eval_device; + den_y_eval_ptr = base_stack.dden_zy_eval_device; + den_z_eval_ptr = base_stack.dden_zz_eval_device; + tau_eval_ptr = base_stack.tau_z_eval_device; + lapl_eval_ptr = base_stack.lapl_z_eval_device; + break; + case DEN_Y: + den_eval_ptr = base_stack.den_y_eval_device; + den_x_eval_ptr = base_stack.dden_yx_eval_device; + den_y_eval_ptr = base_stack.dden_yy_eval_device; + den_z_eval_ptr = base_stack.dden_yz_eval_device; + break; + case DEN_X: + den_eval_ptr = base_stack.den_x_eval_device; + den_x_eval_ptr = base_stack.dden_xx_eval_device; + den_y_eval_ptr = base_stack.dden_xy_eval_device; + den_z_eval_ptr = base_stack.dden_xz_eval_device; + break; + default: + GAUXC_GENERIC_EXCEPTION( "eval_vvars_gga called with invalid density selected!" ); } + + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_eval_ptr, "Den Zero" ); + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_x_eval_ptr, "Den Zero" ); + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_y_eval_ptr, "Den Zero" ); + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_z_eval_ptr, "Den Zero" ); + if(tau_eval_ptr) + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, tau_eval_ptr, "TAU Zero"); + if(lapl_eval_ptr) + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, lapl_eval_ptr, "LAPL Zero"); - data->device_backend_->check_error("exc_vxc lda" __FILE__ ": " + std::to_string(__LINE__)); -} + // Evaluate V variable + auto aos_stack = data->aos_stack; + GauXC::eval_vvars_mgga( ntasks, nbe_max, npts_max, den_select, need_lapl, + aos_stack.device_tasks, data->device_backend_->queue() ); +} -void AoSScheme1Base::eval_kern_exc_vxc_gga( const functional_type& func, - XCDeviceData* _data ) { +void AoSScheme1Base::eval_tmat_lda( XCDeviceData* _data, integrator_ks_scheme ks_scheme){ auto* data = dynamic_cast(_data); - if( !data ) GAUXC_BAD_LWD_DATA_CAST(); + if ( !data ) GAUXC_BAD_LWD_DATA_CAST(); if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); - if( !func.is_gga() ) GAUXC_GENERIC_EXCEPTION("XC Kernel not GGA!"); + auto& tasks = data->host_device_tasks; + const auto ntasks = tasks.size(); + size_t nbe_max = 0, npts_max = 0; + for( auto& task : tasks ) { + npts_max = std::max( npts_max, task.npts ); + } auto base_stack = data->base_stack; - double* den_eval_ptr = base_stack.den_s_eval_device; - const bool is_RKS = data->allocated_terms.ks_scheme == RKS; - const bool is_UKS = data->allocated_terms.ks_scheme == UKS; - const bool is_GKS = data->allocated_terms.ks_scheme == GKS; - const bool is_pol = is_UKS or is_GKS; - const bool is_excgrad = data->allocated_terms.exc_grad; + // Evaluate U variables + auto aos_stack = data->aos_stack; + GauXC::eval_tmat_lda( 
ntasks, npts_max, ks_scheme, + aos_stack.device_tasks, data->device_backend_->queue() ); - const size_t npts = data->total_npts_task_batch ; - + data->device_backend_->check_error("uvvar lda trial" __FILE__ ": " + std::to_string(__LINE__)); +} - if ( is_pol ) { - den_eval_ptr = base_stack.den_eval_device; - // Interleave pos/neg densities before passing it to ExchCXX - data->device_backend_-> - copy_async_2d( 1, npts, base_stack.den_s_eval_device, 1, base_stack.den_eval_device , 2, "den_s -> den_eval" ); - data->device_backend_-> - copy_async_2d( 1, npts, base_stack.den_z_eval_device, 1, base_stack.den_eval_device+1, 2, "den_z -> den_eval" ); - // Interleave gamma pp, pm, mm - data->device_backend_-> - copy_async_2d( 1, npts, base_stack.gamma_pp_eval_device, 1, base_stack.gamma_eval_device , 3, "gamma_pp -> gamma_eval"); - data->device_backend_-> - copy_async_2d( 1, npts, base_stack.gamma_pm_eval_device, 1, base_stack.gamma_eval_device+1, 3, "gamma_pm -> gamma_eval"); - data->device_backend_-> - copy_async_2d( 1, npts, base_stack.gamma_mm_eval_device, 1, base_stack.gamma_eval_device+2, 3, "gamma_mm -> gamma_eval"); - } - - GauXC::eval_kern_exc_vxc_gga( func, data->total_npts_task_batch, - den_eval_ptr, base_stack.gamma_eval_device, - base_stack.eps_eval_device, base_stack.vrho_eval_device, - base_stack.vgamma_eval_device, data->device_backend_->queue() ); - +void AoSScheme1Base::eval_tmat_gga( XCDeviceData* _data, integrator_ks_scheme ks_scheme){ + auto* data = dynamic_cast(_data); + if ( !data ) GAUXC_BAD_LWD_DATA_CAST(); - hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, - base_stack.weights_device, 1, base_stack.eps_eval_device, 1 ); + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); - if( not is_pol ) { - hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, - base_stack.weights_device, 1, base_stack.vrho_eval_device, 1 ); - hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, - base_stack.weights_device, 1, base_stack.vgamma_eval_device, 1 ); - } - else if( is_pol ) { - // De-interleave pos/neg densities - data->device_backend_-> - copy_async_2d( 1, npts, base_stack.vrho_eval_device , 2, base_stack.vrho_pos_eval_device, 1, "vrho->vrho_pos" ); - data->device_backend_-> - copy_async_2d( 1, npts, base_stack.vrho_eval_device+1, 2, base_stack.vrho_neg_eval_device, 1, "vrho->vrho_pos" ); - - // Multiply by weights point-by-point - hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, - base_stack.weights_device, 1, base_stack.vrho_pos_eval_device, 1 ); - hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, - base_stack.weights_device, 1, base_stack.vrho_neg_eval_device, 1 ); - - // De-interleave vgamma - data->device_backend_-> - copy_async_2d( 1, npts, base_stack.vgamma_eval_device , 3, base_stack.vgamma_pp_eval_device, 1, "vgamma_eval -> vgamma_pp" ); - data->device_backend_-> - copy_async_2d( 1, npts, base_stack.vgamma_eval_device+1, 3, base_stack.vgamma_pm_eval_device, 1, "vgamma_eval -> vgamma_pm" ); - data->device_backend_-> - copy_async_2d( 1, npts, base_stack.vgamma_eval_device+2, 3, base_stack.vgamma_mm_eval_device, 1, "vgamma_eval -> vgamma_mm" ); - - hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, - base_stack.weights_device, 1, base_stack.vgamma_pp_eval_device, 1 ); - hadamard_product( 
data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, - base_stack.weights_device, 1, base_stack.vgamma_pm_eval_device, 1 ); - hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, - base_stack.weights_device, 1, base_stack.vgamma_mm_eval_device, 1 ); - - + auto& tasks = data->host_device_tasks; + const auto ntasks = tasks.size(); + size_t nbe_max = 0, npts_max = 0; + for( auto& task : tasks ) { + npts_max = std::max( npts_max, task.npts ); } + auto base_stack = data->base_stack; + + // Evaluate U variable + auto aos_stack = data->aos_stack; + GauXC::eval_tmat_gga( ntasks, npts_max, ks_scheme, + aos_stack.device_tasks, data->device_backend_->queue() ); - data->device_backend_->check_error("exc_vxc gga" __FILE__ ": " + std::to_string(__LINE__)); + data->device_backend_->check_error("uvvar gga trial" __FILE__ ": " + std::to_string(__LINE__)); } - -void AoSScheme1Base::eval_kern_exc_vxc_mgga( const functional_type& func, - XCDeviceData* _data ) { +void AoSScheme1Base::eval_tmat_mgga( XCDeviceData* _data, integrator_ks_scheme scheme, bool do_lapl ){ auto* data = dynamic_cast(_data); if( !data ) GAUXC_BAD_LWD_DATA_CAST(); if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); - if( !func.is_mgga() ) GAUXC_GENERIC_EXCEPTION("XC Kernel not GGA!"); + auto& tasks = data->host_device_tasks; + const auto ntasks = tasks.size(); + size_t nbe_max = 0, npts_max = 0; + for( auto& task : tasks ) { + npts_max = std::max( npts_max, task.npts ); + } auto base_stack = data->base_stack; + + // Evaluate U variable + auto aos_stack = data->aos_stack; + GauXC::eval_tmat_mgga( ntasks, npts_max, scheme, do_lapl, + aos_stack.device_tasks, data->device_backend_->queue() ); - if(func.needs_laplacian()) { - data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, base_stack.vlapl_eval_device, "VLapl Zero" ); - } + + data->device_backend_->check_error("uvvar mgga trial" __FILE__ ": " + std::to_string(__LINE__)); +} + +void AoSScheme1Base::eval_vvars_lda_trial( XCDeviceData* _data, density_id den_select){ + auto* data = dynamic_cast(_data); + if ( !data ) GAUXC_BAD_LWD_DATA_CAST(); + + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); + + auto& tasks = data->host_device_tasks; + const auto ntasks = tasks.size(); + size_t nbe_max = 0, npts_max = 0; + for( auto& task : tasks ) { + nbe_max = std::max( nbe_max, task.bfn_screening.nbe ); + npts_max = std::max( npts_max, task.npts ); + } + + // Zero density + auto base_stack = data->base_stack; + double* den_eval_ptr = nullptr; + switch ( den_select ) { + case DEN_S: + den_eval_ptr = base_stack.tden_s_eval_device; + break; + case DEN_Z: + den_eval_ptr = base_stack.tden_z_eval_device; + break; + case DEN_Y: + den_eval_ptr = base_stack.tden_y_eval_device; + break; + case DEN_X: + den_eval_ptr = base_stack.tden_x_eval_device; + break; + default: + GAUXC_GENERIC_EXCEPTION( "eval_vvars_lda_trial called with invalid density selected!" 
); + } + + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_eval_ptr, "Den Zero" ); + + // Evaluate V variable + auto aos_stack = data->aos_stack; + GauXC::eval_vvars_lda_trial( ntasks, nbe_max, npts_max, den_select, + aos_stack.device_tasks, data->device_backend_->queue() ); + +} + +void AoSScheme1Base::eval_vvars_gga_trial( XCDeviceData* _data, density_id den_select){ + auto* data = dynamic_cast(_data); + if ( !data ) GAUXC_BAD_LWD_DATA_CAST(); + + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); + + auto& tasks = data->host_device_tasks; + const auto ntasks = tasks.size(); + size_t nbe_max = 0, npts_max = 0; + for( auto& task : tasks ) { + nbe_max = std::max( nbe_max, task.bfn_screening.nbe ); + npts_max = std::max( npts_max, task.npts ); + } + + // Zero density + auto base_stack = data->base_stack; + double* den_eval_ptr = nullptr; + double* den_x_eval_ptr = nullptr; + double* den_y_eval_ptr = nullptr; + double* den_z_eval_ptr = nullptr; + switch ( den_select ) { + case DEN_S: + den_eval_ptr = base_stack.tden_s_eval_device; + den_x_eval_ptr = base_stack.tdden_sx_eval_device; + den_y_eval_ptr = base_stack.tdden_sy_eval_device; + den_z_eval_ptr = base_stack.tdden_sz_eval_device; + break; + case DEN_Z: + den_eval_ptr = base_stack.tden_z_eval_device; + den_x_eval_ptr = base_stack.tdden_zx_eval_device; + den_y_eval_ptr = base_stack.tdden_zy_eval_device; + den_z_eval_ptr = base_stack.tdden_zz_eval_device; + break; + case DEN_Y: + den_eval_ptr = base_stack.tden_y_eval_device; + den_x_eval_ptr = base_stack.tdden_yx_eval_device; + den_y_eval_ptr = base_stack.tdden_yy_eval_device; + den_z_eval_ptr = base_stack.tdden_yz_eval_device; + break; + case DEN_X: + den_eval_ptr = base_stack.tden_x_eval_device; + den_x_eval_ptr = base_stack.tdden_xx_eval_device; + den_y_eval_ptr = base_stack.tdden_xy_eval_device; + den_z_eval_ptr = base_stack.tdden_xz_eval_device; + break; + default: + GAUXC_GENERIC_EXCEPTION( "eval_vvars_gga_trial called with invalid density selected!" 
); + } + + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_eval_ptr, "Den Zero" ); + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_x_eval_ptr, "Den Zero" ); + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_y_eval_ptr, "Den Zero" ); + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_z_eval_ptr, "Den Zero" ); + + // Evaluate V variable + auto aos_stack = data->aos_stack; + GauXC::eval_vvars_gga_trial( ntasks, nbe_max, npts_max, den_select, + aos_stack.device_tasks, data->device_backend_->queue() ); + +} + +void AoSScheme1Base::eval_vvars_mgga_trial( XCDeviceData* _data, density_id den_select, bool need_lapl){ + auto* data = dynamic_cast(_data); + if ( !data ) GAUXC_BAD_LWD_DATA_CAST(); + + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); + + auto& tasks = data->host_device_tasks; + const auto ntasks = tasks.size(); + size_t nbe_max = 0, npts_max = 0; + for( auto& task : tasks ) { + nbe_max = std::max( nbe_max, task.bfn_screening.nbe ); + npts_max = std::max( npts_max, task.npts ); + } + + // Zero density + auto base_stack = data->base_stack; + double* den_eval_ptr = nullptr; + double* den_x_eval_ptr = nullptr; + double* den_y_eval_ptr = nullptr; + double* den_z_eval_ptr = nullptr; + double* tau_eval_ptr = nullptr; + double* lapl_eval_ptr = nullptr; + switch ( den_select ) { + case DEN_S: + den_eval_ptr = base_stack.tden_s_eval_device; + den_x_eval_ptr = base_stack.tdden_sx_eval_device; + den_y_eval_ptr = base_stack.tdden_sy_eval_device; + den_z_eval_ptr = base_stack.tdden_sz_eval_device; + tau_eval_ptr = base_stack.ttau_s_eval_device; + lapl_eval_ptr = base_stack.tlapl_s_eval_device; + break; + case DEN_Z: + den_eval_ptr = base_stack.tden_z_eval_device; + den_x_eval_ptr = base_stack.tdden_zx_eval_device; + den_y_eval_ptr = base_stack.tdden_zy_eval_device; + den_z_eval_ptr = base_stack.tdden_zz_eval_device; + tau_eval_ptr = base_stack.ttau_z_eval_device; + lapl_eval_ptr = base_stack.tlapl_z_eval_device; + break; + case DEN_Y: + den_eval_ptr = base_stack.tden_y_eval_device; + den_x_eval_ptr = base_stack.tdden_yx_eval_device; + den_y_eval_ptr = base_stack.tdden_yy_eval_device; + den_z_eval_ptr = base_stack.tdden_yz_eval_device; + break; + case DEN_X: + den_eval_ptr = base_stack.tden_x_eval_device; + den_x_eval_ptr = base_stack.tdden_xx_eval_device; + den_y_eval_ptr = base_stack.tdden_xy_eval_device; + den_z_eval_ptr = base_stack.tdden_xz_eval_device; + break; + default: + GAUXC_GENERIC_EXCEPTION( "eval_vvars_gga_trial called with invalid density selected!" 
); + } + + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_eval_ptr, "Den Zero" ); + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_x_eval_ptr, "Den Zero" ); + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_y_eval_ptr, "Den Zero" ); + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_z_eval_ptr, "Den Zero" ); + if(tau_eval_ptr) + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, tau_eval_ptr, "TAU Zero"); + if(lapl_eval_ptr) + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, lapl_eval_ptr, "LAPL Zero"); + + // Evaluate V variable + auto aos_stack = data->aos_stack; + GauXC::eval_vvars_mgga_trial( ntasks, nbe_max, npts_max, den_select, need_lapl, + aos_stack.device_tasks, data->device_backend_->queue() ); + +} + + +template +void interleave_kernel_input(size_t len, const T* src_data, int src_stride, T* tgt_data, int tgt_stride, std::string msg, + DeviceBackend* backend) { + backend->copy_async_2d(1, len, src_data, src_stride, tgt_data, tgt_stride, msg); +} + +template +void interleave_lda_input(size_t npts, T& base_stack, DeviceBackend* backend) { + interleave_kernel_input(npts, base_stack.den_s_eval_device, 1, base_stack.den_interleaved_device+0, 2, + "den_+ - > den_interleaved", backend); + interleave_kernel_input(npts, base_stack.den_z_eval_device, 1, base_stack.den_interleaved_device+1, 2, + "den_- - > den_interleaved", backend); +} + +template +void interleave_gga_input(size_t npts, T& base_stack, DeviceBackend* backend) { + interleave_lda_input(npts, base_stack, backend); + interleave_kernel_input(npts, base_stack.gamma_pp_eval_device, 1, base_stack.gamma_eval_device+0, 3, + "gamma_++ - > gamma_interleaved", backend); + interleave_kernel_input(npts, base_stack.gamma_pm_eval_device, 1, base_stack.gamma_eval_device+1, 3, + "gamma_+- - > gamma_interleaved", backend); + interleave_kernel_input(npts, base_stack.gamma_mm_eval_device, 1, base_stack.gamma_eval_device+2, 3, + "gamma_-- - > gamma_interleaved", backend); +} + +template +void interleave_mgga_input(size_t npts, T& base_stack, DeviceBackend* backend, bool need_lapl) { + interleave_gga_input(npts, base_stack, backend); + interleave_kernel_input(npts, base_stack.tau_s_eval_device, 1, base_stack.tau_interleaved_device, 2, + "tau_+ - > tau_interleaved", backend); + interleave_kernel_input(npts, base_stack.tau_z_eval_device, 1, base_stack.tau_interleaved_device+1, 2, + "tau_- - > tau_interleaved", backend); + if(need_lapl) { + interleave_kernel_input(npts, base_stack.lapl_s_eval_device, 1, base_stack.lapl_interleaved_device, 2, + "lapl_+ - > lapl_interleaved", backend); + interleave_kernel_input(npts, base_stack.lapl_z_eval_device, 1, base_stack.lapl_interleaved_device+1, 2, + "lapl_- - > lapl_interleaved", backend); + } +} + + + +template +void deinterleave_lda_output(size_t npts, T& base_stack, DeviceBackend* backend) { + interleave_kernel_input(npts, base_stack.vrho_eval_device+0, 2, base_stack.vrho_pos_eval_device, 1, + "vrho -> vrho+", backend); + interleave_kernel_input(npts, base_stack.vrho_eval_device+1, 2, base_stack.vrho_neg_eval_device, 1, + "vrho -> vrho-", backend); +} + +template +void deinterleave_gga_output(size_t npts, T& base_stack, DeviceBackend* backend) { + deinterleave_lda_output(npts, base_stack, backend); + interleave_kernel_input(npts, base_stack.vgamma_eval_device+0, 3, base_stack.vgamma_pp_eval_device, 
1, + "vgamma -> vgamma++", backend); + interleave_kernel_input(npts, base_stack.vgamma_eval_device+1, 3, base_stack.vgamma_pm_eval_device, 1, + "vgamma -> vgamma+-", backend); + interleave_kernel_input(npts, base_stack.vgamma_eval_device+2, 3, base_stack.vgamma_mm_eval_device, 1, + "vgamma -> vgamma--", backend); +} + +template +void deinterleave_mgga_output(size_t npts, T& base_stack, DeviceBackend* backend, bool need_lapl) { + deinterleave_gga_output(npts, base_stack, backend); + interleave_kernel_input(npts, base_stack.vtau_eval_device+0, 2, base_stack.vtau_pos_eval_device, 1, + "vtau -> vtau+", backend); + interleave_kernel_input(npts, base_stack.vtau_eval_device+1, 2, base_stack.vtau_neg_eval_device, 1, + "vtau -> vtau-", backend); + if(need_lapl) { + interleave_kernel_input(npts, base_stack.vlapl_eval_device+0, 2, base_stack.vlapl_pos_eval_device, 1, + "vlapl -> vlapl+", backend); + interleave_kernel_input(npts, base_stack.vlapl_eval_device+1, 2, base_stack.vlapl_neg_eval_device, 1, + "vlapl -> vlapl-", backend); + } +} + +template +void deinterleave_vxc_fxc_lda(size_t npts, T& base_stack, DeviceBackend* backend) { + // Deinterleave the lda vxc output + deinterleave_lda_output(npts, base_stack, backend); + interleave_kernel_input(npts, base_stack.v2rho2_eval_device+0, 3, base_stack.v2rho2_a_a_eval_device, 1, + "v2rho2 -> v2rho2_aa", backend); + interleave_kernel_input(npts, base_stack.v2rho2_eval_device+1, 3, base_stack.v2rho2_a_b_eval_device, 1, + "v2rho2 -> v2rho2_ab", backend); + interleave_kernel_input(npts, base_stack.v2rho2_eval_device+2, 3, base_stack.v2rho2_b_b_eval_device, 1, + "v2rho2 -> v2rho2_bb", backend); +} + +template +void deinterleave_vxc_fxc_gga(size_t npts, T& base_stack, DeviceBackend* backend) { + deinterleave_vxc_fxc_lda(npts, base_stack, backend); + // Deinterleave the gga vxc output + deinterleave_gga_output(npts, base_stack, backend); + + interleave_kernel_input(npts, base_stack.v2rhogamma_eval_device+0, 6, base_stack.v2rhogamma_a_aa_eval_device, 1, + "v2rhogamma -> v2rhogamma_a_aa", backend); + interleave_kernel_input(npts, base_stack.v2rhogamma_eval_device+1, 6, base_stack.v2rhogamma_a_ab_eval_device, 1, + "v2rhogamma -> v2rhogamma_a_ab", backend); + interleave_kernel_input(npts, base_stack.v2rhogamma_eval_device+2, 6, base_stack.v2rhogamma_a_bb_eval_device, 1, + "v2rhogamma -> v2rhogamma_a_bb", backend); + interleave_kernel_input(npts, base_stack.v2rhogamma_eval_device+3, 6, base_stack.v2rhogamma_b_aa_eval_device, 1, + "v2rhogamma -> v2rhogamma_b_aa", backend); + interleave_kernel_input(npts, base_stack.v2rhogamma_eval_device+4, 6, base_stack.v2rhogamma_b_ab_eval_device, 1, + "v2rhogamma -> v2rhogamma_b_ab", backend); + interleave_kernel_input(npts, base_stack.v2rhogamma_eval_device+5, 6, base_stack.v2rhogamma_b_bb_eval_device, 1, + "v2rhogamma -> v2rhogamma_b_bb", backend); + interleave_kernel_input(npts, base_stack.v2gamma2_eval_device+0, 6, base_stack.v2gamma2_aa_aa_eval_device, 1, + "v2gamma2 -> v2gamma2_aa_aa", backend); + interleave_kernel_input(npts, base_stack.v2gamma2_eval_device+1, 6, base_stack.v2gamma2_aa_ab_eval_device, 1, + "v2gamma2 -> v2gamma2_aa_ab", backend); + interleave_kernel_input(npts, base_stack.v2gamma2_eval_device+2, 6, base_stack.v2gamma2_aa_bb_eval_device, 1, + "v2gamma2 -> v2gamma2_aa_bb", backend); + interleave_kernel_input(npts, base_stack.v2gamma2_eval_device+3, 6, base_stack.v2gamma2_ab_ab_eval_device, 1, + "v2gamma2 -> v2gamma2_ab_ab", backend); + interleave_kernel_input(npts, base_stack.v2gamma2_eval_device+4, 6, 
base_stack.v2gamma2_ab_bb_eval_device, 1, + "v2gamma2 -> v2gamma2_ab_bb", backend); + interleave_kernel_input(npts, base_stack.v2gamma2_eval_device+5, 6, base_stack.v2gamma2_bb_bb_eval_device, 1, + "v2gamma2 -> v2gamma2_bb_bb", backend); +} + +template +void deinterleave_vxc_fxc_mgga(size_t npts, T& base_stack, DeviceBackend* backend, bool need_lapl) { + deinterleave_vxc_fxc_gga(npts, base_stack, backend); + // Deinterleave the mgga vxc output + deinterleave_mgga_output(npts, base_stack, backend, need_lapl); + + interleave_kernel_input(npts, base_stack.v2rhotau_eval_device+0, 4, base_stack.v2rhotau_a_a_eval_device, 1, + "v2rhotau -> v2rhotau_a_a", backend); + interleave_kernel_input(npts, base_stack.v2rhotau_eval_device+1, 4, base_stack.v2rhotau_a_b_eval_device, 1, + "v2rhotau -> v2rhotau_a_b", backend); + interleave_kernel_input(npts, base_stack.v2rhotau_eval_device+2, 4, base_stack.v2rhotau_b_a_eval_device, 1, + "v2rhotau -> v2rhotau_b_a", backend); + interleave_kernel_input(npts, base_stack.v2rhotau_eval_device+3, 4, base_stack.v2rhotau_b_b_eval_device, 1, + "v2rhotau -> v2rhotau_b_b", backend); + interleave_kernel_input(npts, base_stack.v2gammatau_eval_device+0, 6, base_stack.v2gammatau_aa_a_eval_device, 1, + "v2gammatau -> v2gammatau_aa_a", backend); + interleave_kernel_input(npts, base_stack.v2gammatau_eval_device+1, 6, base_stack.v2gammatau_aa_b_eval_device, 1, + "v2gammatau -> v2gammatau_aa_b", backend); + interleave_kernel_input(npts, base_stack.v2gammatau_eval_device+2, 6, base_stack.v2gammatau_ab_a_eval_device, 1, + "v2gammatau -> v2gammatau_ab_a", backend); + interleave_kernel_input(npts, base_stack.v2gammatau_eval_device+3, 6, base_stack.v2gammatau_ab_b_eval_device, 1, + "v2gammatau -> v2gammatau_ab_b", backend); + interleave_kernel_input(npts, base_stack.v2gammatau_eval_device+4, 6, base_stack.v2gammatau_bb_a_eval_device, 1, + "v2gammatau -> v2gammatau_bb_a", backend); + interleave_kernel_input(npts, base_stack.v2gammatau_eval_device+5, 6, base_stack.v2gammatau_bb_b_eval_device, 1, + "v2gammatau -> v2gammatau_bb_b", backend); + interleave_kernel_input(npts, base_stack.v2tau2_eval_device+0, 3, base_stack.v2tau2_a_a_eval_device, 1, + "v2tau2 -> v2tau2_a_a", backend); + interleave_kernel_input(npts, base_stack.v2tau2_eval_device+1, 3, base_stack.v2tau2_a_b_eval_device, 1, + "v2tau2 -> v2tau2_a_b", backend); + interleave_kernel_input(npts, base_stack.v2tau2_eval_device+2, 3, base_stack.v2tau2_b_b_eval_device, 1, + "v2tau2 -> v2tau2_b_b", backend); + + if (need_lapl) { + interleave_kernel_input(npts, base_stack.v2rholapl_eval_device+0, 4, base_stack.v2rholapl_a_a_eval_device, 1, + "v2rholapl -> v2rholapl_a_a", backend); + interleave_kernel_input(npts, base_stack.v2rholapl_eval_device+1, 4, base_stack.v2rholapl_a_b_eval_device, 1, + "v2rholapl -> v2rholapl_a_b", backend); + interleave_kernel_input(npts, base_stack.v2rholapl_eval_device+2, 4, base_stack.v2rholapl_b_a_eval_device, 1, + "v2rholapl -> v2rholapl_b_a", backend); + interleave_kernel_input(npts, base_stack.v2rholapl_eval_device+3, 4, base_stack.v2rholapl_b_b_eval_device, 1, + "v2rholapl -> v2rholapl_b_b", backend); + interleave_kernel_input(npts, base_stack.v2gammalapl_eval_device+0, 6, base_stack.v2gammalapl_aa_a_eval_device, 1, + "v2gammalapl -> v2gammalapl_aa_a", backend); + interleave_kernel_input(npts, base_stack.v2gammalapl_eval_device+1, 6, base_stack.v2gammalapl_aa_b_eval_device, 1, + "v2gammalapl -> v2gammalapl_aa_b", backend); + interleave_kernel_input(npts, base_stack.v2gammalapl_eval_device+2, 6, 
base_stack.v2gammalapl_ab_a_eval_device, 1, + "v2gammalapl -> v2gammalapl_ab_a", backend); + interleave_kernel_input(npts, base_stack.v2gammalapl_eval_device+3, 6, base_stack.v2gammalapl_ab_b_eval_device, 1, + "v2gammalapl -> v2gammalapl_ab_b", backend); + interleave_kernel_input(npts, base_stack.v2gammalapl_eval_device+4, 6, base_stack.v2gammalapl_bb_a_eval_device, 1, + "v2gammalapl -> v2gammalapl_bb_a", backend); + interleave_kernel_input(npts, base_stack.v2gammalapl_eval_device+5, 6, base_stack.v2gammalapl_bb_b_eval_device, 1, + "v2gammalapl -> v2gammalapl_bb_b", backend); + interleave_kernel_input(npts, base_stack.v2lapl2_eval_device+0, 3, base_stack.v2lapl2_a_a_eval_device, 1, + "v2lapl2 -> v2lapl2_a_a", backend); + interleave_kernel_input(npts, base_stack.v2lapl2_eval_device+1, 3, base_stack.v2lapl2_a_b_eval_device, 1, + "v2lapl2 -> v2lapl2_a_b", backend); + interleave_kernel_input(npts, base_stack.v2lapl2_eval_device+2, 3, base_stack.v2lapl2_b_b_eval_device, 1, + "v2lapl2 -> v2lapl2_b_b", backend); + interleave_kernel_input(npts, base_stack.v2lapltau_eval_device+0, 4, base_stack.v2lapltau_a_a_eval_device, 1, + "v2lapltau -> v2lapltau_a_a", backend); + interleave_kernel_input(npts, base_stack.v2lapltau_eval_device+1, 4, base_stack.v2lapltau_a_b_eval_device, 1, + "v2lapltau -> v2lapltau_a_b", backend); + interleave_kernel_input(npts, base_stack.v2lapltau_eval_device+2, 4, base_stack.v2lapltau_b_a_eval_device, 1, + "v2lapltau -> v2lapltau_b_a", backend); + interleave_kernel_input(npts, base_stack.v2lapltau_eval_device+3, 4, base_stack.v2lapltau_b_b_eval_device, 1, + "v2lapltau -> v2lapltau_b_b", backend); + } +} + +template +void scale_lda_output(size_t npts, T& base_stack, DeviceBackend* backend, bool is_pol) { + hadamard_product(backend->master_blas_handle(), npts, 1, base_stack.weights_device, 1, + base_stack.eps_eval_device, 1); + if(is_pol) { + hadamard_product(backend->master_blas_handle(), npts, 1, base_stack.weights_device, 1, + base_stack.vrho_pos_eval_device, 1); + hadamard_product(backend->master_blas_handle(), npts, 1, base_stack.weights_device, 1, + base_stack.vrho_neg_eval_device, 1); + } else { + hadamard_product(backend->master_blas_handle(), npts, 1, base_stack.weights_device, 1, + base_stack.vrho_eval_device, 1); + } +} + +template +void scale_gga_output(size_t npts, T& base_stack, DeviceBackend* backend, bool is_pol) { + scale_lda_output(npts, base_stack, backend, is_pol); + if(is_pol) { + hadamard_product(backend->master_blas_handle(), npts, 1, base_stack.weights_device, 1, + base_stack.vgamma_pp_eval_device, 1); + hadamard_product(backend->master_blas_handle(), npts, 1, base_stack.weights_device, 1, + base_stack.vgamma_pm_eval_device, 1); + hadamard_product(backend->master_blas_handle(), npts, 1, base_stack.weights_device, 1, + base_stack.vgamma_mm_eval_device, 1); + } else { + hadamard_product(backend->master_blas_handle(), npts, 1, base_stack.weights_device, 1, + base_stack.vgamma_eval_device, 1); + } +} + +template +void scale_mgga_output(size_t npts, T& base_stack, DeviceBackend* backend, bool need_lapl, bool is_pol) { + scale_gga_output(npts, base_stack, backend, is_pol); + if(is_pol) { + hadamard_product(backend->master_blas_handle(), npts, 1, base_stack.weights_device, 1, + base_stack.vtau_pos_eval_device, 1); + hadamard_product(backend->master_blas_handle(), npts, 1, base_stack.weights_device, 1, + base_stack.vtau_neg_eval_device, 1); + if(need_lapl) { + hadamard_product(backend->master_blas_handle(), npts, 1, base_stack.weights_device, 1, + 
base_stack.vlapl_pos_eval_device, 1); + hadamard_product(backend->master_blas_handle(), npts, 1, base_stack.weights_device, 1, + base_stack.vlapl_neg_eval_device, 1); + } + } else { + hadamard_product(backend->master_blas_handle(), npts, 1, base_stack.weights_device, 1, + base_stack.vtau_eval_device, 1); + if(need_lapl) { + hadamard_product(backend->master_blas_handle(), npts, 1, base_stack.weights_device, 1, + base_stack.vlapl_eval_device, 1); + } + } +} + + +void AoSScheme1Base::eval_kern_exc_vxc_lda( const functional_type& func, + XCDeviceData* _data ) { + + auto* data = dynamic_cast(_data); + if( !data ) GAUXC_BAD_LWD_DATA_CAST(); + + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); + + if( !func.is_lda() ) GAUXC_GENERIC_EXCEPTION("XC Kernel not LDA!"); + + auto base_stack = data->base_stack; + + const bool is_RKS = data->allocated_terms.ks_scheme == RKS; + const bool is_UKS = data->allocated_terms.ks_scheme == UKS; + const bool is_GKS = data->allocated_terms.ks_scheme == GKS; + const bool is_pol = is_UKS or is_GKS; + const bool is_excgrad = data->allocated_terms.exc_grad; + + const size_t npts = data->total_npts_task_batch ; + + auto* den_eval_ptr = base_stack.den_s_eval_device; + + if ( is_pol ) { + den_eval_ptr = base_stack.den_interleaved_device; + interleave_lda_input(npts, base_stack, data->device_backend_); + } + + GauXC::eval_kern_exc_vxc_lda( func, npts, + den_eval_ptr, base_stack.eps_eval_device, + base_stack.vrho_eval_device, data->device_backend_->queue() ); + + if(is_pol) deinterleave_lda_output(npts, base_stack, data->device_backend_); + scale_lda_output(npts, base_stack, data->device_backend_, is_pol); + + data->device_backend_->check_error("exc_vxc lda" __FILE__ ": " + std::to_string(__LINE__)); +} + + +void AoSScheme1Base::eval_kern_exc_vxc_gga( const functional_type& func, + XCDeviceData* _data ) { + + auto* data = dynamic_cast(_data); + if( !data ) GAUXC_BAD_LWD_DATA_CAST(); + + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); + + if( !func.is_gga() ) GAUXC_GENERIC_EXCEPTION("XC Kernel not GGA!"); + + auto base_stack = data->base_stack; + double* den_eval_ptr = base_stack.den_s_eval_device; + + const bool is_RKS = data->allocated_terms.ks_scheme == RKS; + const bool is_UKS = data->allocated_terms.ks_scheme == UKS; + const bool is_GKS = data->allocated_terms.ks_scheme == GKS; + const bool is_pol = is_UKS or is_GKS; + const bool is_excgrad = data->allocated_terms.exc_grad; + + const size_t npts = data->total_npts_task_batch ; + + if(is_pol) { + den_eval_ptr = base_stack.den_interleaved_device; + interleave_gga_input(npts, base_stack, data->device_backend_); + } + + GauXC::eval_kern_exc_vxc_gga( func, data->total_npts_task_batch, + den_eval_ptr, base_stack.gamma_eval_device, + base_stack.eps_eval_device, base_stack.vrho_eval_device, + base_stack.vgamma_eval_device, data->device_backend_->queue() ); + + if(is_pol) deinterleave_gga_output(npts, base_stack, data->device_backend_); + scale_gga_output(npts, base_stack, data->device_backend_, is_pol); + + data->device_backend_->check_error("exc_vxc gga" __FILE__ ": " + std::to_string(__LINE__)); +} + + +void AoSScheme1Base::eval_kern_exc_vxc_mgga( const functional_type& func, + XCDeviceData* _data ) { + + auto* data = dynamic_cast(_data); + if( !data ) GAUXC_BAD_LWD_DATA_CAST(); + + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); + + if( !func.is_mgga() ) GAUXC_GENERIC_EXCEPTION("XC Kernel not MGGA!"); + + auto base_stack = data->base_stack; + double* 
den_eval_ptr = base_stack.den_s_eval_device; + double* tau_eval_ptr = base_stack.tau_s_eval_device; + double* lapl_eval_ptr = base_stack.lapl_s_eval_device; + + const bool is_RKS = data->allocated_terms.ks_scheme == RKS; + const bool is_UKS = data->allocated_terms.ks_scheme == UKS; + const bool is_GKS = data->allocated_terms.ks_scheme == GKS; + const bool is_pol = is_UKS or is_GKS; + const bool is_excgrad = data->allocated_terms.exc_grad; + + const size_t npts = data->total_npts_task_batch ; + + if(is_pol) { + den_eval_ptr = base_stack.den_interleaved_device; + tau_eval_ptr = base_stack.tau_interleaved_device; + lapl_eval_ptr = base_stack.lapl_interleaved_device; + interleave_mgga_input(npts, base_stack, data->device_backend_, func.needs_laplacian()); + } GauXC::eval_kern_exc_vxc_mgga( func, data->total_npts_task_batch, - base_stack.den_s_eval_device, base_stack.gamma_eval_device, - base_stack.tau_eval_device, base_stack.den_lapl_eval_device, + den_eval_ptr, base_stack.gamma_eval_device, + tau_eval_ptr, lapl_eval_ptr, base_stack.eps_eval_device, base_stack.vrho_eval_device, base_stack.vgamma_eval_device, base_stack.vtau_eval_device, base_stack.vlapl_eval_device, data->device_backend_->queue() ); + + if(is_pol) deinterleave_mgga_output(npts, base_stack, data->device_backend_, func.needs_laplacian()); + scale_mgga_output(npts, base_stack, data->device_backend_, func.needs_laplacian(), is_pol); + data->device_backend_->check_error("exc_vxc mgga" __FILE__ ": " + std::to_string(__LINE__)); +} + + +void AoSScheme1Base::eval_kern_vxc_fxc_lda( const functional_type& func, + XCDeviceData* _data ) { + + auto* data = dynamic_cast(_data); + if( !data ) GAUXC_BAD_LWD_DATA_CAST(); + + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); - hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, - base_stack.weights_device, 1, base_stack.eps_eval_device, 1 ); - hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, - base_stack.weights_device, 1, base_stack.vrho_eval_device, 1 ); - hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, - base_stack.weights_device, 1, base_stack.vgamma_eval_device, 1 ); - hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, - base_stack.weights_device, 1, base_stack.vtau_eval_device, 1 ); - if(func.needs_laplacian()) { - hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, - base_stack.weights_device, 1, base_stack.vlapl_eval_device, 1 ); + if( !func.is_lda() ) GAUXC_GENERIC_EXCEPTION("XC Kernel not LDA!"); + + auto base_stack = data->base_stack; + + const bool is_UKS = data->allocated_terms.ks_scheme == UKS; + const bool is_GKS = data->allocated_terms.ks_scheme == GKS; + const bool is_pol = is_UKS or is_GKS; + + const size_t npts = data->total_npts_task_batch ; + + auto* den_eval_ptr = base_stack.den_s_eval_device; + + if ( is_pol ) { + den_eval_ptr = base_stack.den_interleaved_device; + interleave_lda_input(npts, base_stack, data->device_backend_); } + GauXC::eval_kern_vxc_fxc_lda( func, npts, + den_eval_ptr, base_stack.vrho_eval_device, + base_stack.v2rho2_eval_device, data->device_backend_->queue() ); + + if(is_pol) deinterleave_vxc_fxc_lda(npts, base_stack, data->device_backend_); + // For 2nd derivative, we do not scale the output + // We will multiply it with the weights to the intermediate outputs A, B, C - 
data->device_backend_->check_error("exc_vxc mgga" __FILE__ ": " + std::to_string(__LINE__)); + data->device_backend_->check_error("exc_vxc_fxc lda" __FILE__ ": " + std::to_string(__LINE__)); } +void AoSScheme1Base::eval_kern_vxc_fxc_gga( const functional_type& func, + XCDeviceData* _data ) { + auto* data = dynamic_cast(_data); + if( !data ) GAUXC_BAD_LWD_DATA_CAST(); + + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); + if( !func.is_gga() ) GAUXC_GENERIC_EXCEPTION("XC Kernel not GGA!"); + + auto base_stack = data->base_stack; + double* den_eval_ptr = base_stack.den_s_eval_device; + + const bool is_UKS = data->allocated_terms.ks_scheme == UKS; + const bool is_GKS = data->allocated_terms.ks_scheme == GKS; + const bool is_pol = is_UKS or is_GKS; + const size_t npts = data->total_npts_task_batch ; + + if(is_pol) { + den_eval_ptr = base_stack.den_interleaved_device; + interleave_gga_input(npts, base_stack, data->device_backend_); + } + GauXC::eval_kern_vxc_fxc_gga( func, npts, + den_eval_ptr, base_stack.gamma_eval_device, + base_stack.vrho_eval_device, base_stack.vgamma_eval_device, + base_stack.v2rho2_eval_device, base_stack.v2rhogamma_eval_device, base_stack.v2gamma2_eval_device, + data->device_backend_->queue() ); + if(is_pol) deinterleave_vxc_fxc_gga(npts, base_stack, data->device_backend_); + + // For 2nd derivative, we do not scale the output + // We will multiply it with the weights to the intermediate outputs A, B, C -void AoSScheme1Base::eval_xmat( double fac, XCDeviceData* _data, bool do_grad, density_id den_select ){ + + data->device_backend_->check_error("exc_vxc_fxc gga" __FILE__ ": " + std::to_string(__LINE__)); +} + + +void AoSScheme1Base::eval_kern_vxc_fxc_mgga( const functional_type& func, + XCDeviceData* _data ) { + + auto* data = dynamic_cast(_data); + if( !data ) GAUXC_BAD_LWD_DATA_CAST(); + + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); + + if( !func.is_mgga() ) GAUXC_GENERIC_EXCEPTION("XC Kernel not MGGA!"); + + auto base_stack = data->base_stack; + double* den_eval_ptr = base_stack.den_s_eval_device; + double* tau_eval_ptr = base_stack.tau_s_eval_device; + double* lapl_eval_ptr = base_stack.lapl_s_eval_device; + + const bool is_UKS = data->allocated_terms.ks_scheme == UKS; + const bool is_GKS = data->allocated_terms.ks_scheme == GKS; + const bool is_pol = is_UKS or is_GKS; + + const size_t npts = data->total_npts_task_batch ; + + if(is_pol) { + den_eval_ptr = base_stack.den_interleaved_device; + tau_eval_ptr = base_stack.tau_interleaved_device; + lapl_eval_ptr = base_stack.lapl_interleaved_device; + interleave_mgga_input(npts, base_stack, data->device_backend_, func.needs_laplacian()); + } + + GauXC::eval_kern_vxc_fxc_mgga( func, npts, + den_eval_ptr, base_stack.gamma_eval_device, + lapl_eval_ptr, tau_eval_ptr, + base_stack.vrho_eval_device, base_stack.vgamma_eval_device, + base_stack.vlapl_eval_device, base_stack.vtau_eval_device, + base_stack.v2rho2_eval_device, base_stack.v2rhogamma_eval_device, + base_stack.v2rholapl_eval_device, base_stack.v2rhotau_eval_device, + base_stack.v2gamma2_eval_device, base_stack.v2gammalapl_eval_device, + base_stack.v2gammatau_eval_device, base_stack.v2lapl2_eval_device, + base_stack.v2lapltau_eval_device, base_stack.v2tau2_eval_device, + data->device_backend_->queue() ); + + if(is_pol) deinterleave_vxc_fxc_mgga(npts, base_stack, data->device_backend_, func.needs_laplacian()); + + // For 2nd derivative, we do not scale the output + // We will multiply it with the weights to the intermediate 
outputs A, B, C + + data->device_backend_->check_error("exc_vxc_fxc mgga" __FILE__ ": " + std::to_string(__LINE__)); +} + +template <bool is_trial> +void AoSScheme1Base::eval_xmat_impl( double fac, XCDeviceData* _data, bool do_grad, density_id den_select ){ auto* data = dynamic_cast(_data); if( !data ) GAUXC_BAD_LWD_DATA_CAST(); @@ -849,22 +1616,12 @@ void AoSScheme1Base::eval_xmat( double fac, XCDeviceData* _data, bool do_grad, d const auto submat_block_size = data->get_submat_chunk_size( nbf, 0 ); auto static_stack = data->static_stack; auto aos_stack = data->aos_stack; - double* dmat_ptr = nullptr; - switch ( den_select ) { - case DEN_S: - dmat_ptr = static_stack.dmat_s_device; - break; - case DEN_Z: - dmat_ptr = static_stack.dmat_z_device; - break; - case DEN_X: - dmat_ptr = static_stack.dmat_x_device; - break; - case DEN_Y: - dmat_ptr = static_stack.dmat_y_device; - break; - default: - GAUXC_GENERIC_EXCEPTION("eval_xmat: den_select not set"); + double * dmat_ptr; + if constexpr (is_trial) { + dmat_ptr = static_stack.tden_selector(den_select); + // the screened trial density matrix is now stored in aos_stack.device_tasks[itask].nbe_scr + } else { + dmat_ptr = static_stack.den_selector(den_select); } // Pack density matrix @@ -900,20 +1657,59 @@ void AoSScheme1Base::eval_xmat( double fac, XCDeviceData* _data, bool do_grad, d } - data->device_backend_->check_error("xmat" __FILE__ ": " + std::to_string(__LINE__)); + data->device_backend_->check_error("xmat impl" __FILE__ ": " + std::to_string(__LINE__)); // Record completion of BLAS ops on master stream data->device_backend_->sync_master_with_blas_pool(); } +void AoSScheme1Base::eval_xmat( double fac, XCDeviceData* _data, bool do_grad, density_id den_select ){ + eval_xmat_impl<false>(fac, _data, do_grad, den_select); +} +void AoSScheme1Base::eval_xmat_trial( double fac, XCDeviceData* _data, bool do_grad, density_id den_select ){ + eval_xmat_impl<true>(fac, _data, do_grad, den_select); +} + +void AoSScheme1Base::save_xmat( XCDeviceData* _data, bool do_grad, density_id den_select ){ + + auto* data = dynamic_cast(_data); + if( !data ) GAUXC_BAD_LWD_DATA_CAST(); + + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); + auto backend = data->device_backend_; + auto aos_stack = data->aos_stack; + const auto sz = data->total_nbe_bfn_npts_task_batch; + + switch(den_select) { + case DEN_S: + backend->copy_async(sz, aos_stack.zmat_vxc_device, aos_stack.xmatS_device, "xmatS copy"); + if(do_grad) { + backend->copy_async(sz, aos_stack.xmat_dx_device, aos_stack.xmatS_dx_device, "xmatS_dx copy"); + backend->copy_async(sz, aos_stack.xmat_dy_device, aos_stack.xmatS_dy_device, "xmatS_dy copy"); + backend->copy_async(sz, aos_stack.xmat_dz_device, aos_stack.xmatS_dz_device, "xmatS_dz copy"); + } + break; + case DEN_Z: + backend->copy_async(sz, aos_stack.zmat_vxc_device, aos_stack.xmatZ_device, "xmatZ copy"); + if(do_grad) { + backend->copy_async(sz, aos_stack.xmat_dx_device, aos_stack.xmatZ_dx_device, "xmatZ_dx copy"); + backend->copy_async(sz, aos_stack.xmat_dy_device, aos_stack.xmatZ_dy_device, "xmatZ_dy copy"); + backend->copy_async(sz, aos_stack.xmat_dz_device, aos_stack.xmatZ_dz_device, "xmatZ_dz copy"); + } + break; + default: + GAUXC_GENERIC_EXCEPTION("Save XMat + GKS NYI"); + } +} -void AoSScheme1Base::inc_vxc( XCDeviceData* _data, density_id den_selector, bool do_m ){ +template <bool is_fxc> +void AoSScheme1Base::inc_potential_impl( XCDeviceData* _data, density_id den_selector, bool do_m ){ auto* data = dynamic_cast(_data); if( !data ) GAUXC_BAD_LWD_DATA_CAST(); @@ -952,28
+1748,30 @@ void AoSScheme1Base::inc_vxc( XCDeviceData* _data, density_id den_selector, bool const auto submat_block_size = data->get_submat_chunk_size( nbf, 0 ); auto static_stack = data->static_stack; auto aos_stack = data->aos_stack; - double* vxc_ptr = nullptr; - switch( den_selector ) { - case DEN_S: - vxc_ptr = static_stack.vxc_s_device; - break; - case DEN_Z: - vxc_ptr = static_stack.vxc_z_device; - break; - case DEN_Y: - vxc_ptr = static_stack.vxc_y_device; - break; - case DEN_X: - vxc_ptr = static_stack.vxc_x_device; - break; - default: - GAUXC_GENERIC_EXCEPTION( "inc_vxc called with invalid density selected" ); + + double* potential_ptr; + if constexpr (is_fxc) { + potential_ptr = static_stack.fxc_selector(den_selector); + // cutlass_stack.vmat_array_device points to aos_stack.device_tasks[itask].nbe_scr + } else { + potential_ptr = static_stack.vxc_selector(den_selector); } + + auto vxc_ptr = static_stack.vxc_selector(den_selector); sym_task_inc_potential( ntasks, aos_stack.device_tasks, - vxc_ptr, nbf, submat_block_size, + potential_ptr, nbf, submat_block_size, data->device_backend_->queue() ); - data->device_backend_->check_error("inc_vxc" __FILE__ ": " + std::to_string(__LINE__)); + data->device_backend_->check_error("inc_potential_ptr" __FILE__ ": " + std::to_string(__LINE__)); +} + + +void AoSScheme1Base::inc_vxc( XCDeviceData* _data, density_id den_selector, bool do_m ){ + inc_potential_impl<false>(_data, den_selector, do_m); +} + +void AoSScheme1Base::inc_fxc( XCDeviceData* _data, density_id den_selector, bool do_m ){ + inc_potential_impl<true>(_data, den_selector, do_m); } @@ -1019,10 +1817,43 @@ void AoSScheme1Base::symmetrize_vxc( XCDeviceData* _data, density_id den_selecto data->device_backend_->check_error("symmetrize vxc" __FILE__ ": " + std::to_string(__LINE__)); } +void AoSScheme1Base::symmetrize_fxc( XCDeviceData* _data, density_id den_selector) { + auto* data = dynamic_cast(_data); + if( !data ) GAUXC_BAD_LWD_DATA_CAST(); + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); -void AoSScheme1Base::inc_exc_grad_lda( XCDeviceData* _data ) { + const auto nbf = data->global_dims.nbf; + auto static_stack = data->static_stack; + switch ( den_selector ) { + case DEN_S: + symmetrize_matrix( nbf, static_stack.fxc_s_device, nbf, + data->device_backend_->queue() ); + break; + case DEN_Z: + symmetrize_matrix( nbf, static_stack.fxc_z_device, nbf, + data->device_backend_->queue() ); + break; + case DEN_Y: + symmetrize_matrix( nbf, static_stack.fxc_y_device, nbf, + data->device_backend_->queue() ); + break; + case DEN_X: + symmetrize_matrix( nbf, static_stack.fxc_x_device, nbf, + data->device_backend_->queue() ); + break; + default: + GAUXC_GENERIC_EXCEPTION( "symmetrize_fxc: invalid density selected" ); + } + + data->device_backend_->check_error("symmetrize fxc" __FILE__ ": " + std::to_string(__LINE__)); +} + + + + +void AoSScheme1Base::inc_exc_grad_lda( XCDeviceData* _data, integrator_ks_scheme ks_scheme, bool with_weight_derivatives ) { #ifdef GAUXC_HAS_HIP GAUXC_GENERIC_EXCEPTION("LDA Grad NYI for HIP Backends"); #else @@ -1032,15 +1863,16 @@ void AoSScheme1Base::inc_exc_grad_lda( XCDeviceData* _data ) { if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); const auto nshell = data->global_dims.nshells; - increment_exc_grad_lda( nshell, + increment_exc_grad_lda( ks_scheme, nshell, data->shell_to_task_stack.shell_to_task_device, data->aos_stack.device_tasks, data->static_stack.exc_grad_device, + with_weight_derivatives, data->device_backend_->queue() );
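For reference, the polarized (UKS/GKS) paths in the eval_kern_exc_vxc_* and eval_kern_vxc_fxc_* wrappers above route the densities through interleave_*_input / deinterleave_*_output helpers around the XC kernel calls. The sketch below is purely illustrative and separate from the patch content: it shows a plausible host-side version of such an interleaved layout, packing alpha/beta values point-by-point as (rho_a[i], rho_b[i]); the actual GauXC device helpers and their buffer layout are not reproduced here.

```cpp
#include <cstddef>

// Illustrative host-side sketch (not the GauXC device kernels): pack separate
// alpha/beta density arrays into the interleaved (rho_a[i], rho_b[i]) layout a
// polarized XC kernel typically expects, and unpack the interleaved vrho output.
void interleave_density(std::size_t npts, const double* rho_a, const double* rho_b,
                        double* rho_interleaved) {
  for (std::size_t i = 0; i < npts; ++i) {
    rho_interleaved[2 * i + 0] = rho_a[i];
    rho_interleaved[2 * i + 1] = rho_b[i];
  }
}

void deinterleave_vrho(std::size_t npts, const double* vrho_interleaved,
                       double* vrho_a, double* vrho_b) {
  for (std::size_t i = 0; i < npts; ++i) {
    vrho_a[i] = vrho_interleaved[2 * i + 0];
    vrho_b[i] = vrho_interleaved[2 * i + 1];
  }
}
```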
#endif } -void AoSScheme1Base::inc_exc_grad_gga( XCDeviceData* _data ) { +void AoSScheme1Base::inc_exc_grad_gga( XCDeviceData* _data, integrator_ks_scheme ks_scheme, bool with_weight_derivatives ) { #ifdef GAUXC_HAS_HIP GAUXC_GENERIC_EXCEPTION("GGA Grad NYI for HIP Backends"); #else @@ -1050,10 +1882,30 @@ void AoSScheme1Base::inc_exc_grad_gga( XCDeviceData* _data ) { if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); const auto nshell = data->global_dims.nshells; - increment_exc_grad_gga( nshell, + increment_exc_grad_gga( ks_scheme, nshell, data->shell_to_task_stack.shell_to_task_device, data->aos_stack.device_tasks, data->static_stack.exc_grad_device, + with_weight_derivatives, + data->device_backend_->queue() ); +#endif +} + +void AoSScheme1Base::inc_exc_grad_mgga( XCDeviceData* _data, integrator_ks_scheme ks_scheme, bool need_lapl, bool with_weight_derivatives ) { +#ifdef GAUXC_HAS_HIP + GAUXC_GENERIC_EXCEPTION("MGGA Grad NYI for HIP Backends"); +#else + auto* data = dynamic_cast(_data); + if( !data ) GAUXC_BAD_LWD_DATA_CAST(); + + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); + + const auto nshell = data->global_dims.nshells; + increment_exc_grad_mgga( ks_scheme, nshell, need_lapl, + data->shell_to_task_stack.shell_to_task_device, + data->aos_stack.device_tasks, + data->static_stack.exc_grad_device, + with_weight_derivatives, data->device_backend_->queue() ); #endif } @@ -1119,15 +1971,23 @@ void AoSScheme1Base::eval_exx_gmat( XCDeviceData* _data, // XXX: Need to add screening capabilities, packing etc //const auto nbf = data->global_dims.nbf; - // XXX: Need to add support for non-cartesian functions - for( auto i = 0ul; i < nshells; ++i ) { - if( basis_map.shell_pure(i) ) - GAUXC_GENERIC_EXCEPTION("GPU EXX + Spherical NYI"); - } if( basis_map.max_l() > 2 ) { GAUXC_GENERIC_EXCEPTION("GPU EXX + L>2 NYI"); } + + // Determine purity of shell types + std::vector sph_am(basis_map.max_l()+1); + for( auto i = 0ul; i < nshells; ++i ) { + sph_am[basis_map.shell_l(i)] = sph_am[basis_map.shell_l(i)] | basis_map.shell_pure(i); + } + + // Sanity Check + for( auto i = 0ul; i < nshells; ++i ) { + if(basis_map.shell_pure(i) != sph_am[basis_map.shell_l(i)]) + GAUXC_GENERIC_EXCEPTION("GPU EXX requires all shells of the same angular momentum to have the same purity"); + } + // Zero out G for( auto& task : tasks ) { @@ -1167,7 +2027,7 @@ void AoSScheme1Base::eval_exx_gmat( XCDeviceData* _data, data->device_backend_->check_error("integral_0_task_batched" __FILE__ ": " + std::to_string(__LINE__)); if(basis_map.max_l() > 0) { XGPU::integral_1_task_batched( - tasks.size(), data->subtask.size(), + sph_am[1], tasks.size(), data->subtask.size(), data->l_batch_diag_task_to_shell_pair_device[1].max_prim_pairs, 0, data->aos_stack.device_tasks, data->l_batch_diag_task_to_shell_pair_device[1].task_to_shell_pair_device, @@ -1183,7 +2043,7 @@ void AoSScheme1Base::eval_exx_gmat( XCDeviceData* _data, } if(basis_map.max_l() > 1) { XGPU::integral_2_task_batched( - tasks.size(), data->subtask.size(), + sph_am[2], tasks.size(), data->subtask.size(), data->l_batch_diag_task_to_shell_pair_device[2].max_prim_pairs, 0, data->aos_stack.device_tasks, data->l_batch_diag_task_to_shell_pair_device[2].task_to_shell_pair_device, @@ -1217,7 +2077,7 @@ void AoSScheme1Base::eval_exx_gmat( XCDeviceData* _data, if(basis_map.max_l() > 0) { XGPU::integral_1_1_task_batched( - tasks.size(), data->subtask.size(), + sph_am[1], tasks.size(), data->subtask.size(), 
data->l_batch_task_to_shell_pair_device[SP_LBATCH_IDX(1,1)].max_prim_pairs, 0, data->aos_stack.device_tasks, data->l_batch_task_to_shell_pair_device[SP_LBATCH_IDX(1,1)].task_to_shell_pair_device, @@ -1234,7 +2094,7 @@ void AoSScheme1Base::eval_exx_gmat( XCDeviceData* _data, if(basis_map.max_l() > 1) { XGPU::integral_2_2_task_batched( - tasks.size(), data->subtask.size(), + sph_am[2], tasks.size(), data->subtask.size(), data->l_batch_task_to_shell_pair_device[SP_LBATCH_IDX(2,2)].max_prim_pairs, 0, data->aos_stack.device_tasks, data->l_batch_task_to_shell_pair_device[SP_LBATCH_IDX(2,2)].task_to_shell_pair_device, @@ -1250,7 +2110,7 @@ void AoSScheme1Base::eval_exx_gmat( XCDeviceData* _data, } if(basis_map.max_l() > 0) { - XGPU::integral_1_0_task_batched( true, + XGPU::integral_1_0_task_batched( true, sph_am[1], tasks.size(), data->subtask.size(), data->l_batch_task_to_shell_pair_device[SP_LBATCH_IDX(0,1)].max_prim_pairs, 0, data->aos_stack.device_tasks, @@ -1267,7 +2127,7 @@ void AoSScheme1Base::eval_exx_gmat( XCDeviceData* _data, } if(basis_map.max_l() > 0) { - XGPU::integral_1_0_task_batched( false, + XGPU::integral_1_0_task_batched( false, sph_am[1], tasks.size(), data->subtask.size(), data->l_batch_task_to_shell_pair_device[SP_LBATCH_IDX(1,0)].max_prim_pairs, 0, data->aos_stack.device_tasks, @@ -1284,7 +2144,7 @@ void AoSScheme1Base::eval_exx_gmat( XCDeviceData* _data, } if(basis_map.max_l() > 1) { - XGPU::integral_2_0_task_batched( true, + XGPU::integral_2_0_task_batched( true, sph_am[2], tasks.size(), data->subtask.size(), data->l_batch_task_to_shell_pair_device[SP_LBATCH_IDX(0,2)].max_prim_pairs, 0, data->aos_stack.device_tasks, @@ -1301,7 +2161,7 @@ void AoSScheme1Base::eval_exx_gmat( XCDeviceData* _data, } if(basis_map.max_l() > 1) { - XGPU::integral_2_0_task_batched( false, + XGPU::integral_2_0_task_batched( false, sph_am[2], tasks.size(), data->subtask.size(), data->l_batch_task_to_shell_pair_device[SP_LBATCH_IDX(2,0)].max_prim_pairs, 0, data->aos_stack.device_tasks, @@ -1318,7 +2178,7 @@ void AoSScheme1Base::eval_exx_gmat( XCDeviceData* _data, } if(basis_map.max_l() > 1) { - XGPU::integral_2_1_task_batched( true, + XGPU::integral_2_1_task_batched( true, sph_am[2], sph_am[1], tasks.size(), data->subtask.size(), data->l_batch_task_to_shell_pair_device[SP_LBATCH_IDX(1,2)].max_prim_pairs, 0, data->aos_stack.device_tasks, @@ -1335,7 +2195,7 @@ void AoSScheme1Base::eval_exx_gmat( XCDeviceData* _data, } if(basis_map.max_l() > 1) { - XGPU::integral_2_1_task_batched( false, + XGPU::integral_2_1_task_batched( false, sph_am[2], sph_am[1], tasks.size(), data->subtask.size(), data->l_batch_task_to_shell_pair_device[SP_LBATCH_IDX(2,1)].max_prim_pairs, 0, data->aos_stack.device_tasks, diff --git a/src/xc_integrator/local_work_driver/device/scheme1_base.hpp b/src/xc_integrator/local_work_driver/device/scheme1_base.hpp index 37964914..6a04d436 100644 --- a/src/xc_integrator/local_work_driver/device/scheme1_base.hpp +++ b/src/xc_integrator/local_work_driver/device/scheme1_base.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
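The eval_exx_gmat changes above replace the blanket rejection of spherical shells with a per-angular-momentum purity flag (sph_am) that is forwarded to the batched integral kernels, plus a sanity check that every shell of a given angular momentum shares the same purity. A minimal self-contained sketch of that consistency check, using a hypothetical ShellInfo stand-in instead of BasisSetMap, is:

```cpp
#include <stdexcept>
#include <vector>

struct ShellInfo { int l; bool pure; };  // hypothetical stand-in for BasisSetMap queries

// Record whether any shell of a given l is spherical, then require that every
// shell of that l agrees, mirroring the sph_am bookkeeping in eval_exx_gmat.
void check_uniform_purity(const std::vector<ShellInfo>& shells, int max_l) {
  std::vector<bool> sph_am(max_l + 1, false);
  for (const auto& sh : shells) sph_am[sh.l] = sph_am[sh.l] || sh.pure;
  for (const auto& sh : shells)
    if (sh.pure != static_cast<bool>(sph_am[sh.l]))
      throw std::runtime_error(
        "GPU EXX requires all shells of the same angular momentum to have the same purity");
}
```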
* * See LICENSE.txt for details */ @@ -18,25 +22,46 @@ struct AoSScheme1Base : public detail::LocalDeviceWorkDriverPIMPL { void eval_collocation_gradient( XCDeviceData* ) override final; void eval_collocation_hessian( XCDeviceData* ) override final; void eval_collocation_laplacian( XCDeviceData* ) override final; + void eval_collocation_lapgrad( XCDeviceData* ) override final; void eval_uvars_lda( XCDeviceData*, integrator_ks_scheme ) override final; void eval_uvars_gga( XCDeviceData*, integrator_ks_scheme ) override final; - void eval_uvars_mgga( XCDeviceData*, bool ) override final; - void eval_vvar( XCDeviceData*, density_id, bool ) override final; + void eval_uvars_mgga( XCDeviceData*, integrator_ks_scheme, bool ) override final; + void eval_vvars_lda ( XCDeviceData*, density_id ) override final; + void eval_vvars_gga ( XCDeviceData*, density_id ) override final; + void eval_vvars_mgga( XCDeviceData*, density_id, bool ) override final; + + void eval_tmat_lda( XCDeviceData*, integrator_ks_scheme ) override final; + void eval_tmat_gga( XCDeviceData*, integrator_ks_scheme ) override final; + void eval_tmat_mgga( XCDeviceData*, integrator_ks_scheme, bool ) override final; + void eval_vvars_lda_trial ( XCDeviceData*, density_id ) override final; + void eval_vvars_gga_trial ( XCDeviceData*, density_id ) override final; + void eval_vvars_mgga_trial( XCDeviceData*, density_id, bool ) override final; void eval_zmat_lda_vxc( XCDeviceData*, integrator_ks_scheme, density_id ) override final; void eval_zmat_gga_vxc( XCDeviceData*, integrator_ks_scheme, density_id ) override final; - void eval_zmat_mgga_vxc( XCDeviceData*, bool ) override final; - void eval_mmat_mgga_vxc( XCDeviceData*, bool ) override final; + void eval_zmat_mgga_vxc( XCDeviceData*, integrator_ks_scheme, bool, density_id ) override final; + void eval_mmat_mgga_vxc( XCDeviceData*, integrator_ks_scheme, bool, density_id ) override final; + + void eval_zmat_lda_fxc( XCDeviceData*, density_id ) override final; + void eval_zmat_gga_fxc( XCDeviceData*, density_id ) override final; + void eval_zmat_mgga_fxc( XCDeviceData*, bool, density_id ) override final; + void eval_mmat_mgga_fxc( XCDeviceData*, bool, density_id ) override final; void eval_kern_exc_vxc_lda( const functional_type&, XCDeviceData* ) override final; void eval_kern_exc_vxc_gga( const functional_type&, XCDeviceData* ) override final; void eval_kern_exc_vxc_mgga( const functional_type&, XCDeviceData* ) override final; + void eval_kern_vxc_fxc_lda( const functional_type&, XCDeviceData* ) override final; + void eval_kern_vxc_fxc_gga( const functional_type&, XCDeviceData* ) override final; + void eval_kern_vxc_fxc_mgga( const functional_type&, XCDeviceData* ) override final; + void inc_exc( XCDeviceData* ) override final; void inc_nel( XCDeviceData* ) override final; - void inc_exc_grad_lda( XCDeviceData* ) override final; - void inc_exc_grad_gga( XCDeviceData* ) override final; + void inc_exc_grad_lda( XCDeviceData*, integrator_ks_scheme, bool ) override final; + void inc_exc_grad_gga( XCDeviceData*, integrator_ks_scheme, bool ) override final; + void inc_exc_grad_mgga( XCDeviceData*, integrator_ks_scheme , bool, bool ) override final; void symmetrize_vxc( XCDeviceData* , density_id) override final; + void symmetrize_fxc( XCDeviceData* , density_id) override final; void symmetrize_exx_k( XCDeviceData* ) override final; //void eval_exx_gmat( XCDeviceData* ) override final; void eval_exx_gmat( XCDeviceData*, const BasisSetMap& ) override final; @@ -46,11 +71,19 @@ struct 
AoSScheme1Base : public detail::LocalDeviceWorkDriverPIMPL { XCDeviceData*, host_task_iterator, host_task_iterator, const ShellPairCollection& ) override final; + void save_xmat( XCDeviceData*, bool do_grad, density_id den ) override final; + // Overridable APIs + template + void eval_xmat_impl(double fac, XCDeviceData*, bool do_grad, density_id ); + template + void inc_potential_impl(XCDeviceData*, density_id, bool do_m); virtual void eval_xmat( double fac, XCDeviceData*, bool , density_id ) override; + virtual void eval_xmat_trial( double fac, XCDeviceData*, bool , density_id ) override; virtual void eval_exx_fmat( XCDeviceData* ) override; virtual void inc_vxc( XCDeviceData*, density_id, bool ) override; + virtual void inc_fxc( XCDeviceData*, density_id, bool ) override; virtual void inc_exx_k( XCDeviceData* ) override; diff --git a/src/xc_integrator/local_work_driver/device/scheme1_data_base.cxx b/src/xc_integrator/local_work_driver/device/scheme1_data_base.cxx index 414218e0..7818a5a8 100644 --- a/src/xc_integrator/local_work_driver/device/scheme1_data_base.cxx +++ b/src/xc_integrator/local_work_driver/device/scheme1_data_base.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -53,7 +57,8 @@ size_t Scheme1DataBase::get_static_mem_requirement() { // Task Map nsp * sizeof(int32_t) + // nprim_pairs nsp * sizeof(shell_pair*) + // shell_pair pointer - nsp * 3 * sizeof(double); // X_AB, Y_AB, Z_AB + nsp * 3 * sizeof(double) + // X_AB, Y_AB, Z_AB + 1024 * 1024; // additional memory for alignment padding return size; } @@ -849,7 +854,7 @@ void Scheme1DataBase::add_extra_to_indirection( } } - if( terms.exx or terms.exc_vxc or terms.exc_grad or terms.den or terms.exx_ek_screening ) { + if( terms.exx or terms.exc_vxc or terms.exc_grad or terms.den or terms.exx_ek_screening or terms.fxc_contraction ) { const size_t total_nshells_bfn = total_nshells_bfn_task_batch * sizeof(size_t); buffer_adaptor shell_list_bfn_mem( collocation_stack.shell_list_device, total_nshells_bfn ); diff --git a/src/xc_integrator/local_work_driver/device/scheme1_data_base.hpp b/src/xc_integrator/local_work_driver/device/scheme1_data_base.hpp index cc8aea6e..870cc6e0 100644 --- a/src/xc_integrator/local_work_driver/device/scheme1_data_base.hpp +++ b/src/xc_integrator/local_work_driver/device/scheme1_data_base.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
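The eval_xmat_impl / inc_potential_impl templates declared above merge the ground-state and trial-density (and VXC versus FXC) code paths into a single implementation that picks the relevant device pointer at compile time. A self-contained sketch of that if constexpr dispatch pattern, with simplified stand-in types rather than GauXC's static_stack, is:

```cpp
// Simplified stand-in for the device stack; the real members (dmat_*_device,
// den_selector, tden_selector, ...) live in GauXC's static_stack.
struct FakeStack {
  double* dmat  = nullptr;  // ground-state density matrix
  double* tdmat = nullptr;  // trial density matrix
};

// Compile-time selection between the two paths, as in eval_xmat_impl<is_trial>.
template <bool is_trial>
double* select_density(FakeStack& stack) {
  if constexpr (is_trial) return stack.tdmat;  // trial path (response / FXC)
  else                    return stack.dmat;   // ground-state path (VXC)
}

// Thin non-template wrappers mirror the eval_xmat / eval_xmat_trial pair.
double* select_density_ground(FakeStack& s) { return select_density<false>(s); }
double* select_density_trial (FakeStack& s) { return select_density<true>(s); }
```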
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/scheme1_magma_base.cxx b/src/xc_integrator/local_work_driver/device/scheme1_magma_base.cxx index 8c3c15cc..095564f4 100644 --- a/src/xc_integrator/local_work_driver/device/scheme1_magma_base.cxx +++ b/src/xc_integrator/local_work_driver/device/scheme1_magma_base.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/scheme1_magma_base.hpp b/src/xc_integrator/local_work_driver/device/scheme1_magma_base.hpp index 99dde319..21242a40 100644 --- a/src/xc_integrator/local_work_driver/device/scheme1_magma_base.hpp +++ b/src/xc_integrator/local_work_driver/device/scheme1_magma_base.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/scheme1_magma_data_base.cxx b/src/xc_integrator/local_work_driver/device/scheme1_magma_data_base.cxx index 95b818be..af48b439 100644 --- a/src/xc_integrator/local_work_driver/device/scheme1_magma_data_base.cxx +++ b/src/xc_integrator/local_work_driver/device/scheme1_magma_data_base.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/factory.cxx b/src/xc_integrator/local_work_driver/factory.cxx index 744b58b4..fd6b86ad 100644 --- a/src/xc_integrator/local_work_driver/factory.cxx +++ b/src/xc_integrator/local_work_driver/factory.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/CMakeLists.txt b/src/xc_integrator/local_work_driver/host/CMakeLists.txt index a049c0c1..aa68ae28 100644 --- a/src/xc_integrator/local_work_driver/host/CMakeLists.txt +++ b/src/xc_integrator/local_work_driver/host/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). 
+# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/xc_integrator/local_work_driver/host/blas.cxx b/src/xc_integrator/local_work_driver/host/blas.cxx index c7d13dfe..d0b74596 100644 --- a/src/xc_integrator/local_work_driver/host/blas.cxx +++ b/src/xc_integrator/local_work_driver/host/blas.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/blas.hpp b/src/xc_integrator/local_work_driver/host/blas.hpp index 8aa8e7d3..54a279b5 100644 --- a/src/xc_integrator/local_work_driver/host/blas.hpp +++ b/src/xc_integrator/local_work_driver/host/blas.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/local_host_work_driver.cxx b/src/xc_integrator/local_work_driver/host/local_host_work_driver.cxx index 04cf7b0a..0fa970ba 100644 --- a/src/xc_integrator/local_work_driver/host/local_host_work_driver.cxx +++ b/src/xc_integrator/local_work_driver/host/local_host_work_driver.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -37,6 +41,16 @@ void LocalHostWorkDriver::partition_weights( XCWeightAlg weight_alg, } +void LocalHostWorkDriver::eval_weight_1st_deriv_contracted( + XCWeightAlg weight_alg, const Molecule& mol, const MolMeta& meta, + const XCTask& task, const double* w_times_f, double* exc_grad_w ) { + + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_weight_1st_deriv_contracted(weight_alg, mol, meta, task, + w_times_f, exc_grad_w); + +} + // Collocation void LocalHostWorkDriver::eval_collocation( size_t npts, size_t nshells, size_t nbe, @@ -272,7 +286,6 @@ void LocalHostWorkDriver::eval_zmat_lda_vxc_uks( size_t npts, size_t nbe, pimpl_->eval_zmat_lda_vxc_uks(npts, nbe, vrho, basis_eval, Zs, ldzs, Zz, ldzz); - } void LocalHostWorkDriver::eval_zmat_lda_vxc_gks( size_t npts, size_t nbe, @@ -400,5 +413,158 @@ void LocalHostWorkDriver::inc_vxc( size_t npts, size_t nbf, size_t nbe, } +// eval_tmat LDA RKS +void LocalHostWorkDriver::eval_tmat_lda_vxc_rks( size_t npts, const double* v2rho2, const double* trho, double* A) { + + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_tmat_lda_vxc_rks(npts, v2rho2, trho, A); + +} + +// eval_tmat GGA RKS +void LocalHostWorkDriver::eval_tmat_gga_vxc_rks( size_t npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2gamma2, + const double* tden_eval, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B ) { + + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_tmat_gga_vxc_rks(npts, vgamma, v2rho2, v2rhogamma, v2gamma2, + tden_eval, tdden_x_eval, tdden_y_eval, tdden_z_eval, dden_x_eval, dden_y_eval, + dden_z_eval, A, B); + +} + +// eval_tmat MGGA RKS +void LocalHostWorkDriver::eval_tmat_mgga_vxc_rks( size_t npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2rholapl, const double* v2rhotau, + const double* v2gamma2, const double* v2gammalapl, const double* v2gammatau, + const double* v2lapl2, const double* v2lapltau, const double* v2tau2, + const double* tden_eval, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, const double* ttau, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B, double* C) { + + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_tmat_mgga_vxc_rks(npts, vgamma, v2rho2, v2rhogamma, v2rholapl, v2rhotau, + v2gamma2, v2gammalapl, v2gammatau, v2lapl2, v2lapltau, v2tau2, + tden_eval, tdden_x_eval, tdden_y_eval, tdden_z_eval, ttau, dden_x_eval, + dden_y_eval, dden_z_eval, A, B, C); + +} + +void LocalHostWorkDriver::eval_tmat_lda_vxc_uks( size_t npts, const double* v2rho2, const double* trho, double* A) { + + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_tmat_lda_vxc_uks(npts, v2rho2, trho, A); + +} +void LocalHostWorkDriver::eval_tmat_gga_vxc_uks( size_t npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2gamma2, + const double* trho, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B ) { + + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_tmat_gga_vxc_uks(npts, vgamma, v2rho2, v2rhogamma, v2gamma2, + trho, tdden_x_eval, tdden_y_eval, tdden_z_eval, dden_x_eval, dden_y_eval, + dden_z_eval, A, B); + +} +void LocalHostWorkDriver::eval_tmat_mgga_vxc_uks( size_t 
npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2rholapl, const double* v2rhotau, + const double* v2gamma2, const double* v2gammalapl, const double* v2gamma_tau, + const double* v2lapl2, const double* v2tau_lapl, const double* v2tau2, + const double* trho, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, const double* ttau, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B, double* C) { + + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_tmat_mgga_vxc_uks(npts, vgamma, v2rho2, v2rhogamma, v2rholapl, v2rhotau, + v2gamma2, v2gammalapl, v2gamma_tau, v2lapl2, v2tau_lapl, v2tau2, + trho, tdden_x_eval, tdden_y_eval, tdden_z_eval, ttau, dden_x_eval, + dden_y_eval, dden_z_eval, A, B, C); + +} + +void LocalHostWorkDriver::eval_zmat_lda_vxc_uks_ts( size_t npts, size_t nbe, + const double* vrho, const double* basis_eval, double* Za, size_t ldza, + double* Zb, size_t ldzb ) { + + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_zmat_lda_vxc_uks_ts(npts, nbe, vrho, basis_eval, Za, ldza, + Zb, ldzb); + +} + +void LocalHostWorkDriver::eval_Bvec_gga_vxc_rks_ts( size_t npts, const double* vgamma, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* B ) { + + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_Bvec_gga_vxc_rks_ts(npts, vgamma, dden_x_eval, dden_y_eval, + dden_z_eval, B); +} + +void LocalHostWorkDriver::eval_Bvec_gga_vxc_uks_ts( size_t npts, const double* vgamma, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* B ) { + + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_Bvec_gga_vxc_uks_ts(npts, vgamma, dden_x_eval, dden_y_eval, + dden_z_eval, B); +} +void LocalHostWorkDriver::eval_zmat_gga_vxc_rks_ts( size_t npts, size_t nbf, const double* A, const double* B, const double* basis_eval, + const double* dbasis_x_eval, const double* dbasis_y_eval, const double* dbasis_z_eval, + double* Z, size_t ldz ){ + + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_zmat_gga_vxc_rks_ts(npts, nbf, A, B, basis_eval, dbasis_x_eval, + dbasis_y_eval, dbasis_z_eval, Z, ldz); +} + +void LocalHostWorkDriver::eval_zmat_gga_vxc_uks_ts( size_t npts, size_t nbf, const double* A, const double* B, const double* basis_eval, + const double* dbasis_x_eval, const double* dbasis_y_eval, const double* dbasis_z_eval, + double* Za, size_t ldza, double* Zb, size_t ldzb ){ + + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_zmat_gga_vxc_uks_ts(npts, nbf, A, B, basis_eval, dbasis_x_eval, + dbasis_y_eval, dbasis_z_eval, Za, ldza, Zb, ldzb); +} + +void LocalHostWorkDriver::eval_zmat_mgga_vxc_uks_ts( size_t npts, size_t nbe, + const double* vrho, const double* vgamma, const double* vlapl, + const double* basis_eval, + const double* dbasis_x_eval, const double* dbasis_y_eval, const double* dbasis_z_eval, + const double* lbasis_eval, const double* dden_x_eval, + const double* dden_y_eval, const double* dden_z_eval, double* Za, size_t ldza, + double* Zb, size_t ldzb) { + + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_zmat_mgga_vxc_uks_ts(npts, nbe, vrho, vgamma, vlapl, basis_eval, dbasis_x_eval, + dbasis_y_eval, dbasis_z_eval, lbasis_eval, dden_x_eval, dden_y_eval, dden_z_eval, + Za, ldza, Zb, ldzb); +} + +void LocalHostWorkDriver::eval_zmat_gga_vxc_uks_ts( size_t npts, size_t nbe, + const double* vrho, const double* vgamma, const double* basis_eval, + const double* dbasis_x_eval, const double* dbasis_y_eval, + const double* 
dbasis_z_eval, const double* dden_x_eval, + const double* dden_y_eval, const double* dden_z_eval, double* Za, size_t ldza, + double* Zb, size_t ldzb ) { + + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_zmat_gga_vxc_uks_ts(npts, nbe, vrho, vgamma, basis_eval, dbasis_x_eval, + dbasis_y_eval, dbasis_z_eval, dden_x_eval, dden_y_eval, dden_z_eval, + Za, ldza, Zb, ldzb); + +} +void LocalHostWorkDriver::eval_mmat_mgga_vxc_uks_ts( size_t npts, size_t nbe, + const double* vtau, const double* vlapl, + const double* dbasis_x_eval, const double* dbasis_y_eval, + const double* dbasis_z_eval, double* mmat_xs, double* mmat_ys, double* mmat_zs, size_t ldms, + double* mmat_xz, double* mmat_yz, double* mmat_zz, size_t ldmz ) { + + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_mmat_mgga_vxc_uks_ts(npts, nbe, vtau, vlapl, dbasis_x_eval, + dbasis_y_eval, dbasis_z_eval, mmat_xs, mmat_ys, mmat_zs, ldms, mmat_xz, mmat_yz, + mmat_zz, ldmz ); + +} + + } diff --git a/src/xc_integrator/local_work_driver/host/local_host_work_driver.hpp b/src/xc_integrator/local_work_driver/host/local_host_work_driver.hpp index 7b6b73af..41cf430e 100644 --- a/src/xc_integrator/local_work_driver/host/local_host_work_driver.hpp +++ b/src/xc_integrator/local_work_driver/host/local_host_work_driver.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -71,6 +75,20 @@ class LocalHostWorkDriver : public LocalWorkDriver { void partition_weights( XCWeightAlg weight_alg, const Molecule& mol, const MolMeta& meta, task_iterator task_begin, task_iterator task_end ); + /** Evaluate the weight first derivative contracted with a function + * + * @param[in] weight_alg Molecular partitioning scheme + * @param[in] mol Molecule being partitioned + * @param[in] molmeta Metadata associated with mol + * @param[in] task Task Data + * @param[in] w_times_f Weight times function evaluation + * + * @param[in/out] exc_grad_w Weight first derivative times function evaluation (added to this array) + * Assumed to have length 3*natoms. 
Example: exc_grad + */ + void eval_weight_1st_deriv_contracted( XCWeightAlg weight_alg, const Molecule& mol, + const MolMeta& meta, const XCTask& task, const double* w_times_f, double* exc_grad_w ); + /** Evaluation the collocation matrix * @@ -333,7 +351,7 @@ class LocalHostWorkDriver : public LocalWorkDriver { double* den_eval, double* dden_x_eval, double* dden_y_eval, double* dden_z_eval, double* gamma, double* tau, double* lapl); - /** Evaluate the VXC Z Matrix for RKS LDA + /** Evaluate the VXC Z Matrix for RKS LDA * * Z(mu,i) = 0.5 * vrho(i) * B(mu, i) * @@ -469,6 +487,113 @@ class LocalHostWorkDriver : public LocalWorkDriver { const submat_map_t& submat_map, const double* Z, size_t ldz, double* VXC, size_t ldvxc, double* scr ); + /** Evaluate the intermediate vector variables tmat for Fxc contraction of LDA + * + * See Jiashu's notes for details + * + * @param[in] npts The number of points to evaluate the U/V variables + * @param[in] v2rho2 the second derivative of the XC functional wrt rho + * @param[in] trho The trial density calculated from the trial density matrix + * @param[out] A intermediate output to form zmat (npts, 1) for RKS, (npts, 2) for UKS + * + */ + void eval_tmat_lda_vxc_rks( size_t npts, const double* v2rho2, const double* trho, double* A); + void eval_tmat_lda_vxc_uks( size_t npts, const double* v2rho2, const double* trho, double* A); + + /** + * Evaluate the intermediate vector variables tmat for Fxc contraction of GGA + * + * See Jiashu's notes for details + * + * @param[in] npts The number of points to evaluate the U/V variables + * @param[in] vgamma the derivative of the XC functional wrt gamma + * @param[in] v2rho2 the second derivative of the XC functional wrt rho twice + * @param[in] v2rhogamma the second derivative of the XC functional wrt rho and gamma + * @param[in] v2gamma2 the second derivative of the XC functional wrt gamma twice + * @param[in] tden_eval The trial density calculated from the trial density matrix + * @param[in] tdden_x_eval the gradient of the trial density calculated from the trial density matrix, similar for y and z + * @param[in] dden_x_eval the gradient of the density (npts) calculated from the density matrix, similar for y and z + * @param[out] A intermediate output to form zmat (npts, 1) for RKS, (npts, 2) for UKS + * @param[out] B intermediate output to form zmat (npts, 3) for RKS, (npts, 6) for UKS + */ + void eval_tmat_gga_vxc_rks( size_t npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2gamma2, + const double* tden_eval, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B ); + void eval_tmat_gga_vxc_uks( size_t npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2gamma2, + const double* trho, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B ); + + /** + * Evaluate the intermediate vector variables tmat for Fxc contraction of MGGA + * + * See Jiashu's notes for details + * + * @param[in] npts The number of points to evaluate the U/V variables + * @param[in] vgamma the derivative of the XC functional wrt gamma + * @param[in] v2rho2 the second derivative of the XC functional wrt rho twice + * @param[in] v2rhogamma the second derivative of the XC functional wrt rho and gamma + 
* @param[in] v2rholapl the second derivative of the XC functional wrt rho and laplacian + * @param[in] v2rhotau the second derivative of the XC functional wrt rho and tau + * @param[in] v2gamma2 the second derivative of the XC functional wrt gamma twice + * @param[in] v2gammalapl the second derivative of the XC functional wrt gamma and laplacian + * @param[in] v2gammatau the second derivative of the XC functional wrt gamma and tau + * @param[in] v2lapl2 the second derivative of the XC functional wrt laplacian twice + * @param[in] v2lapltau the second derivative of the XC functional wrt laplacian and tau + * @param[in] v2tau2 the second derivative of the XC functional wrt tau twice + * @param[in] tden_eval The trial density calculated from the trial density matrix + * @param[in] tdden_x_eval the gradient of the trial density calculated from the trial density matrix, similar for y and z + * @param[in] dden_x_eval the gradient of the density (npts) calculated from the density matrix, similar for y and z + * @param[in] ttau the kinetic energy density calculated from the trial density matrix + * @param[out] A intermediate output to form zmat (npts, 1) for RKS, (npts, 2) for UKS + * @param[out] B intermediate output to form zmat (npts, 3) for RKS, (npts, 6) for UKS + * @param[out] C intermediate output to form mmat (npts, 1) for RKS, (npts, 2) for UKS + */ + void eval_tmat_mgga_vxc_rks( size_t npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2rholapl, const double* v2rhotau, + const double* v2gamma2, const double* v2gammalapl, const double* v2gammatau, + const double* v2lapl2, const double* v2lapltau, const double* v2tau2, + const double* tden_eval, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, const double* ttau, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B, double* C); + void eval_tmat_mgga_vxc_uks( size_t npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2rholapl, const double* v2rhotau, + const double* v2gamma2, const double* v2gammalapl, const double* v2gamma_tau, + const double* v2lapl2, const double* v2tau_lapl, const double* v2tau2, + const double* trho, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, const double* ttau, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B, double* C); + + + void eval_zmat_lda_vxc_uks_ts( size_t npts, size_t nbe, const double* vrho, + const double* basis_eval, double* Za, size_t ldza, double* Zb, + size_t ldzb ); + void eval_Bvec_gga_vxc_uks_ts( size_t npts, const double* vgamma, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* B ); + void eval_zmat_gga_vxc_uks_ts( size_t npts, size_t nbf, const double* A, const double* B, const double* basis_eval, + const double* dbasis_x_eval, const double* dbasis_y_eval, const double* dbasis_z_eval, + double* Za, size_t ldza, double* Zb, size_t ldzb ); + void eval_Bvec_gga_vxc_rks_ts( size_t npts, const double* vgamma, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* B ); + void eval_zmat_gga_vxc_rks_ts( size_t npts, size_t nbf, const double* A, const double* B, const double* basis_eval, + const double* dbasis_x_eval, const double* dbasis_y_eval, const double* dbasis_z_eval, + double* Z, size_t ldz ); + + void eval_zmat_gga_vxc_uks_ts( size_t npts, size_t nbe, const 
double* vrho, + const double* vgamma, const double* basis_eval, const double* dbasis_x_eval, + const double* dbasis_y_eval, const double* dbasis_z_eval, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, + double* Za, size_t ldza, double* Zb, size_t ldzb ); + void eval_zmat_mgga_vxc_uks_ts( size_t npts, size_t nbe, const double* vrho, + const double* vgamma, const double* vlapl, const double* basis_eval, + const double* dbasis_x_eval, const double* dbasis_y_eval, const double* dbasis_z_eval, + const double* lbasis_eval, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, + double* Za, size_t ldza, double* Zb, size_t ldzb ); + void eval_mmat_mgga_vxc_uks_ts( size_t npts, size_t nbe, const double* vtau, + const double* vlapl, const double* dbasis_x_eval, const double* dbasis_y_eval, + const double* dbasis_z_eval, double* mmat_xs, double* mmat_ys, double* mmat_zs, + size_t ldms, double* mmat_xz, double* mmat_yz, double* mmat_zz, size_t ldmz); + private: pimpl_type pimpl_; ///< Implementation diff --git a/src/xc_integrator/local_work_driver/host/local_host_work_driver_pimpl.cxx b/src/xc_integrator/local_work_driver/host/local_host_work_driver_pimpl.cxx index c7ffc94f..aac879ea 100644 --- a/src/xc_integrator/local_work_driver/host/local_host_work_driver_pimpl.cxx +++ b/src/xc_integrator/local_work_driver/host/local_host_work_driver_pimpl.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/local_host_work_driver_pimpl.hpp b/src/xc_integrator/local_work_driver/host/local_host_work_driver_pimpl.hpp index cdcac019..c5e4182d 100644 --- a/src/xc_integrator/local_work_driver/host/local_host_work_driver_pimpl.hpp +++ b/src/xc_integrator/local_work_driver/host/local_host_work_driver_pimpl.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
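Per the eval_tmat_lda_vxc_rks documentation above, the LDA piece of the FXC contraction builds a pointwise intermediate A from the kernel second derivative and the trial density, with the quadrature weights folded into the intermediates rather than into the raw kernel output (as the comments in scheme1_base.cxx note). The following is a hedged, host-side reading of that intent only, using a hypothetical free function rather than the GauXC API:

```cpp
#include <cstddef>

// Illustrative only: one plausible form of the RKS LDA FXC intermediate,
// A(i) = w(i) * v2rho2(i) * trho(i), with the grid weight applied here because
// the second-derivative kernel output itself is left unscaled.
void form_lda_fxc_intermediate(std::size_t npts, const double* weights,
                               const double* v2rho2, const double* trho,
                               double* A) {
  for (std::size_t i = 0; i < npts; ++i)
    A[i] = weights[i] * v2rho2[i] * trho[i];
}
```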
* * See LICENSE.txt for details */ @@ -29,6 +33,9 @@ struct LocalHostWorkDriverPIMPL { virtual void partition_weights( XCWeightAlg weight_alg, const Molecule& mol, const MolMeta& meta, task_iterator task_begin, task_iterator task_end ) = 0; + + virtual void eval_weight_1st_deriv_contracted( XCWeightAlg weight_alg, const Molecule& mol, + const MolMeta& meta, const XCTask& task, const double* w_times_f, double* exc_grad_w ) = 0; virtual void eval_collocation( size_t npts, size_t nshells, size_t nbe, const double* pts, const BasisSet& basis, const int32_t* shell_list, @@ -170,6 +177,62 @@ struct LocalHostWorkDriverPIMPL { const double* basis_eval, const submat_map_t& submat_map, const double* Z, size_t ldz, double* VXC, size_t ldvxc, double* scr ) = 0; + virtual void eval_tmat_lda_vxc_rks( size_t npts, const double* v2rho2, const double* tden_eval, double* A) = 0; + virtual void eval_tmat_lda_vxc_uks( size_t npts, const double* v2rho2, const double* trho, double* A) = 0; + + virtual void eval_tmat_gga_vxc_rks( size_t npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2gamma2, + const double* tden_eval, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B ) = 0; + virtual void eval_tmat_gga_vxc_uks( size_t npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2gamma2, + const double* trho, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B ) = 0; + + virtual void eval_tmat_mgga_vxc_rks( size_t npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2rholapl, const double* v2rhotau, + const double* v2gamma2, const double* v2gammalapl, const double* v2gammatau, + const double* v2lapl2, const double* v2lapltau, const double* v2tau2, + const double* tden_eval, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, const double* ttau, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B, double* C) = 0; + virtual void eval_tmat_mgga_vxc_uks( size_t npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2rholapl, const double* v2rhotau, + const double* v2gamma2, const double* v2gammalapl, const double* v2gamma_tau, + const double* v2lapl2, const double* v2tau_lapl, const double* v2tau2, + const double* trho, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, const double* ttau, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B, double* C) = 0; + + virtual void eval_zmat_lda_vxc_uks_ts( size_t npts, size_t nbe, const double* vrho, + const double* basis_eval, double* Za, size_t ldza, double* Zb, size_t ldzb ) = 0; + + virtual void eval_Bvec_gga_vxc_uks_ts( size_t npts, const double* vgamma, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* B ) = 0; + virtual void eval_zmat_gga_vxc_uks_ts( size_t npts, size_t nbf, const double* A, const double* B, const double* basis_eval, + const double* dbasis_x_eval, const double* dbasis_y_eval, const double* dbasis_z_eval, + double* Za, size_t ldza, double* Zb, size_t ldzb ) = 0; + virtual void eval_zmat_gga_vxc_uks_ts( size_t npts, size_t 
nbe, const double* vrho, + const double* vgamma, const double* basis_eval, const double* dbasis_x_eval, + const double* dbasis_y_eval, const double* dbasis_z_eval, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, + double* Za, size_t ldza, double* Zb, size_t ldzb ) = 0; + + virtual void eval_Bvec_gga_vxc_rks_ts( size_t npts, const double* vgamma, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* B ) = 0; + virtual void eval_zmat_gga_vxc_rks_ts( size_t npts, size_t nbf, const double* A, const double* B, const double* basis_eval, + const double* dbasis_x_eval, const double* dbasis_y_eval, const double* dbasis_z_eval, + double* Z, size_t ldz ) = 0; + + virtual void eval_zmat_mgga_vxc_uks_ts( size_t npts, size_t nbe, const double* vrho, + const double* vgamma, const double* vlapl, const double* basis_eval, + const double* dbasis_x_eval, const double* dbasis_y_eval, const double* dbasis_z_eval, + const double* lbasis_eval, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, + double* Za, size_t ldza, double* Zb, size_t ldzb ) = 0; + virtual void eval_mmat_mgga_vxc_uks_ts( size_t npts, size_t nbe, const double* vtau, + const double* vlapl, const double* dbasis_x_eval, const double* dbasis_y_eval, +const double* dbasis_z_eval, double* mmat_xs, double* mmat_ys, double* mmat_zs, + size_t ldms, double* mmat_xz, double* mmat_yz, double* mmat_zz, size_t ldmz ) = 0; + }; diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/CMakeLists.txt b/src/xc_integrator/local_work_driver/host/obara_saika/CMakeLists.txt index b103b4d4..ffa52ffe 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/CMakeLists.txt +++ b/src/xc_integrator/local_work_driver/host/obara_saika/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/include/cpu/chebyshev_boys_computation.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/include/cpu/chebyshev_boys_computation.hpp index 718505b8..bcafebe2 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/include/cpu/chebyshev_boys_computation.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/include/cpu/chebyshev_boys_computation.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
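The eval_zmat_lda_vxc_uks_ts declarations above produce separate alpha and beta Z matrices from a single vrho array. As an illustration only, assuming vrho holds interleaved (alpha, beta) values per grid point and following the documented RKS relation Z(mu,i) = 0.5 * vrho(i) * B(mu,i), a host-side sketch could look like this; the real routine's buffer layout and leading dimensions may differ:

```cpp
#include <cstddef>

// Illustrative sketch (not the GauXC kernel): Z_sigma(mu,i) = 0.5 * vrho_sigma(i) * B(mu,i),
// assuming vrho is interleaved (alpha, beta) per point and the collocation matrix
// basis_eval is stored column-major with leading dimension nbe.
void zmat_lda_uks_sketch(std::size_t npts, std::size_t nbe, const double* vrho,
                         const double* basis_eval, double* Za, std::size_t ldza,
                         double* Zb, std::size_t ldzb) {
  for (std::size_t i = 0; i < npts; ++i) {
    const double fa = 0.5 * vrho[2 * i + 0];
    const double fb = 0.5 * vrho[2 * i + 1];
    for (std::size_t mu = 0; mu < nbe; ++mu) {
      Za[mu + i * ldza] = fa * basis_eval[mu + i * nbe];
      Zb[mu + i * ldzb] = fb * basis_eval[mu + i * nbe];
    }
  }
}
```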
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/include/cpu/integral_data_types.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/include/cpu/integral_data_types.hpp index dca69c4a..bbb5c455 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/include/cpu/integral_data_types.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/include/cpu/integral_data_types.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/include/cpu/obara_saika_integrals.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/include/cpu/obara_saika_integrals.hpp index 561a331f..d056b810 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/include/cpu/obara_saika_integrals.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/include/cpu/obara_saika_integrals.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/chebyshev_boys_computation.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/chebyshev_boys_computation.cxx index 7a79fc85..02b4f767 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/chebyshev_boys_computation.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/chebyshev_boys_computation.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/config_obara_saika.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/config_obara_saika.hpp index 42be0db0..8b7cee2a 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/config_obara_saika.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/config_obara_saika.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0.cxx index 31d80a0d..c64d2d54 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0.hpp index 1cdd66b6..5db799bc 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0_0.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0_0.cxx index ab3e29a6..6971c1a7 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0_0.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0_0.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0_0.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0_0.hpp index 1b1c41a2..95f3db8e 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0_0.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0_0.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1.cxx index 13038d0e..3638d86a 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1.hpp index 8419d9cd..26508811 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_0.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_0.cxx index 4d7ab1e7..d0e65541 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_0.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_0.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_0.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_0.hpp index b8906bbd..5e6df7ce 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_0.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_0.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_1.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_1.cxx index ead10e27..ee58d18f 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_1.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_1.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_1.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_1.hpp index 5bc89eb4..3f06119b 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_1.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_1.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2.cxx index 5cc25f38..035be5be 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2.hpp index 2e3c9a6a..187e1666 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_0.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_0.cxx index c99a4e9a..0343e667 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_0.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_0.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_0.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_0.hpp index 19af53f6..a641b325 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_0.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_0.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_1.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_1.cxx index 0013a3a0..6904c15d 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_1.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_1.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_1.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_1.hpp index a2f23924..6d7beb15 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_1.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_1.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_2.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_2.cxx index 0fb99e41..dbd9f500 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_2.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_2.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_2.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_2.hpp index 8a1ba8bb..faf5b123 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_2.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_2.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3.cxx index 8d946235..c3faf7f4 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3.hpp index d2e51280..3e8cb075 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_0.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_0.cxx index cbfcb0fb..44c3542e 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_0.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_0.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_0.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_0.hpp index 5872e178..7211ec7a 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_0.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_0.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_1.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_1.cxx index 904fc5d0..197e948a 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_1.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_1.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_1.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_1.hpp index e1d74748..106a4f14 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_1.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_1.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_2.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_2.cxx index 0092aef5..7c4a2ec6 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_2.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_2.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_2.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_2.hpp index 3c86c167..a69ba836 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_2.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_2.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_3.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_3.cxx index 1e49b804..251de89d 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_3.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_3.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_3.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_3.hpp index 459f2359..0fc00c9e 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_3.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_3.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4.cxx index 714ef6d1..67a9cace 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4.hpp index af1c47ec..abb9a2b5 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_0.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_0.cxx index 6c6a4ebc..1b2f57f1 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_0.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_0.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_0.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_0.hpp index 4c12ac6b..590062ba 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_0.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_0.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_1.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_1.cxx index 6efdc4f0..6fefd787 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_1.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_1.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_1.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_1.hpp index 9a320cd0..4f4e71d8 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_1.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_1.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_2.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_2.cxx index 25b02b63..0a88c5dd 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_2.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_2.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_2.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_2.hpp index fd642fe5..2cc57370 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_2.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_2.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_3.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_3.cxx index 8aa7efbf..e318e860 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_3.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_3.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_3.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_3.hpp index 2bc46333..e750cc98 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_3.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_3.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_4.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_4.cxx index 01da2812..5aca482a 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_4.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_4.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_4.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_4.hpp index 79bc467b..2583fc79 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_4.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_4.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/obara_saika_integrals.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/obara_saika_integrals.cxx index 305357e3..5fa3c657 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/obara_saika_integrals.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/obara_saika_integrals.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test.cxx index 46154e34..19921718 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test1.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test1.cxx index ea1671aa..bbb30ddb 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test1.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test1.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test2.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test2.cxx index 0ac029a2..ebb02db6 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test2.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test2.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test3.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test3.cxx index 826311ce..9d588ba4 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test3.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test3.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys.cxx index 3b20d5c1..42941ef5 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v0.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v0.cxx index e6592a94..455f715b 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v0.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v0.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v1.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v1.cxx index 0b206e6a..39c51766 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v1.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v1.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v2.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v2.cxx index f728c1d5..b992fcc0 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v2.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v2.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v3.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v3.cxx index 4af600b0..325b8b3f 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v3.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v3.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v4.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v4.cxx index 3ee4380b..5cf97532 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v4.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v4.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/test/test_experimental.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/test/test_experimental.cxx index 6ac1f555..e2378e6c 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/test/test_experimental.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/test/test_experimental.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/reference/collocation.hpp b/src/xc_integrator/local_work_driver/host/reference/collocation.hpp index 0db9ed9c..bab6a076 100644 --- a/src/xc_integrator/local_work_driver/host/reference/collocation.hpp +++ b/src/xc_integrator/local_work_driver/host/reference/collocation.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/reference/gau2grid_collocation.cxx b/src/xc_integrator/local_work_driver/host/reference/gau2grid_collocation.cxx index 4f1583d0..98f53d35 100644 --- a/src/xc_integrator/local_work_driver/host/reference/gau2grid_collocation.cxx +++ b/src/xc_integrator/local_work_driver/host/reference/gau2grid_collocation.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/reference/weights.cxx b/src/xc_integrator/local_work_driver/host/reference/weights.cxx index f9f0eb75..145bfd1c 100644 --- a/src/xc_integrator/local_work_driver/host/reference/weights.cxx +++ b/src/xc_integrator/local_work_driver/host/reference/weights.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -108,6 +112,8 @@ void reference_becke_weights_host( } + + void reference_ssf_weights_host( const Molecule& mol, const MolMeta& meta, @@ -172,6 +178,7 @@ void reference_ssf_weights_host( // Evaluate unnormalized partition functions std::fill(partitionScratch.begin(),partitionScratch.end(),1.); +#if 1 for( size_t iA = 0; iA < natoms; iA++ ) for( size_t jA = 0; jA < iA; jA++ ) if( partitionScratch[iA] > integrator::ssf_weight_tol or @@ -196,6 +203,24 @@ void reference_ssf_weights_host( } } +#else + for(size_t iA = 0; iA < natoms; ++iA) + for(size_t jA = 0; jA < natoms; ++jA) + if(iA != jA and partitionScratch[iA] > integrator::ssf_weight_tol) { + const double mu = (atomDist[iA] - atomDist[jA]) / RAB[jA + iA*natoms]; + if( fabs(mu) < integrator::magic_ssf_factor<> ) { + double g = 0.5 * (1. 
- gFrisch(mu)); + partitionScratch[iA] *= g; + } else if(mu >= integrator::magic_ssf_factor<>) { + partitionScratch[iA] = 0.0; + } + } + + if(partitionScratch[task.iParent] < std::numeric_limits::epsilon()) { + weight = 0; + continue; + } +#endif // Normalization double sum = 0.; @@ -360,4 +385,604 @@ void reference_lko_weights_host( } + +/** + * 1st derivative which expects weight_deri to be preallocated as (ngrid*natoms*3) + */ +void reference_becke_weights_1st_derivative_host( + const Molecule& mol, + const MolMeta& meta, + const XCTask& task, + double* weight_deri +){ + + // Becke partition functions + auto hBecke = [](double x) {return 1.5 * x - 0.5 * x * x * x;}; // Eq. 19 + auto gBecke = [&](double x) {return hBecke(hBecke(hBecke(x)));}; // Eq. 20 f_3 + auto tBecke = [&](double x) { + // for numerical stability (see Jiashu's notes for details) + if (x > 1.0 - 1e-4) + return 0.0; + const double p1 = hBecke(x); + const double p2 = hBecke(p1); + return - 27.0 * (1. + p2) * (1. + p1) * (1. + x) / (1. - x) / (2. + p2) / (2. + p1) / (2. + x); + }; + + const size_t natoms = mol.natoms(); + const auto& RAB = meta.rab(); + std::vector partitionScratch( natoms ); + std::vector atomDist( natoms ); + + for( size_t i = 0; i < task.points.size(); ++i ) { + + auto * weight_deri_ith = weight_deri + 3*natoms*i; + const size_t iParent = task.iParent; + + //zerofy the derivative + std::fill(weight_deri_ith, weight_deri_ith + 3*natoms, 0.); + const auto& point = task.points[i]; + const auto& weight = task.weights[i]; + + // Compute distances of each center to point + for(size_t iA = 0; iA < natoms; iA++) { + + const double da_x = point[0] - mol[iA].x; + const double da_y = point[1] - mol[iA].y; + const double da_z = point[2] - mol[iA].z; + + atomDist[iA] = std::sqrt(da_x*da_x + da_y*da_y + da_z*da_z); + + } + + // Evaluate unnormalized partition functions + std::fill(partitionScratch.begin(),partitionScratch.end(),1.); + for( size_t iA = 0; iA < natoms; iA++ ) + for( size_t jA = 0; jA < iA; jA++ ){ + + double mu = (atomDist[iA] - atomDist[jA]) / RAB[jA + iA*natoms]; + const double g = gBecke(mu); + + partitionScratch[iA] *= 0.5 * (1. - g); + partitionScratch[jA] *= 0.5 * (1. 
+ g); + } + + double sum = 0.; + for( size_t iA = 0; iA < natoms; iA++ ) sum += partitionScratch[iA]; + + // calculate derivative now + auto * weight_deri_iParent = weight_deri_ith + 3*iParent; + for( size_t iB = 0; iB < natoms; iB++ ) { + if (iB == iParent) continue; + auto * weight_deri_iB = weight_deri_ith + 3*iB; + + const double uB_x = mol[iB].x - point[0]; + const double uB_y = mol[iB].y - point[1]; + const double uB_z = mol[iB].z - point[2]; + + const double uBA_x =mol[iB].x - mol[iParent].x; + const double uBA_y =mol[iB].y - mol[iParent].y; + const double uBA_z =mol[iB].z - mol[iParent].z; + const double rAB = RAB[iB + iParent*natoms]; + + double mu_AB = (atomDist[iParent] - atomDist[iB]) / rAB; + + // first term is - coef1 * nabla_B mu_BA + double coef1 = tBecke(mu_AB); + weight_deri_iB[0] -= coef1 / rAB * (uB_x / atomDist[iB] + mu_AB * uBA_x /rAB); + weight_deri_iB[1] -= coef1 / rAB * (uB_y / atomDist[iB] + mu_AB * uBA_y /rAB); + weight_deri_iB[2] -= coef1 / rAB * (uB_z / atomDist[iB] + mu_AB * uBA_z /rAB); + + double term_x = 0.0, term_y = 0.0, term_z = 0.0; + // second term is 1/Z * \sum_{C != B} (P(B)t_BC - P(C)t_CB) nabla_B mu_BC + for( size_t iC = 0; iC < natoms; iC++ ){ + if (iB == iC) continue; + + // coef = (P(B)t_BC - P(C)t_CB) + double mu_BC = (atomDist[iB] - atomDist[iC]) / RAB[iC + iB*natoms]; + double t_BC = tBecke(mu_BC); + double t_CB = tBecke(-mu_BC); + double coef = partitionScratch[iB] *t_BC - partitionScratch[iC] * t_CB; + + const double rBC = RAB[iC + iB*natoms]; + + term_x += coef * ((mol[iB].x - point[0]) / atomDist[iB] / rBC - mu_BC * (mol[iB].x - mol[iC].x) / rBC / rBC); + term_y += coef * ((mol[iB].y - point[1]) / atomDist[iB] / rBC - mu_BC * (mol[iB].y - mol[iC].y) / rBC / rBC); + term_z += coef * ((mol[iB].z - point[2]) / atomDist[iB] / rBC - mu_BC * (mol[iB].z - mol[iC].z) / rBC / rBC); + } + + weight_deri_iB[0] -= term_x / sum; + weight_deri_iB[1] -= term_y / sum; + weight_deri_iB[2] -= term_z / sum; + + // Use translational invariance to calculate the derivative for the parent atom + weight_deri_iParent[0] -= weight_deri_iB[0]; + weight_deri_iParent[1] -= weight_deri_iB[1]; + weight_deri_iParent[2] -= weight_deri_iB[2]; + + } + + // Finally, scale the derivatives by the weight + for( size_t iB = 0; iB < natoms; iB++ ) + for (size_t coord = 0; coord < 3; ++coord) + weight_deri_ith[3*iB + coord] *= weight; + + } +} + +void reference_ssf_weights_1st_derivative_host( + const Molecule& mol, + const MolMeta& meta, + const XCTask& task, + double* weight_deri +){ + + const auto safe_magic_ssf_bound = integrator::magic_ssf_factor<> - 1e-4; + + auto gFrisch = [&](double x) { + const double s_x = x / integrator::magic_ssf_factor<>; + const double s_x2 = s_x * s_x; + const double s_x3 = s_x * s_x2; + const double s_x5 = s_x3 * s_x2; + const double s_x7 = s_x5 * s_x2; + + return (35.*(s_x - s_x3) + 21.*s_x5 - 5.*s_x7) / 16.; + }; + auto tFrisch = [&](double x) { + const double s_x = x / integrator::magic_ssf_factor<>; + const double s_x2 = s_x * s_x; + const double s_x3 = s_x * s_x2; + const double numerator = 35. * (s_x3 + 3. * s_x2 + 3. 
* s_x + 1.); + const double denominator = (x - integrator::magic_ssf_factor<>) * (5.*s_x3 + 20.*s_x2 + 29.*s_x + 16.); + return numerator / denominator ; + }; + + const size_t natoms = mol.natoms(); + const auto& RAB = meta.rab(); + std::vector partitionScratch( natoms ); + std::vector atomDist( natoms ); + + for( size_t i = 0; i < task.points.size(); ++i ) { + + auto * weight_deri_ith = weight_deri + 3*natoms*i; + + //zerofy the derivative + std::fill(weight_deri_ith, weight_deri_ith + 3*natoms, 0.); + const auto& weight = task.weights[i]; + + if (std::abs(weight) < 1.e-12) continue; // weight derivative = 0 when p_A = 0 + const size_t iParent = task.iParent; + + const auto& point = task.points[i]; + + const auto dist_cutoff = 0.5 * (1-integrator::magic_ssf_factor<>) * task.dist_nearest; + + // Compute dist to parent atom + { + const double da_x = point[0] - mol[iParent].x; + const double da_y = point[1] - mol[iParent].y; + const double da_z = point[2] - mol[iParent].z; + + atomDist[iParent] = std::sqrt(da_x*da_x + da_y*da_y + da_z*da_z); + } + + if( atomDist[iParent] < dist_cutoff ) continue; // weight derivative = 0 when p_A = 1 + + // Compute distances of each center to point + for(size_t iA = 0; iA < natoms; iA++) { + + if( iA == (size_t)iParent ) continue; + + const double da_x = point[0] - mol[iA].x; + const double da_y = point[1] - mol[iA].y; + const double da_z = point[2] - mol[iA].z; + + atomDist[iA] = std::sqrt(da_x*da_x + da_y*da_y + da_z*da_z); + + } + + // Evaluate unnormalized partition functions + std::fill(partitionScratch.begin(),partitionScratch.end(),1.); + + for( size_t iA = 0; iA < natoms; iA++ ) + for( size_t jA = 0; jA < iA; jA++ ) + if( partitionScratch[iA] > integrator::ssf_weight_tol or + partitionScratch[jA] > integrator::ssf_weight_tol ) { + + const double mu = (atomDist[iA] - atomDist[jA]) / RAB[jA + iA*natoms]; + + if( mu <= -integrator::magic_ssf_factor<> ) { + + partitionScratch[jA] = 0.; + + } else if (mu >= integrator::magic_ssf_factor<>) { + + partitionScratch[iA] = 0.; + + } else { + + double g = 0.5 * ( 1. - gFrisch(mu) ); + partitionScratch[iA] *= g; + partitionScratch[jA] *= 1. 
- g; + + } + + } + + // Normalization + double sum = 0.; + for( size_t iA = 0; iA < natoms; iA++ ) sum += partitionScratch[iA]; + + // calculate derivative now + auto * weight_deri_iParent = weight_deri_ith + 3*iParent; + for( size_t iB = 0; iB < natoms; iB++ ) { + if (iB == iParent) continue; + auto * weight_deri_iB = weight_deri_ith + 3*iB; + + const double rAB = RAB[iB + iParent*natoms]; + double mu_AB = (atomDist[iParent] - atomDist[iB]) / rAB; + if(mu_AB > - integrator::magic_ssf_factor<> && mu_AB < safe_magic_ssf_bound){ + const double uB_x = mol[iB].x - point[0]; + const double uB_y = mol[iB].y - point[1]; + const double uB_z = mol[iB].z - point[2]; + + const double uBA_x =mol[iB].x - mol[iParent].x; + const double uBA_y =mol[iB].y - mol[iParent].y; + const double uBA_z =mol[iB].z - mol[iParent].z; + + // first term is - coef1 * nabla_B mu_BA + double coef1 = tFrisch(mu_AB) * (sum - partitionScratch[iParent])/sum; + weight_deri_iB[0] -= coef1 / rAB * (uB_x / atomDist[iB] + mu_AB * uBA_x /rAB); + weight_deri_iB[1] -= coef1 / rAB * (uB_y / atomDist[iB] + mu_AB * uBA_y /rAB); + weight_deri_iB[2] -= coef1 / rAB * (uB_z / atomDist[iB] + mu_AB * uBA_z /rAB); + } + + if (std::abs(partitionScratch[iB]) < 1.e-12) continue; // no contribution to the derivative if partition function is zero + + double term_x = 0.0, term_y = 0.0, term_z = 0.0; + for( size_t iC = 0; iC < natoms; iC++ ){ + if (iB == iC) continue; + const double rBC = RAB[iC + iB*natoms]; + double mu_BC = (atomDist[iB] - atomDist[iC]) / rBC; + if(mu_BC > - safe_magic_ssf_bound && mu_BC < safe_magic_ssf_bound){ + double t_BC = tFrisch(mu_BC); + double coef = partitionScratch[iB] * t_BC / rBC/ sum; + + term_x += coef * ((mol[iB].x - point[0]) / atomDist[iB] - mu_BC * (mol[iB].x - mol[iC].x) / rBC); + term_y += coef * ((mol[iB].y - point[1]) / atomDist[iB] - mu_BC * (mol[iB].y - mol[iC].y) / rBC); + term_z += coef * ((mol[iB].z - point[2]) / atomDist[iB] - mu_BC * (mol[iB].z - mol[iC].z) / rBC); + + if(iC != iParent) { + auto * weight_deri_iC = weight_deri_ith + 3*iC; + weight_deri_iC[0] += coef * ( (mol[iC].x - point[0]) / atomDist[iC] + mu_BC * (mol[iC].x - mol[iB].x) / rBC ); + weight_deri_iC[1] += coef * ( (mol[iC].y - point[1]) / atomDist[iC] + mu_BC * (mol[iC].y - mol[iB].y) / rBC ); + weight_deri_iC[2] += coef * ( (mol[iC].z - point[2]) / atomDist[iC] + mu_BC * (mol[iC].z - mol[iB].z) / rBC ); + } + + } + } + weight_deri_iB[0] -= term_x; + weight_deri_iB[1] -= term_y; + weight_deri_iB[2] -= term_z; + } + + // Use translational invariance to calculate the derivative for the parent atom + for( size_t iB = 0; iB < natoms; iB++ ) { + if (iB == iParent) continue; + auto * weight_deri_iB = weight_deri_ith + 3*iB; + weight_deri_iParent[0] -= weight_deri_iB[0]; + weight_deri_iParent[1] -= weight_deri_iB[1]; + weight_deri_iParent[2] -= weight_deri_iB[2]; + } + + // Finally, scale the derivatives by the weight + for( size_t iB = 0; iB < natoms; iB++ ) + for (size_t coord = 0; coord < 3; ++coord) + weight_deri_ith[3*iB + coord] *= weight; + + } +} + + + +/** + * 1st derivative with contraction + */ +void reference_becke_weights_1std_contraction_host( + const Molecule& mol, + const MolMeta& meta, + const XCTask& task, + const double* w_times_f, + double* exc_grad_w +){ + + // Becke partition functions + auto hBecke = [](double x) {return 1.5 * x - 0.5 * x * x * x;}; // Eq. 19 + auto gBecke = [&](double x) {return hBecke(hBecke(hBecke(x)));}; // Eq. 
20 f_3 + auto tBecke = [&](double x) { + // for numerical stability (see Jiashu's notes for details) + if (x > 1.0 - 1e-4) + return 0.0; + const double p1 = hBecke(x); + const double p2 = hBecke(p1); + return - 27.0 * (1. + p2) * (1. + p1) * (1. + x) / (1. - x) / (2. + p2) / (2. + p1) / (2. + x); + }; + + const size_t natoms = mol.natoms(); + const auto& RAB = meta.rab(); + std::vector partitionScratch( natoms ); + std::vector atomDist( natoms ); + + for( size_t i = 0; i < task.points.size(); ++i ) { + + const size_t iParent = task.iParent; + const auto& point = task.points[i]; + const auto w_times_f_i = w_times_f[i]; + + // Compute distances of each center to point + for(size_t iA = 0; iA < natoms; iA++) { + + const double da_x = point[0] - mol[iA].x; + const double da_y = point[1] - mol[iA].y; + const double da_z = point[2] - mol[iA].z; + + atomDist[iA] = std::sqrt(da_x*da_x + da_y*da_y + da_z*da_z); + + } + + // Evaluate unnormalized partition functions + std::fill(partitionScratch.begin(),partitionScratch.end(),1.); + for( size_t iA = 0; iA < natoms; iA++ ) + for( size_t jA = 0; jA < iA; jA++ ){ + + double mu = (atomDist[iA] - atomDist[jA]) / RAB[jA + iA*natoms]; + const double g = gBecke(mu); + + partitionScratch[iA] *= 0.5 * (1. - g); + partitionScratch[jA] *= 0.5 * (1. + g); + } + + double sum = 0.; + for( size_t iA = 0; iA < natoms; iA++ ) sum += partitionScratch[iA]; + + // calculate derivative now + for( size_t iB = 0; iB < natoms; iB++ ) { + if (iB == iParent) continue; + double exc_grad_w_iBx = 0.0, exc_grad_w_iBy = 0.0, exc_grad_w_iBz = 0.0; + + const double uB_x = mol[iB].x - point[0]; + const double uB_y = mol[iB].y - point[1]; + const double uB_z = mol[iB].z - point[2]; + + const double uBA_x =mol[iB].x - mol[iParent].x; + const double uBA_y =mol[iB].y - mol[iParent].y; + const double uBA_z =mol[iB].z - mol[iParent].z; + const double rAB = RAB[iB + iParent*natoms]; + + double mu_AB = (atomDist[iParent] - atomDist[iB]) / rAB; + + // first term is - coef1 * nabla_B mu_BA + double coef1 = tBecke(mu_AB) * w_times_f_i; + exc_grad_w_iBx = - coef1 / rAB * (uB_x / atomDist[iB] + mu_AB * uBA_x /rAB); + exc_grad_w_iBy = - coef1 / rAB * (uB_y / atomDist[iB] + mu_AB * uBA_y /rAB); + exc_grad_w_iBz = - coef1 / rAB * (uB_z / atomDist[iB] + mu_AB * uBA_z /rAB); + + // second term is 1/Z * \sum_{C != B} (P(B)t_BC - P(C)t_CB) nabla_B mu_BC + for( size_t iC = 0; iC < natoms; iC++ ){ + if (iB == iC) continue; + + // coef = (P(B)t_BC - P(C)t_CB) + double mu_BC = (atomDist[iB] - atomDist[iC]) / RAB[iC + iB*natoms]; + double t_BC = tBecke(mu_BC); + double t_CB = tBecke(-mu_BC); + double coef = (partitionScratch[iB] *t_BC - partitionScratch[iC] * t_CB)/ sum * w_times_f_i; + + const double rBC = RAB[iC + iB*natoms]; + + exc_grad_w_iBx -= coef * ((mol[iB].x - point[0]) / atomDist[iB] / rBC - mu_BC * (mol[iB].x - mol[iC].x) / rBC / rBC); + exc_grad_w_iBy -= coef * ((mol[iB].y - point[1]) / atomDist[iB] / rBC - mu_BC * (mol[iB].y - mol[iC].y) / rBC / rBC); + exc_grad_w_iBz -= coef * ((mol[iB].z - point[2]) / atomDist[iB] / rBC - mu_BC * (mol[iB].z - mol[iC].z) / rBC / rBC); + } + + #pragma omp atomic + exc_grad_w[3*iB + 0] += exc_grad_w_iBx; + #pragma omp atomic + exc_grad_w[3*iB + 1] += exc_grad_w_iBy; + #pragma omp atomic + exc_grad_w[3*iB + 2] += exc_grad_w_iBz; + // Use translational invariance to calculate the derivative for the parent atom + #pragma omp atomic + exc_grad_w[3*iParent + 0] -= exc_grad_w_iBx; + #pragma omp atomic + exc_grad_w[3*iParent + 1] -= exc_grad_w_iBy; + #pragma omp atomic 
+ exc_grad_w[3*iParent + 2] -= exc_grad_w_iBz; + + } + } + +} + + +void reference_ssf_weights_1std_contraction_host( + const Molecule& mol, + const MolMeta& meta, + const XCTask& task, + const double* w_times_f, + double* exc_grad_w +){ + + const double safe_magic_ssf_bound = integrator::magic_ssf_factor<> - 1.e-4; + const double w_times_f_thresh = 1.e-12; + const double weight_tol = integrator::ssf_weight_tol; + + auto gFrisch = [&](double x) { + const double s_x = x / integrator::magic_ssf_factor<>; + const double s_x2 = s_x * s_x; + const double s_x3 = s_x * s_x2; + const double s_x5 = s_x3 * s_x2; + const double s_x7 = s_x5 * s_x2; + + return (35.*(s_x - s_x3) + 21.*s_x5 - 5.*s_x7) / 16.; + }; + + auto tFrisch = [&](double x) { + const double s_x = x / integrator::magic_ssf_factor<>; + const double s_x2 = s_x * s_x; + const double s_x3 = s_x * s_x2; + const double numerator = (35.) * (s_x3 + (3.) * s_x2 + (3.) * s_x + (1.)); + const double denominator = (x - integrator::magic_ssf_factor<>) * ((5.)*s_x3 + (20.)*s_x2 + (29.)*s_x + (16.)); + return numerator / denominator ; + }; + + const size_t natoms = mol.natoms(); + const auto& RAB = meta.rab(); + std::vector partitionScratch( natoms ); + std::vector atomDist( natoms ); + + for( size_t i = 0; i < task.points.size(); ++i ) { + const auto& w_times_f_i = w_times_f[i]; + if (fabs(w_times_f_i) < w_times_f_thresh) continue; // weight derivative = 0 when p_A = 0 + const size_t iParent = task.iParent; + const auto& point = task.points[i]; + + const auto dist_cutoff = 0.5 * (1-integrator::magic_ssf_factor<>) * task.dist_nearest; + + // Compute dist to parent atom + { + const double da_x = point[0] - mol[iParent].x; + const double da_y = point[1] - mol[iParent].y; + const double da_z = point[2] - mol[iParent].z; + + atomDist[iParent] = std::sqrt(da_x*da_x + da_y*da_y + da_z*da_z); + } + + if( atomDist[iParent] < dist_cutoff ) continue; // weight derivative = 0 when p_A = 1 + + // Compute distances of each center to point + for(size_t iA = 0; iA < natoms; iA++) { + + if( iA == iParent ) continue; + + const double da_x = point[0] - mol[iA].x; + const double da_y = point[1] - mol[iA].y; + const double da_z = point[2] - mol[iA].z; + + atomDist[iA] = std::sqrt(da_x*da_x + da_y*da_y + da_z*da_z); + + } + + // Evaluate unnormalized partition functions + std::fill(partitionScratch.begin(),partitionScratch.end(),1.); + + for( size_t iA = 0; iA < natoms; iA++ ) + for( size_t jA = 0; jA < iA; jA++ ) + if( partitionScratch[iA] > weight_tol or + partitionScratch[jA] > weight_tol ) { + + const double mu = (atomDist[iA] - atomDist[jA]) / RAB[jA + iA*natoms]; + + if( mu <= -integrator::magic_ssf_factor<> ) { + + partitionScratch[jA] = 0.; + + } else if (mu >= integrator::magic_ssf_factor<>) { + + partitionScratch[iA] = 0.; + + } else { + + double g = 0.5 * ( 1. - gFrisch(mu) ); + partitionScratch[iA] *= g; + partitionScratch[jA] *= 1. 
- g; + + } + + } + + double sum = 0.; + for( size_t iA = 0; iA < natoms; iA++ ) sum += partitionScratch[iA]; + + // calculate derivative now + for( size_t iB = 0; iB < natoms; iB++ ) { + if (iB == iParent) continue; + double exc_grad_w_iBx = 0.0, exc_grad_w_iBy = 0.0, exc_grad_w_iBz = 0.0; + + const double rAB = RAB[iB + iParent*natoms]; + double rAB_inv = 1.0 / rAB; + double mu_AB = (atomDist[iParent] - atomDist[iB]) * rAB_inv ; + if( fabs(mu_AB) < safe_magic_ssf_bound) { + const double uB_x = mol[iB].x - point[0]; + const double uB_y = mol[iB].y - point[1]; + const double uB_z = mol[iB].z - point[2]; + + const double uBA_x =mol[iB].x - mol[iParent].x; + const double uBA_y =mol[iB].y - mol[iParent].y; + const double uBA_z =mol[iB].z - mol[iParent].z; + + // first term is - coef1 * nabla_B mu_BA + double coef1 = tFrisch(mu_AB) / rAB * (partitionScratch[iParent]-sum)/sum * w_times_f_i / atomDist[iB]; + exc_grad_w_iBx = coef1 * (uB_x + mu_AB * uBA_x * rAB_inv * atomDist[iB]); + exc_grad_w_iBy = coef1 * (uB_y + mu_AB * uBA_y * rAB_inv * atomDist[iB]); + exc_grad_w_iBz = coef1 * (uB_z + mu_AB * uBA_z * rAB_inv * atomDist[iB]); + } + + if (partitionScratch[iB] > weight_tol){ + for( size_t iC = 0; iC < natoms; iC++ ){ + if (iB == iC) continue; + const double rBC = RAB[iC + iB*natoms]; + double mu_BC = (atomDist[iB] - atomDist[iC]) / rBC; + if(fabs(mu_BC) < safe_magic_ssf_bound){ + double t_BC = tFrisch(mu_BC); + double coef = partitionScratch[iB] * t_BC / rBC/ sum * w_times_f_i; + + exc_grad_w_iBx -= coef * ((mol[iB].x - point[0]) / atomDist[iB] - mu_BC * (mol[iB].x - mol[iC].x) / rBC); + exc_grad_w_iBy -= coef * ((mol[iB].y - point[1]) / atomDist[iB] - mu_BC * (mol[iB].y - mol[iC].y) / rBC); + exc_grad_w_iBz -= coef * ((mol[iB].z - point[2]) / atomDist[iB] - mu_BC * (mol[iB].z - mol[iC].z) / rBC); + + if(iC != iParent) { + + double C_x = coef * ((mol[iC].x - point[0]) / atomDist[iC] + mu_BC * (mol[iC].x - mol[iB].x) / rBC); + double C_y = coef * ((mol[iC].y - point[1]) / atomDist[iC] + mu_BC * (mol[iC].y - mol[iB].y) / rBC); + double C_z = coef * ((mol[iC].z - point[2]) / atomDist[iC] + mu_BC * (mol[iC].z - mol[iB].z) / rBC); + // Update exc_grad_w_iC + #pragma omp atomic + exc_grad_w[3*iC + 0] += C_x; + #pragma omp atomic + exc_grad_w[3*iC + 1] += C_y; + #pragma omp atomic + exc_grad_w[3*iC + 2] += C_z; + // Update exc_grad_w for the parent atom + #pragma omp atomic + exc_grad_w[3*iParent + 0] -= C_x; + #pragma omp atomic + exc_grad_w[3*iParent + 1] -= C_y; + #pragma omp atomic + exc_grad_w[3*iParent + 2] -= C_z; + } + + } + } + } + + #pragma omp atomic + exc_grad_w[3*iB + 0] += exc_grad_w_iBx; + #pragma omp atomic + exc_grad_w[3*iB + 1] += exc_grad_w_iBy; + #pragma omp atomic + exc_grad_w[3*iB + 2] += exc_grad_w_iBz; + // Use translational invariance to calculate the derivative for the parent atom + #pragma omp atomic + exc_grad_w[3*iParent + 0] -= exc_grad_w_iBx; + #pragma omp atomic + exc_grad_w[3*iParent + 1] -= exc_grad_w_iBy; + #pragma omp atomic + exc_grad_w[3*iParent + 2] -= exc_grad_w_iBz; + + } + } + +} + + + } diff --git a/src/xc_integrator/local_work_driver/host/reference/weights.hpp b/src/xc_integrator/local_work_driver/host/reference/weights.hpp index f2f8c46d..7b79a156 100644 --- a/src/xc_integrator/local_work_driver/host/reference/weights.hpp +++ b/src/xc_integrator/local_work_driver/host/reference/weights.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of 
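For orientation, the quantity both contraction routines accumulate can be summarized as follows; the notation (cell function P_A, switching function s and its logarithmic derivative t) is an editorial restatement inferred from the loops above, not normative documentation.

\[
  w_A(\mathbf r) = \frac{P_A(\mathbf r)}{\sum_C P_C(\mathbf r)}, \qquad
  P_A(\mathbf r) = \prod_{B\neq A} s(\mu_{AB}), \qquad
  \mu_{AB} = \frac{r_A - r_B}{R_{AB}},
\]
\[
  \texttt{exc\_grad\_w}[3B+k] \;\mathrel{+}= \sum_i (w f)_i \,
  \frac{\partial \ln w_{A(i)}(\mathbf r_i)}{\partial (R_B)_k},
  \qquad t(\mu) = \frac{s'(\mu)}{s(\mu)}, \quad s(\mu) = \tfrac12\,[\,1 - g(\mu)\,],
\]

with g the Becke or Frisch (SSF) switching polynomial, A(i) the parent atom of point i, and the parent-atom components recovered through translational invariance, \(\partial_{R_A} = -\sum_{B\neq A} \partial_{R_B}\), exactly as the atomic updates above do. The SSF routine rearranges the same algebra slightly differently, but the accumulated quantity is the same.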
- * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -32,4 +36,36 @@ void reference_lko_weights_host( task_iterator task_end ); +void reference_becke_weights_1st_derivative_host( + const Molecule& mol, + const MolMeta& meta, + const XCTask& task, + double* weight_deri +); + +void reference_ssf_weights_1st_derivative_host( + const Molecule& mol, + const MolMeta& meta, + const XCTask& task, + double* weight_deri +); + +// Becke weights 1st derivative contracted with integrator +void reference_becke_weights_1std_contraction_host( + const Molecule& mol, + const MolMeta& meta, + const XCTask& task, + const double* w_times_f, + double* exc_grad_w +); + +// SSF weights 1st derivative contracted with integrator +void reference_ssf_weights_1std_contraction_host( + const Molecule& mol, + const MolMeta& meta, + const XCTask& task, + const double* w_times_f, + double* exc_grad_w +); + } diff --git a/src/xc_integrator/local_work_driver/host/reference_local_host_work_driver.cxx b/src/xc_integrator/local_work_driver/host/reference_local_host_work_driver.cxx index c83d3800..192cfcd3 100644 --- a/src/xc_integrator/local_work_driver/host/reference_local_host_work_driver.cxx +++ b/src/xc_integrator/local_work_driver/host/reference_local_host_work_driver.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
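As a usage sketch only: a host-side driver might invoke the per-task contraction declared above as below. The task loop, the construction of w_times_f (quadrature weight times the contracted XC integrand at each grid point), the include path, and the namespace qualification are assumptions for illustration; only the function signature comes from this patch.

```cpp
#include <vector>
#include "reference/weights.hpp"   // include path assumed

// Hypothetical driver: accumulate the weight-derivative contribution to the
// XC gradient over a set of tasks. exc_grad_w has length 3*natoms and is
// assumed to be zero-initialized by the caller.
void contract_weight_gradient( const GauXC::Molecule& mol,
                               const GauXC::MolMeta&  meta,
                               const std::vector<GauXC::XCTask>& tasks,
                               const std::vector<std::vector<double>>& w_times_f,
                               double* exc_grad_w ) {
  #pragma omp parallel for
  for( size_t it = 0; it < tasks.size(); ++it ) {
    // Becke partitioning shown; the SSF variant has an identical signature.
    // The routine guards its own updates with #pragma omp atomic.
    GauXC::reference_becke_weights_1std_contraction_host(
      mol, meta, tasks[it], w_times_f[it].data(), exc_grad_w );
  }
}
```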
* * See LICENSE.txt for details */ @@ -52,6 +56,21 @@ namespace GauXC { } } + void ReferenceLocalHostWorkDriver::eval_weight_1st_deriv_contracted( + XCWeightAlg weight_alg, const Molecule& mol, const MolMeta& meta, + const XCTask& task, const double* w_times_f, double* exc_grad_w ) { + switch( weight_alg ) { + case XCWeightAlg::Becke: + reference_becke_weights_1std_contraction_host( mol, meta, task, w_times_f, exc_grad_w ); + break; + case XCWeightAlg::SSF: + reference_ssf_weights_1std_contraction_host( mol, meta, task, w_times_f, exc_grad_w ); + break; + default: + GAUXC_GENERIC_EXCEPTION("Weight Alg Not Supported"); + } + } + // Collocation void ReferenceLocalHostWorkDriver::eval_collocation( size_t npts, size_t nshells, @@ -1028,6 +1047,541 @@ void ReferenceLocalHostWorkDriver::eval_zmat_gga_vxc_gks( size_t npts, size_t nb } +void ReferenceLocalHostWorkDriver::eval_tmat_lda_vxc_rks( size_t npts, const double* v2rho2, const double* trho, double* A){ + for( int32_t i = 0; i < (int32_t)npts; ++i ) + A[i] = v2rho2[i] * trho[i]; +} + +void ReferenceLocalHostWorkDriver::eval_tmat_lda_vxc_uks( size_t npts, const double* v2rho2, const double* trho, double* A){ + for( int32_t i = 0; i < (int32_t)npts; ++i ) { + A[2*i] = v2rho2[3*i] * trho[2*i] + v2rho2[3*i+1] * trho[2*i+1]; + A[2*i+1] = v2rho2[3*i+1] * trho[2*i] + v2rho2[3*i+2] * trho[2*i+1]; + } +} + +void ReferenceLocalHostWorkDriver::eval_tmat_gga_vxc_rks( size_t npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2gamma2, + const double* trho, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B ){ + + for( int32_t i = 0; i < (int32_t)npts; ++i ) { + + + //calculate trial gamma + const auto tgamma = tdden_x_eval[i] * dden_x_eval[i] + tdden_y_eval[i] * dden_y_eval[i] + tdden_z_eval[i] * dden_z_eval[i]; + + A[i] = v2rho2[i] * trho[i] + 2 * v2rhogamma[i] * tgamma; + + auto B_coef = v2rhogamma[i] * trho[i] + 2 * v2gamma2[i] * tgamma; + + B[i * 3] = 2 * B_coef * dden_x_eval[i] + 2 * vgamma[i] * tdden_x_eval[i]; + B[i * 3 + 1] = 2 * B_coef * dden_y_eval[i] + 2 * vgamma[i] * tdden_y_eval[i]; + B[i * 3 + 2] = 2 * B_coef * dden_z_eval[i] + 2 * vgamma[i] * tdden_z_eval[i]; + + } +} + + +void ReferenceLocalHostWorkDriver::eval_tmat_gga_vxc_uks( size_t npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2gamma2, + const double* trho, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B ){ + + for( int32_t i = 0; i < (int32_t)npts; ++i ) { + + // convert dden_x_eval, dden_y_eval, dden_z_eval to two-spinor representation + const auto dden_x_eval_a = 0.5 * (dden_x_eval[2*i] + dden_x_eval[2*i+1]); + const auto dden_x_eval_b = 0.5 * (dden_x_eval[2*i] - dden_x_eval[2*i+1]); + const auto dden_y_eval_a = 0.5 * (dden_y_eval[2*i] + dden_y_eval[2*i+1]); + const auto dden_y_eval_b = 0.5 * (dden_y_eval[2*i] - dden_y_eval[2*i+1]); + const auto dden_z_eval_a = 0.5 * (dden_z_eval[2*i] + dden_z_eval[2*i+1]); + const auto dden_z_eval_b = 0.5 * (dden_z_eval[2*i] - dden_z_eval[2*i+1]); + // convert tdden_x_eval, tdden_y_eval, tdden_z_eval to two-spinor representation + const auto tdden_x_eval_a = 0.5 * (tdden_x_eval[2*i] + tdden_x_eval[2*i+1]); + const auto tdden_x_eval_b = 0.5 * (tdden_x_eval[2*i] - tdden_x_eval[2*i+1]); + 
const auto tdden_y_eval_a = 0.5 * (tdden_y_eval[2*i] + tdden_y_eval[2*i+1]); + const auto tdden_y_eval_b = 0.5 * (tdden_y_eval[2*i] - tdden_y_eval[2*i+1]); + const auto tdden_z_eval_a = 0.5 * (tdden_z_eval[2*i] + tdden_z_eval[2*i+1]); + const auto tdden_z_eval_b = 0.5 * (tdden_z_eval[2*i] - tdden_z_eval[2*i+1]); + + //calculate trial gamma + const auto tgamma_aa = tdden_x_eval_a * dden_x_eval_a + tdden_y_eval_a * dden_y_eval_a + tdden_z_eval_a * dden_z_eval_a; + const auto tgamma_ab = tdden_x_eval_a * dden_x_eval_b + tdden_y_eval_a * dden_y_eval_b + tdden_z_eval_a * dden_z_eval_b + + tdden_x_eval_b * dden_x_eval_a + tdden_y_eval_b * dden_y_eval_a + tdden_z_eval_b * dden_z_eval_a; + const auto tgamma_bb = tdden_x_eval_b * dden_x_eval_b + tdden_y_eval_b * dden_y_eval_b + tdden_z_eval_b * dden_z_eval_b; + const auto trho_a = trho[2*i]; + const auto trho_b = trho[2*i+1]; + + const auto v2rho2_a_a = v2rho2[3*i]; + const auto v2rho2_a_b = v2rho2[3*i+1]; + const auto v2rho2_b_b = v2rho2[3*i+2]; + const auto v2rhogamma_a_aa = v2rhogamma[6*i]; + const auto v2rhogamma_a_ab = v2rhogamma[6*i+1]; + const auto v2rhogamma_a_bb = v2rhogamma[6*i+2]; + const auto v2rhogamma_b_aa = v2rhogamma[6*i+3]; + const auto v2rhogamma_b_ab = v2rhogamma[6*i+4]; + const auto v2rhogamma_b_bb = v2rhogamma[6*i+5]; + const auto v2gamma2_aa_aa = v2gamma2[6*i]; + const auto v2gamma2_aa_ab = v2gamma2[6*i+1]; + const auto v2gamma2_aa_bb = v2gamma2[6*i+2]; + const auto v2gamma2_ab_ab = v2gamma2[6*i+3]; + const auto v2gamma2_ab_bb = v2gamma2[6*i+4]; + const auto v2gamma2_bb_bb = v2gamma2[6*i+5]; + const auto vgamma_aa = vgamma[3*i]; + const auto vgamma_ab = vgamma[3*i+1]; + const auto vgamma_bb = vgamma[3*i+2]; + + A[2 * i] = v2rho2_a_a * trho_a + 2 * v2rhogamma_a_aa * tgamma_aa + v2rhogamma_a_ab * tgamma_ab + + v2rho2_a_b * trho_b + 2 * v2rhogamma_a_bb * tgamma_bb; + A[2 * i + 1] = v2rho2_b_b * trho_b + 2 * v2rhogamma_b_bb * tgamma_bb + v2rhogamma_b_ab * tgamma_ab + + v2rho2_a_b * trho_a + 2 * v2rhogamma_b_aa * tgamma_aa; + + auto B_coef1 = v2rhogamma_a_aa * trho_a + 2 * v2gamma2_aa_aa * tgamma_aa + v2gamma2_aa_ab * tgamma_ab + + v2rhogamma_b_aa * trho_b + 2 * v2gamma2_aa_bb * tgamma_bb; + auto B_coef2 = v2rhogamma_a_ab * trho_a + 2 * v2gamma2_aa_ab * tgamma_aa + v2gamma2_ab_ab * tgamma_ab + + v2rhogamma_b_ab * trho_b + 2 * v2gamma2_ab_bb * tgamma_bb; + + B[i * 6] = 2 * B_coef1 * dden_x_eval_a + B_coef2 * dden_x_eval_b + 2 * vgamma_aa * tdden_x_eval_a + vgamma_ab * tdden_x_eval_b; + B[i * 6 + 1] = 2 * B_coef1 * dden_y_eval_a + B_coef2 * dden_y_eval_b + 2 * vgamma_aa * tdden_y_eval_a + vgamma_ab * tdden_y_eval_b; + B[i * 6 + 2] = 2 * B_coef1 * dden_z_eval_a + B_coef2 * dden_z_eval_b + 2 * vgamma_aa * tdden_z_eval_a + vgamma_ab * tdden_z_eval_b; + + B_coef1 = v2rhogamma_b_bb * trho_b + 2 * v2gamma2_bb_bb * tgamma_bb + v2gamma2_ab_bb * tgamma_ab + + v2rhogamma_a_bb * trho_a + 2 * v2gamma2_aa_bb * tgamma_aa; + B_coef2 = v2rhogamma_b_ab * trho_b + 2 * v2gamma2_ab_bb * tgamma_bb + v2gamma2_ab_ab * tgamma_ab + + v2rhogamma_a_ab * trho_a + 2 * v2gamma2_aa_ab * tgamma_aa; + + B[i * 6 + 3] = 2 * B_coef1 * dden_x_eval_b + B_coef2 * dden_x_eval_a + 2 * vgamma_bb * tdden_x_eval_b + vgamma_ab * tdden_x_eval_a; + B[i * 6 + 4] = 2 * B_coef1 * dden_y_eval_b + B_coef2 * dden_y_eval_a + 2 * vgamma_bb * tdden_y_eval_b + vgamma_ab * tdden_y_eval_a; + B[i * 6 + 5] = 2 * B_coef1 * dden_z_eval_b + B_coef2 * dden_z_eval_a + 2 * vgamma_bb * tdden_z_eval_b + vgamma_ab * tdden_z_eval_a; + } +} + + +void ReferenceLocalHostWorkDriver::eval_tmat_mgga_vxc_rks( 
size_t npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2rholapl, const double* v2rhotau, + const double* v2gamma2, const double* v2gammalapl, const double* v2gammatau, + const double* v2lapl2, const double* v2lapltau, const double* v2tau2, + const double* trho, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, const double* ttau, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B, double* C){ + + for( int32_t i = 0; i < (int32_t)npts; ++i ) { + + //calculate trial gamma + const auto tgamma = tdden_x_eval[i] * dden_x_eval[i] + tdden_y_eval[i] * dden_y_eval[i] + tdden_z_eval[i] * dden_z_eval[i]; + + A[i] = v2rho2[i] * trho[i] + 2 * v2rhogamma[i] * tgamma + v2rhotau[i] * ttau[i]; + C[i] = v2rhotau[i] * trho[i] + 2 * v2gammatau[i] * tgamma + v2tau2[i] * ttau[i]; + + auto B_coef = v2rhogamma[i] * trho[i] + 2 * v2gamma2[i] * tgamma + v2gammatau[i] * ttau[i]; + + B[i * 3] = 2 * B_coef * dden_x_eval[i] + 2 * vgamma[i] * tdden_x_eval[i]; + B[i * 3 + 1] = 2 * B_coef * dden_y_eval[i] + 2 * vgamma[i] * tdden_y_eval[i]; + B[i * 3 + 2] = 2 * B_coef * dden_z_eval[i] + 2 * vgamma[i] * tdden_z_eval[i]; + + } + +} + + +void ReferenceLocalHostWorkDriver::eval_tmat_mgga_vxc_uks( size_t npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2rholapl, const double* v2rhotau, + const double* v2gamma2, const double* v2gammalapl, const double* v2gammatau, + const double* v2lapl2, const double* v2lapltau, const double* v2tau2, + const double* trho, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, const double* ttau, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B, double* C){ + + // Laplacian is not supported now + if( v2rholapl != nullptr || v2gammalapl != nullptr || v2lapltau != nullptr || v2lapl2 != nullptr ) + GAUXC_GENERIC_EXCEPTION(std::string("Laplacian not supported")); + + for( int32_t i = 0; i < (int32_t)npts; ++i ) { + + // convert dden_x_eval, dden_y_eval, dden_z_eval to two-spinor representation + const auto dden_x_eval_a = 0.5 * (dden_x_eval[2*i] + dden_x_eval[2*i+1]); + const auto dden_x_eval_b = 0.5 * (dden_x_eval[2*i] - dden_x_eval[2*i+1]); + const auto dden_y_eval_a = 0.5 * (dden_y_eval[2*i] + dden_y_eval[2*i+1]); + const auto dden_y_eval_b = 0.5 * (dden_y_eval[2*i] - dden_y_eval[2*i+1]); + const auto dden_z_eval_a = 0.5 * (dden_z_eval[2*i] + dden_z_eval[2*i+1]); + const auto dden_z_eval_b = 0.5 * (dden_z_eval[2*i] - dden_z_eval[2*i+1]); + // convert tdden_x_eval, tdden_y_eval, tdden_z_eval to two-spinor representation + const auto tdden_x_eval_a = 0.5 * (tdden_x_eval[2*i] + tdden_x_eval[2*i+1]); + const auto tdden_x_eval_b = 0.5 * (tdden_x_eval[2*i] - tdden_x_eval[2*i+1]); + const auto tdden_y_eval_a = 0.5 * (tdden_y_eval[2*i] + tdden_y_eval[2*i+1]); + const auto tdden_y_eval_b = 0.5 * (tdden_y_eval[2*i] - tdden_y_eval[2*i+1]); + const auto tdden_z_eval_a = 0.5 * (tdden_z_eval[2*i] + tdden_z_eval[2*i+1]); + const auto tdden_z_eval_b = 0.5 * (tdden_z_eval[2*i] - tdden_z_eval[2*i+1]); + + //calculate trial gamma + const auto tgamma_aa = tdden_x_eval_a * dden_x_eval_a + tdden_y_eval_a * dden_y_eval_a + tdden_z_eval_a * dden_z_eval_a; + const auto tgamma_ab = tdden_x_eval_a * dden_x_eval_b + tdden_y_eval_a * dden_y_eval_b + tdden_z_eval_a * dden_z_eval_b + + tdden_x_eval_b * dden_x_eval_a + tdden_y_eval_b * dden_y_eval_a + tdden_z_eval_b * 
dden_z_eval_a; + const auto tgamma_bb = tdden_x_eval_b * dden_x_eval_b + tdden_y_eval_b * dden_y_eval_b + tdden_z_eval_b * dden_z_eval_b; + const auto trho_a = trho[2*i]; + const auto trho_b = trho[2*i+1]; + const auto ttau_a = ttau[2*i]; + const auto ttau_b = ttau[2*i+1]; + + const auto v2rho2_a_a = v2rho2[3*i]; + const auto v2rho2_a_b = v2rho2[3*i+1]; + const auto v2rho2_b_b = v2rho2[3*i+2]; + const auto v2rhogamma_a_aa = v2rhogamma[6*i]; + const auto v2rhogamma_a_ab = v2rhogamma[6*i+1]; + const auto v2rhogamma_a_bb = v2rhogamma[6*i+2]; + const auto v2rhogamma_b_aa = v2rhogamma[6*i+3]; + const auto v2rhogamma_b_ab = v2rhogamma[6*i+4]; + const auto v2rhogamma_b_bb = v2rhogamma[6*i+5]; + const auto v2gamma2_aa_aa = v2gamma2[6*i]; + const auto v2gamma2_aa_ab = v2gamma2[6*i+1]; + const auto v2gamma2_aa_bb = v2gamma2[6*i+2]; + const auto v2gamma2_ab_ab = v2gamma2[6*i+3]; + const auto v2gamma2_ab_bb = v2gamma2[6*i+4]; + const auto v2gamma2_bb_bb = v2gamma2[6*i+5]; + const auto vgamma_aa = vgamma[3*i]; + const auto vgamma_ab = vgamma[3*i+1]; + const auto vgamma_bb = vgamma[3*i+2]; + const auto v2rhotau_a_a = v2rhotau[4*i]; + const auto v2rhotau_a_b = v2rhotau[4*i+1]; + const auto v2rhotau_b_a = v2rhotau[4*i+2]; + const auto v2rhotau_b_b = v2rhotau[4*i+3]; + const auto v2tau2_a_a = v2tau2[3*i]; + const auto v2tau2_a_b = v2tau2[3*i+1]; + const auto v2tau2_b_b = v2tau2[3*i+2]; + const auto v2gammatau_aa_a = v2gammatau[6*i]; + const auto v2gammatau_aa_b = v2gammatau[6*i+1]; + const auto v2gammatau_ab_a = v2gammatau[6*i+2]; + const auto v2gammatau_ab_b = v2gammatau[6*i+3]; + const auto v2gammatau_bb_a = v2gammatau[6*i+4]; + const auto v2gammatau_bb_b = v2gammatau[6*i+5]; + + + A[2 * i] = v2rho2_a_a * trho_a + 2 * v2rhogamma_a_aa * tgamma_aa + v2rhogamma_a_ab * tgamma_ab + v2rhotau_a_a * ttau_a + + v2rho2_a_b * trho_b + 2 * v2rhogamma_a_bb * tgamma_bb + v2rhotau_a_b * ttau_b; + A[2 * i + 1] = v2rho2_b_b * trho_b + 2 * v2rhogamma_b_bb * tgamma_bb + v2rhogamma_b_ab * tgamma_ab + v2rhotau_b_b * ttau_b + + v2rho2_a_b * trho_a + 2 * v2rhogamma_b_aa * tgamma_aa + v2rhotau_b_a * ttau_a; + + C[2 * i] = v2rhotau_a_a * trho_a + 2 * v2gammatau_aa_a * tgamma_aa + v2gammatau_ab_a * tgamma_ab + v2tau2_a_a * ttau_a + + v2rhotau_b_a * trho_b + 2 * v2gammatau_bb_a * tgamma_bb + v2tau2_a_b * ttau_b; + C[2 * i + 1] = v2rhotau_b_b * trho_b + 2 * v2gammatau_bb_b * tgamma_bb + v2gammatau_ab_b * tgamma_ab + v2tau2_b_b * ttau_b + + v2rhotau_a_b * trho_a + 2 * v2gammatau_aa_b * tgamma_aa + v2tau2_a_b * ttau_a; + + auto B_coef1 = v2rhogamma_a_aa * trho_a + 2 * v2gamma2_aa_aa * tgamma_aa + v2gamma2_aa_ab * tgamma_ab + v2gammatau_aa_a * ttau_a + + v2rhogamma_b_aa * trho_b + 2 * v2gamma2_aa_bb * tgamma_bb + v2gammatau_aa_b * ttau_b; + auto B_coef2 = v2rhogamma_a_ab * trho_a + 2 * v2gamma2_aa_ab * tgamma_aa + v2gamma2_ab_ab * tgamma_ab + v2gammatau_ab_a * ttau_a + + v2rhogamma_b_ab * trho_b + 2 * v2gamma2_ab_bb * tgamma_bb + v2gammatau_ab_b * ttau_b; + + B[i * 6] = 2 * B_coef1 * dden_x_eval_a + B_coef2 * dden_x_eval_b + 2 * vgamma_aa * tdden_x_eval_a + vgamma_ab * tdden_x_eval_b; + B[i * 6 + 1] = 2 * B_coef1 * dden_y_eval_a + B_coef2 * dden_y_eval_b + 2 * vgamma_aa * tdden_y_eval_a + vgamma_ab * tdden_y_eval_b; + B[i * 6 + 2] = 2 * B_coef1 * dden_z_eval_a + B_coef2 * dden_z_eval_b + 2 * vgamma_aa * tdden_z_eval_a + vgamma_ab * tdden_z_eval_b; + + B_coef1 = v2rhogamma_b_bb * trho_b + 2 * v2gamma2_bb_bb * tgamma_bb + v2gamma2_ab_bb * tgamma_ab + v2gammatau_bb_b * ttau_b + + v2rhogamma_a_bb * trho_a + 2 * v2gamma2_aa_bb * tgamma_aa 
+ v2gammatau_bb_a * ttau_a; + B_coef2 = v2rhogamma_b_ab * trho_b + 2 * v2gamma2_ab_bb * tgamma_bb + v2gamma2_ab_ab * tgamma_ab + v2gammatau_ab_b * ttau_b + + v2rhogamma_a_ab * trho_a + 2 * v2gamma2_aa_ab * tgamma_aa + v2gammatau_ab_a * ttau_a; + + B[i * 6 + 3] = 2 * B_coef1 * dden_x_eval_b + B_coef2 * dden_x_eval_a + 2 * vgamma_bb * tdden_x_eval_b + vgamma_ab * tdden_x_eval_a; + B[i * 6 + 4] = 2 * B_coef1 * dden_y_eval_b + B_coef2 * dden_y_eval_a + 2 * vgamma_bb * tdden_y_eval_b + vgamma_ab * tdden_y_eval_a; + B[i * 6 + 5] = 2 * B_coef1 * dden_z_eval_b + B_coef2 * dden_z_eval_a + 2 * vgamma_bb * tdden_z_eval_b + vgamma_ab * tdden_z_eval_a; + + } +} + + +// Eval Z Matrix LDA VXC for two-spinors +void ReferenceLocalHostWorkDriver::eval_zmat_lda_vxc_uks_ts( size_t npts, size_t nbf, + const double* vrho, const double* basis_eval, double* Za, size_t ldza, + double* Zb, size_t ldzb ) { + blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Za, ldza); + blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Zb, ldzb); + for( int32_t i = 0; i < (int32_t)npts; ++i ) { + //eq. 56 https://doi.org/10.1140/epjb/e2018-90170-1 + GauXC::blas::scal( nbf, 0.5 * vrho[2*i], Za + i*ldza, 1 ); + GauXC::blas::scal( nbf, 0.5 * vrho[2*i+1], Zb + i*ldzb, 1 ); + } +} + +void ReferenceLocalHostWorkDriver::eval_Bvec_gga_vxc_rks_ts( size_t npts, const double* vgamma, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* B ){ + + for( int32_t i = 0; i < (int32_t)npts; ++i ) { + B[i*3] = 2 * vgamma[i] * dden_x_eval[i]; + B[i*3+1] = 2 * vgamma[i] * dden_y_eval[i]; + B[i*3+2] = 2 * vgamma[i]* dden_z_eval[i]; + } +} + +void ReferenceLocalHostWorkDriver::eval_zmat_gga_vxc_rks_ts( size_t npts, size_t nbf, + const double* A, const double* B, const double* basis_eval, + const double* dbasis_x_eval, const double* dbasis_y_eval, + const double* dbasis_z_eval, double* Z, + size_t ldz) { + + if( ldz != nbf ) GAUXC_GENERIC_EXCEPTION(std::string("Invalid Dims")); + blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Z, ldz); + + for( int32_t i = 0; i < (int32_t)npts; ++i ) { + + const int32_t ioff = i * nbf; + + auto* z_col = Z + ioff; + auto* bf_x_col = dbasis_x_eval + ioff; + auto* bf_y_col = dbasis_y_eval + ioff; + auto* bf_z_col = dbasis_z_eval + ioff; + + GauXC::blas::scal( nbf, 0.5*A[i], z_col, 1 ); + + blas::axpy( nbf, B[i*3], bf_x_col, 1, z_col, 1 ); + blas::axpy( nbf, B[i*3+1], bf_y_col, 1, z_col, 1 ); + blas::axpy( nbf, B[i*3+2], bf_z_col, 1, z_col, 1 ); + + } +} + + +void ReferenceLocalHostWorkDriver::eval_Bvec_gga_vxc_uks_ts( size_t npts, const double* vgamma, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* B ){ + + + for( int32_t i = 0; i < (int32_t)npts; ++i ) { + const auto gga_fact_aa = vgamma[3*i]; + const auto gga_fact_ab = vgamma[3*i+1]; + const auto gga_fact_bb = vgamma[3*i+2]; + + // dden_x_eval, dden_y_eval, dden_z_eval are all still in Pauli representation + // so we need to convert them to the two spinor representation + const auto dden_x_eval_a = 0.5 * (dden_x_eval[2*i] + dden_x_eval[2*i+1]); + const auto dden_x_eval_b = 0.5 * (dden_x_eval[2*i] - dden_x_eval[2*i+1]); + const auto dden_y_eval_a = 0.5 * (dden_y_eval[2*i] + dden_y_eval[2*i+1]); + const auto dden_y_eval_b = 0.5 * (dden_y_eval[2*i] - dden_y_eval[2*i+1]); + const auto dden_z_eval_a = 0.5 * (dden_z_eval[2*i] + dden_z_eval[2*i+1]); + const auto dden_z_eval_b = 0.5 * (dden_z_eval[2*i] - dden_z_eval[2*i+1]); + + B[i*6] = 2 * gga_fact_aa * dden_x_eval_a + gga_fact_ab * dden_x_eval_b; + B[i*6+1] = 
2 * gga_fact_aa * dden_y_eval_a + gga_fact_ab * dden_y_eval_b; + B[i*6+2] = 2 * gga_fact_aa * dden_z_eval_a + gga_fact_ab * dden_z_eval_b; + + B[i*6+3] = 2 * gga_fact_bb * dden_x_eval_b + gga_fact_ab * dden_x_eval_a; + B[i*6+4] = 2 * gga_fact_bb * dden_y_eval_b + gga_fact_ab * dden_y_eval_a; + B[i*6+5] = 2 * gga_fact_bb * dden_z_eval_b + gga_fact_ab * dden_z_eval_a; + } +} +void ReferenceLocalHostWorkDriver::eval_zmat_gga_vxc_uks_ts( size_t npts, size_t nbf, + const double* A, const double* B, const double* basis_eval, + const double* dbasis_x_eval, const double* dbasis_y_eval, + const double* dbasis_z_eval, double* Za, + size_t ldza, double* Zb, size_t ldzb ) { + + + if( ldza != nbf ) GAUXC_GENERIC_EXCEPTION(std::string("Invalid Dims")); + if( ldzb != nbf ) GAUXC_GENERIC_EXCEPTION(std::string("Invalid Dims")); + blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Za, ldza); + blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Zb, ldzb); + + for( int32_t i = 0; i < (int32_t)npts; ++i ) { + + const int32_t ioff = i * nbf; + + auto* za_col = Za + ioff; + auto* zb_col = Zb + ioff; + auto* bf_x_col = dbasis_x_eval + ioff; + auto* bf_y_col = dbasis_y_eval + ioff; + auto* bf_z_col = dbasis_z_eval + ioff; + + GauXC::blas::scal( nbf, 0.5*A[2*i], za_col, 1 ); //additional 0.5 is from eq 56 in petrone 2018 eur phys journal b "an efficient implementation of .. " + GauXC::blas::scal( nbf, 0.5*A[2*i+1], zb_col, 1 ); + + blas::axpy( nbf, B[i*6], bf_x_col, 1, za_col, 1 ); + blas::axpy( nbf, B[i*6+1], bf_y_col, 1, za_col, 1 ); + blas::axpy( nbf, B[i*6+2], bf_z_col, 1, za_col, 1 ); + + blas::axpy( nbf, B[i*6+3], bf_x_col, 1, zb_col, 1 ); + blas::axpy( nbf, B[i*6+4], bf_y_col, 1, zb_col, 1 ); + blas::axpy( nbf, B[i*6+5], bf_z_col, 1, zb_col, 1 ); + + } +} + + +void ReferenceLocalHostWorkDriver::eval_zmat_gga_vxc_uks_ts( size_t npts, size_t nbf, + const double* vrho, const double* vgamma, const double* basis_eval, + const double* dbasis_x_eval, const double* dbasis_y_eval, + const double* dbasis_z_eval, const double* dden_x_eval, + const double* dden_y_eval, const double* dden_z_eval, double* Za, + size_t ldza, double* Zb, size_t ldzb ) { + + + if( ldza != nbf ) GAUXC_GENERIC_EXCEPTION(std::string("Invalid Dims")); + if( ldzb != nbf ) GAUXC_GENERIC_EXCEPTION(std::string("Invalid Dims")); + blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Za, ldza); + blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Zb, ldzb); + + for( int32_t i = 0; i < (int32_t)npts; ++i ) { + + const int32_t ioff = i * nbf; + + auto* za_col = Za + ioff; + auto* zb_col = Zb + ioff; + auto* bf_x_col = dbasis_x_eval + ioff; + auto* bf_y_col = dbasis_y_eval + ioff; + auto* bf_z_col = dbasis_z_eval + ioff; + + GauXC::blas::scal( nbf, 0.5*vrho[2*i], za_col, 1 ); //additional 0.5 is from eq 56 in petrone 2018 eur phys journal b "an efficient implementation of .. 
" + GauXC::blas::scal( nbf, 0.5*vrho[2*i+1], zb_col, 1 ); + + const auto gga_fact_aa = vgamma[3*i]; + const auto gga_fact_ab = vgamma[3*i+1]; + const auto gga_fact_bb = vgamma[3*i+2]; + + // dden_x_eval, dden_y_eval, dden_z_eval are all still in Pauli representation + // so we need to convert them to the two spinor representation + const auto dden_x_eval_a = 0.5 * (dden_x_eval[2*i] + dden_x_eval[2*i+1]); + const auto dden_x_eval_b = 0.5 * (dden_x_eval[2*i] - dden_x_eval[2*i+1]); + const auto dden_y_eval_a = 0.5 * (dden_y_eval[2*i] + dden_y_eval[2*i+1]); + const auto dden_y_eval_b = 0.5 * (dden_y_eval[2*i] - dden_y_eval[2*i+1]); + const auto dden_z_eval_a = 0.5 * (dden_z_eval[2*i] + dden_z_eval[2*i+1]); + const auto dden_z_eval_b = 0.5 * (dden_z_eval[2*i] - dden_z_eval[2*i+1]); + + const auto x_fact_a = 2 * gga_fact_aa * dden_x_eval_a + gga_fact_ab * dden_x_eval_b; + const auto y_fact_a = 2 * gga_fact_aa * dden_y_eval_a + gga_fact_ab * dden_y_eval_b; + const auto z_fact_a = 2 * gga_fact_aa * dden_z_eval_a + gga_fact_ab * dden_z_eval_b; + + const auto x_fact_b = 2 * gga_fact_bb * dden_x_eval_b + gga_fact_ab * dden_x_eval_a; + const auto y_fact_b = 2 * gga_fact_bb * dden_y_eval_b + gga_fact_ab * dden_y_eval_a; + const auto z_fact_b = 2 * gga_fact_bb * dden_z_eval_b + gga_fact_ab * dden_z_eval_a; + + blas::axpy( nbf, x_fact_a, bf_x_col, 1, za_col, 1 ); + blas::axpy( nbf, y_fact_a, bf_y_col, 1, za_col, 1 ); + blas::axpy( nbf, z_fact_a, bf_z_col, 1, za_col, 1 ); + + blas::axpy( nbf, x_fact_b, bf_x_col, 1, zb_col, 1 ); + blas::axpy( nbf, y_fact_b, bf_y_col, 1, zb_col, 1 ); + blas::axpy( nbf, z_fact_b, bf_z_col, 1, zb_col, 1 ); + + } +} + +void ReferenceLocalHostWorkDriver::eval_zmat_mgga_vxc_uks_ts( size_t npts, size_t nbf, + const double* vrho, const double* vgamma, const double* vlapl, + const double* basis_eval, + const double* dbasis_x_eval, const double* dbasis_y_eval, + const double* dbasis_z_eval, const double* lbasis_eval, + const double* dden_x_eval, + const double* dden_y_eval, const double* dden_z_eval, double* Za, + size_t ldza, double* Zb, size_t ldzb ) { + + if( ldza != nbf ) GAUXC_GENERIC_EXCEPTION(std::string("Invalid Dims")); + if( ldzb != nbf ) GAUXC_GENERIC_EXCEPTION(std::string("Invalid Dims")); + blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Za, ldza); + blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Zb, ldzb); + + for( int32_t i = 0; i < (int32_t)npts; ++i ) { + + const int32_t ioff = i * nbf; + + auto* za_col = Za + ioff; + auto* zb_col = Zb + ioff; + auto* bf_x_col = dbasis_x_eval + ioff; + auto* bf_y_col = dbasis_y_eval + ioff; + auto* bf_z_col = dbasis_z_eval + ioff; + auto* lbf_col = lbasis_eval + ioff; + + GauXC::blas::scal( nbf, 0.5*vrho[2*i], za_col, 1 ); //additional 0.5 is from eq 56 in petrone 2018 eur phys journal b "an efficent implementation of .. 
" + GauXC::blas::scal( nbf, 0.5*vrho[2*i+1], zb_col, 1 ); + + // dden_x_eval, dden_y_eval, dden_z_eval are all still in Pauli representation + // so we need to convert them to the two spinor representation + const auto dden_x_eval_a = 0.5 * (dden_x_eval[2*i] + dden_x_eval[2*i+1]); + const auto dden_x_eval_b = 0.5 * (dden_x_eval[2*i] - dden_x_eval[2*i+1]); + const auto dden_y_eval_a = 0.5 * (dden_y_eval[2*i] + dden_y_eval[2*i+1]); + const auto dden_y_eval_b = 0.5 * (dden_y_eval[2*i] - dden_y_eval[2*i+1]); + const auto dden_z_eval_a = 0.5 * (dden_z_eval[2*i] + dden_z_eval[2*i+1]); + const auto dden_z_eval_b = 0.5 * (dden_z_eval[2*i] - dden_z_eval[2*i+1]); + + const auto gga_fact_aa = vgamma[3*i]; + const auto gga_fact_ab = vgamma[3*i+1]; + const auto gga_fact_bb = vgamma[3*i+2]; + + const auto x_fact_a = 2 * gga_fact_aa * dden_x_eval_a + gga_fact_ab * dden_x_eval_b; + const auto y_fact_a = 2 * gga_fact_aa * dden_y_eval_a + gga_fact_ab * dden_y_eval_b; + const auto z_fact_a = 2 * gga_fact_aa * dden_z_eval_a + gga_fact_ab * dden_z_eval_b; + + const auto x_fact_b = 2 * gga_fact_bb * dden_x_eval_b + gga_fact_ab * dden_x_eval_a; + const auto y_fact_b = 2 * gga_fact_bb * dden_y_eval_b + gga_fact_ab * dden_y_eval_a; + const auto z_fact_b = 2 * gga_fact_bb * dden_z_eval_b + gga_fact_ab * dden_z_eval_a; + + blas::axpy( nbf, x_fact_a, bf_x_col, 1, za_col, 1 ); + blas::axpy( nbf, y_fact_a, bf_y_col, 1, za_col, 1 ); + blas::axpy( nbf, z_fact_a, bf_z_col, 1, za_col, 1 ); + + blas::axpy( nbf, x_fact_b, bf_x_col, 1, zb_col, 1 ); + blas::axpy( nbf, y_fact_b, bf_y_col, 1, zb_col, 1 ); + blas::axpy( nbf, z_fact_b, bf_z_col, 1, zb_col, 1 ); + + if (vlapl != nullptr) { + blas::axpy( nbf, vlapl[2*i], lbf_col, 1, za_col, 1); + blas::axpy( nbf, vlapl[2*i + 1], lbf_col, 1, zb_col, 1); + } + + } +} +void ReferenceLocalHostWorkDriver::eval_mmat_mgga_vxc_uks_ts(size_t npts, size_t nbf, + const double* vtau, const double* vlapl, + const double* dbasis_x_eval, const double* dbasis_y_eval, + const double* dbasis_z_eval, + double* mmat_xa, double* mmat_ya, double* mmat_za, size_t ldma, + double* mmat_xb, double* mmat_yb, double* mmat_zb, size_t ldmb) { + + if( ldma != nbf ) GAUXC_GENERIC_EXCEPTION(std::string("Invalid Dims")); + if( ldmb != nbf ) GAUXC_GENERIC_EXCEPTION(std::string("Invalid Dims")); + + blas::lacpy( 'A', nbf, npts, dbasis_x_eval, nbf, mmat_xa, ldma); + blas::lacpy( 'A', nbf, npts, dbasis_y_eval, nbf, mmat_ya, ldma); + blas::lacpy( 'A', nbf, npts, dbasis_z_eval, nbf, mmat_za, ldma); + blas::lacpy( 'A', nbf, npts, dbasis_x_eval, nbf, mmat_xb, ldmb); + blas::lacpy( 'A', nbf, npts, dbasis_y_eval, nbf, mmat_yb, ldmb); + blas::lacpy( 'A', nbf, npts, dbasis_z_eval, nbf, mmat_zb, ldmb); + + for( int32_t i = 0; i < (int32_t)npts; ++i ) { + + const int32_t ioff = i * nbf; + auto* xa_col = mmat_xa + ioff; + auto* ya_col = mmat_ya + ioff; + auto* za_col = mmat_za + ioff; + auto* xb_col = mmat_xb + ioff; + auto* yb_col = mmat_yb + ioff; + auto* zb_col = mmat_zb + ioff; + auto* bf_x_col = dbasis_x_eval + ioff; + auto* bf_y_col = dbasis_y_eval + ioff; + auto* bf_z_col = dbasis_z_eval + ioff; + + const auto tfacta = 0.25 * vtau[2*i]; + const auto tfactb = 0.25 * vtau[2*i+1]; + + blas::scal( nbf, tfacta, xa_col, 1); + blas::scal( nbf, tfacta, ya_col, 1); + blas::scal( nbf, tfacta, za_col, 1); + blas::scal( nbf, tfactb, xb_col, 1); + blas::scal( nbf, tfactb, yb_col, 1); + blas::scal( nbf, tfactb, zb_col, 1); + + if ( vlapl != nullptr ) { + const auto lfacta = vlapl[2*i]; + const auto lfactb = vlapl[2*i+1]; + blas::axpy( 
nbf, lfacta, bf_x_col, 1, xa_col, 1); + blas::axpy( nbf, lfacta, bf_y_col, 1, ya_col, 1); + blas::axpy( nbf, lfacta, bf_z_col, 1, za_col, 1); + blas::axpy( nbf, lfactb, bf_x_col, 1, xb_col, 1); + blas::axpy( nbf, lfactb, bf_y_col, 1, yb_col, 1); + blas::axpy( nbf, lfactb, bf_z_col, 1, zb_col, 1); + } + + } +} + + + + + + // Increment VXC by Z void ReferenceLocalHostWorkDriver::inc_vxc( size_t npts, size_t nbf, size_t nbe, const double* basis_eval, const submat_map_t& submat_map, const double* Z, diff --git a/src/xc_integrator/local_work_driver/host/reference_local_host_work_driver.hpp b/src/xc_integrator/local_work_driver/host/reference_local_host_work_driver.hpp index 1f0d730b..3560b85b 100644 --- a/src/xc_integrator/local_work_driver/host/reference_local_host_work_driver.hpp +++ b/src/xc_integrator/local_work_driver/host/reference_local_host_work_driver.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -30,6 +34,9 @@ struct ReferenceLocalHostWorkDriver : public detail::LocalHostWorkDriverPIMPL { void partition_weights( XCWeightAlg weight_alg, const Molecule& mol, const MolMeta& meta, task_iterator task_begin, task_iterator task_end ) override; + void eval_weight_1st_deriv_contracted( XCWeightAlg weight_alg, const Molecule& mol, + const MolMeta& meta, const XCTask& task, const double* w_times_f, double* exc_grad_w ) override; + void eval_collocation( size_t npts, size_t nshells, size_t nbe, const double* pts, const BasisSet& basis, const int32_t* shell_list, double* basis_eval ) override; @@ -174,6 +181,61 @@ struct ReferenceLocalHostWorkDriver : public detail::LocalHostWorkDriverPIMPL { const double* basis_eval, const submat_map_t& submat_map, const double* Z, size_t ldz, double* VXC, size_t ldvxc, double* scr ) override; + + void eval_tmat_lda_vxc_rks( size_t npts, const double* v2rho2, const double* tden_eval, double* A) override; + void eval_tmat_lda_vxc_uks( size_t npts, const double* v2rho2, const double* trho, double* tmat) override; + + void eval_tmat_gga_vxc_rks( size_t npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2gamma2, + const double* tden_eval, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B ) override; + void eval_tmat_gga_vxc_uks( size_t npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2gamma2, + const double* trho, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B ) override; + + void eval_tmat_mgga_vxc_rks( size_t npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2rholapl, const double* v2rhotau, + const double* v2gamma2, const double* v2gammalapl, const double* v2gammatau, + const double* v2lapl2, const double* v2lapltau, const double* v2tau2, + const double* tden_eval, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, const double* ttau, + const double* 
dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B, double* C) override; + void eval_tmat_mgga_vxc_uks( size_t npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2rholapl, const double* v2rhotau, + const double* v2gamma2, const double* v2gammalapl, const double* v2gamma_tau, + const double* v2lapl2, const double* v2tau_lapl, const double* v2tau2, + const double* trho, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, const double* ttau, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B, double* C) override; + + void eval_zmat_lda_vxc_uks_ts( size_t npts, size_t nbe, const double* vrho, + const double* basis_eval, double* Za, size_t ldza, double* Zb, size_t ldzb ) override; + void eval_Bvec_gga_vxc_uks_ts( size_t npts, const double* vgamma, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* B ) override; + void eval_zmat_gga_vxc_uks_ts( size_t npts, size_t nbf, const double* A, const double* B, const double* basis_eval, + const double* dbasis_x_eval, const double* dbasis_y_eval, const double* dbasis_z_eval, + double* Za, size_t ldza, double* Zb, size_t ldzb ) override; + void eval_Bvec_gga_vxc_rks_ts( size_t npts, const double* vgamma, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* B ) override; + void eval_zmat_gga_vxc_rks_ts( size_t npts, size_t nbf, const double* A, const double* B, const double* basis_eval, + const double* dbasis_x_eval, const double* dbasis_y_eval, const double* dbasis_z_eval, + double* Z, size_t ldz ) override; + + void eval_zmat_gga_vxc_uks_ts( size_t npts, size_t nbe, const double* vrho, + const double* vgamma, const double* basis_eval, const double* dbasis_x_eval, + const double* dbasis_y_eval, const double* dbasis_z_eval, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, + double* Za, size_t ldza, double* Zb, size_t ldzb ) override; + void eval_zmat_mgga_vxc_uks_ts( size_t npts, size_t nbe, const double* vrho, + const double* vgamma, const double* vlapl, + const double* basis_eval, const double* dbasis_x_eval, const double* dbasis_y_eval, + const double* dbasis_z_eval, const double* lbasis_eval, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, + double* Za, size_t ldza, double* Zb, size_t ldzb ) override; + void eval_mmat_mgga_vxc_uks_ts( size_t npts, size_t nbe, const double* vtau, + const double* vlapl, const double* dbasis_x_eval, const double* dbasis_y_eval, + const double* dbasis_z_eval, double* mmat_xs, double* mmat_ys, double* mmat_zs, + size_t ldms, double* mmat_xz, double* mmat_yz, double* mmat_zz, size_t ldmz ) override; }; + } diff --git a/src/xc_integrator/local_work_driver/host/rys/CMakeLists.txt b/src/xc_integrator/local_work_driver/host/rys/CMakeLists.txt index c2a15705..00cd6536 100644 --- a/src/xc_integrator/local_work_driver/host/rys/CMakeLists.txt +++ b/src/xc_integrator/local_work_driver/host/rys/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. 
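Read back from eval_tmat_gga_vxc_rks earlier in this patch, the A and B intermediates implement the following kernel contraction with a trial density ρ^t (here f_{ρρ} = v2rho2, f_{ργ} = v2rhogamma, f_{γγ} = v2gamma2, v_γ = vgamma); this is an editorial restatement for orientation:

\[
  \gamma^t_i = \nabla\rho_i \cdot \nabla\rho^t_i, \qquad
  A_i = f_{\rho\rho}\,\rho^t_i + 2 f_{\rho\gamma}\,\gamma^t_i,
\]
\[
  \mathbf B_i = 2\left( f_{\rho\gamma}\,\rho^t_i + 2 f_{\gamma\gamma}\,\gamma^t_i \right)\nabla\rho_i
              + 2\,v_\gamma\,\nabla\rho^t_i ,
\]

with the UKS and meta-GGA variants carrying the analogous spin-resolved and τ-dependent terms.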
# # See LICENSE.txt for details # diff --git a/src/xc_integrator/local_work_driver/host/rys/cheby_boys.cxx b/src/xc_integrator/local_work_driver/host/rys/cheby_boys.cxx index a0b3a95d..0877f2e4 100644 --- a/src/xc_integrator/local_work_driver/host/rys/cheby_boys.cxx +++ b/src/xc_integrator/local_work_driver/host/rys/cheby_boys.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/rys/scripts/rys_kernel_template.hpp b/src/xc_integrator/local_work_driver/host/rys/scripts/rys_kernel_template.hpp index 021f4e81..f9a76814 100644 --- a/src/xc_integrator/local_work_driver/host/rys/scripts/rys_kernel_template.hpp +++ b/src/xc_integrator/local_work_driver/host/rys/scripts/rys_kernel_template.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/util.hpp b/src/xc_integrator/local_work_driver/host/util.hpp index f3de07d0..269234c2 100644 --- a/src/xc_integrator/local_work_driver/host/util.hpp +++ b/src/xc_integrator/local_work_driver/host/util.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/replicated/CMakeLists.txt b/src/xc_integrator/replicated/CMakeLists.txt index b8d12995..4b242ec5 100644 --- a/src/xc_integrator/replicated/CMakeLists.txt +++ b/src/xc_integrator/replicated/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/xc_integrator/replicated/device/CMakeLists.txt b/src/xc_integrator/replicated/device/CMakeLists.txt index 0d789eff..9271fc73 100644 --- a/src/xc_integrator/replicated/device/CMakeLists.txt +++ b/src/xc_integrator/replicated/device/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. 
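The _ts kernels above repeatedly convert quantities stored in the Pauli layout (scalar, z) into spin-up/spin-down components via the same two lines of arithmetic. A tiny helper equivalent to those inline expressions, shown only to make the convention explicit (the helper itself is not part of the patch):

```cpp
// Pauli -> two-spinor components, as used throughout the *_uks_ts kernels:
//   s = rho_a + rho_b (scalar),  z = rho_a - rho_b (z component)
inline void pauli_to_spinor( double s, double z, double& a, double& b ) {
  a = 0.5 * ( s + z );  // spin-up (alpha) component
  b = 0.5 * ( s - z );  // spin-down (beta) component
}
```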
# # See LICENSE.txt for details # diff --git a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator.cxx b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator.cxx index c39632c0..ff64d58a 100644 --- a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator.cxx +++ b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -10,6 +14,8 @@ #include "incore_replicated_xc_device_integrator_exc_vxc.hpp" #include "incore_replicated_xc_device_integrator_exc_grad.hpp" #include "incore_replicated_xc_device_integrator_exx.hpp" +#include "incore_replicated_xc_device_integrator_fxc_contraction.hpp" +#include "incore_replicated_xc_device_integrator_dd.hpp" namespace GauXC { namespace detail { diff --git a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator.hpp b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator.hpp index 30403175..30ff47ce 100644 --- a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator.hpp +++ b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
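The device gradient entry points below now take an IntegratorSettingsXC argument. A minimal sketch of toggling the new weight-derivative contribution, assuming IntegratorSettingsEXC_GRAD lives in the GauXC namespace and that the caller holds the concrete integrator; only the member name include_weight_derivatives and the eval_exc_grad_ signatures come from this patch.

```cpp
// Illustrative only; how the integrator instance is obtained is outside this patch.
template <class Integrator>
void gradient_with_weight_derivatives( Integrator& integrator, int64_t nbf,
                                       const double* Ps, const double* Pz,
                                       double* EXC_GRAD ) {
  GauXC::IntegratorSettingsEXC_GRAD settings;
  settings.include_weight_derivatives = true;  // also accumulate f_i * d(w_i)/dR

  if( Pz ) integrator.eval_exc_grad_( nbf, nbf, Ps, nbf, Pz, nbf, EXC_GRAD, settings ); // UKS
  else     integrator.eval_exc_grad_( nbf, nbf, Ps, nbf, EXC_GRAD, settings );          // RKS
}
```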
* * See LICENSE.txt for details */ @@ -70,13 +74,36 @@ class IncoreReplicatedXCDeviceIntegrator : value_type* EXC, const IntegratorSettingsXC& settings ) override; - void eval_exc_grad_( int64_t m, int64_t n, const value_type* P, - int64_t ldp, value_type* EXC_GRAD ) override; + void eval_exc_grad_( int64_t m, int64_t n, const value_type* P, int64_t ldp, + value_type* EXC_GRAD, const IntegratorSettingsXC& settings ) override; + void eval_exc_grad_( int64_t m, int64_t n, const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, value_type* EXC_GRAD, const IntegratorSettingsXC& settings ) override; void eval_exx_( int64_t m, int64_t n, const value_type* P, int64_t ldp, value_type* K, int64_t ldk, const IntegratorSettingsEXX& settings ) override; + void eval_fxc_contraction_( int64_t m, int64_t n, + const value_type* P, int64_t ldp, + const value_type* tP, int64_t ldtp, + value_type* FXC, int64_t ldfxc, + const IntegratorSettingsXC& ks_settings ) override; + + void eval_fxc_contraction_( int64_t m, int64_t n, + const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, + const value_type* tPs, int64_t ldtps, + const value_type* tPz, int64_t ldtpz, + value_type* FXCs, int64_t ldfxcs, + value_type* FXCz, int64_t ldfxcz, + const IntegratorSettingsXC& ks_settings ) override; + + void eval_dd_psi_( int64_t m, int64_t n, const value_type* P, + int64_t ldp, unsigned max_Ylm, value_type* ddPsi, + int64_t ldPsi ) override; + + void eval_dd_psi_potential_( int64_t m, int64_t n, const value_type* X, + unsigned max_Ylm, value_type* Vddx ) override; void integrate_den_local_work_( const basis_type& basis, const value_type* P, int64_t ldp, value_type *N_EL, @@ -102,14 +129,32 @@ class IncoreReplicatedXCDeviceIntegrator : host_task_iterator task_begin, host_task_iterator task_end, XCDeviceData& device_data ); - void eval_exc_grad_local_work_( const basis_type& basis, const value_type* P, int64_t ldp, + void fxc_contraction_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, + const value_type* tPs, int64_t ldtps, + const value_type* tPz, int64_t ldtpz, + host_task_iterator task_begin, host_task_iterator task_end, + XCDeviceData& device_data); + + void fxc_contraction_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, + const value_type* tPs, int64_t ldtps, + const value_type* tPz, int64_t ldtpz, + value_type *N_EL, + value_type* FXCs, int64_t ldfxcs, + value_type* FXCz, int64_t ldfxcz, + host_task_iterator task_begin, host_task_iterator task_end, + XCDeviceData& device_data ); + + void eval_exc_grad_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, host_task_iterator task_begin, host_task_iterator task_end, - XCDeviceData& device_data ); + XCDeviceData& device_data, const IntegratorSettingsXC& settings ); - void eval_exc_grad_local_work_( const basis_type& basis, const value_type* P, - int64_t ldp, value_type* EXC_GRAD, + void eval_exc_grad_local_work_( const basis_type& basis, const value_type* P, int64_t ldp, + const value_type* Pz, int64_t ldpz, value_type* EXC_GRAD, host_task_iterator task_begin, host_task_iterator task_end, - XCDeviceData& device_data ); + XCDeviceData& device_data, const IntegratorSettingsXC& settings ); diff --git a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_dd.hpp 
b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_dd.hpp new file mode 100644 index 00000000..4898fa0c --- /dev/null +++ b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_dd.hpp @@ -0,0 +1,35 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#pragma once +#include "incore_replicated_xc_device_integrator.hpp" +#include +#include + +namespace GauXC::detail { + + template + void IncoreReplicatedXCDeviceIntegrator:: + eval_dd_psi_( int64_t m, int64_t n, const value_type* P, + int64_t ldp, unsigned max_Ylm, value_type* ddPsi, int64_t ldPsi ) { + GAUXC_GENERIC_EXCEPTION("Device DD-PSI NYI"); + util::unused(m,n,P,ldp,max_Ylm,ddPsi,ldPsi); + } + + template + void IncoreReplicatedXCDeviceIntegrator:: + eval_dd_psi_potential_( int64_t m, int64_t n, const value_type* X, + unsigned max_Ylm, value_type* Vddx ) { + GAUXC_GENERIC_EXCEPTION("Device DD-PHIX NYI"); + util::unused(m,n,X,max_Ylm,Vddx); + } + +} diff --git a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc.hpp b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc.hpp index a8c32da3..9a2a7cf4 100644 --- a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc.hpp +++ b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_grad.hpp b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_grad.hpp index a49eee7f..6c030bc2 100644 --- a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_grad.hpp +++ b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_grad.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
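The device EXC-gradient path that follows splits the nuclear derivative into an integrand term and, when include_weight_derivatives is set, a quadrature-weight term; schematically (editorial restatement, with w_i the partition weight of grid point i and f_i the corresponding XC integrand):

\[
  \frac{\partial E_{xc}}{\partial \mathbf R_B}
  \;=\; \sum_i w_i\,\frac{\partial f_i}{\partial \mathbf R_B}
  \;+\; \sum_i f_i\,\frac{\partial w_i}{\partial \mathbf R_B},
\]

where the second sum is what lwd->eval_weight_1st_deriv_contracted( &device_data, weight_alg ) evaluates on the device, mirroring the host reference routines added earlier in this patch.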
* * See LICENSE.txt for details */ @@ -17,7 +21,7 @@ namespace detail { template void IncoreReplicatedXCDeviceIntegrator:: eval_exc_grad_( int64_t m, int64_t n, const value_type* P, - int64_t ldp, value_type* EXC_GRAD ) { + int64_t ldp, value_type* EXC_GRAD, const IntegratorSettingsXC& settings) { const auto& basis = this->load_balancer_->basis(); @@ -49,8 +53,65 @@ void IncoreReplicatedXCDeviceIntegrator:: // Compute local contributions to EXC Gradient and retrieve // data from device this->timer_.time_op("XCIntegrator.LocalWork", [&](){ - eval_exc_grad_local_work_( basis, P, ldp, EXC_GRAD, tasks.begin(), - tasks.end(), *device_data_ptr ); + eval_exc_grad_local_work_( basis, P, ldp, nullptr, 0, EXC_GRAD, tasks.begin(), + tasks.end(), *device_data_ptr, settings ); + }); + + GAUXC_MPI_CODE( + this->timer_.time_op("XCIntegrator.ImbalanceWait",[&](){ + MPI_Barrier(this->load_balancer_->runtime().comm()); + }); + ) + + this->timer_.time_op("XCIntegrator.Allreduce", [&](){ + this->reduction_driver_->allreduce_inplace( EXC_GRAD, 3*natoms, + ReductionOp::Sum ); + }); + + } + +} + + +template +void IncoreReplicatedXCDeviceIntegrator:: + eval_exc_grad_( int64_t m, int64_t n, const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, value_type* EXC_GRAD, const IntegratorSettingsXC& settings ) { + + const auto& basis = this->load_balancer_->basis(); + + // Check that P is sane + const int64_t nbf = basis.nbf(); + if( m != n ) + GAUXC_GENERIC_EXCEPTION("P Must Be Square"); + if( m != nbf ) + GAUXC_GENERIC_EXCEPTION("P Must Have Same Dimension as Basis"); + if( ldps < nbf ) + GAUXC_GENERIC_EXCEPTION("Invalid LDPS"); + if( ldpz < nbf ) + GAUXC_GENERIC_EXCEPTION("Invalid LDPZ"); + + // Get Tasks + auto& tasks = this->load_balancer_->get_tasks(); + + // Allocate Device memory + auto* lwd = dynamic_cast(this->local_work_driver_.get() ); + auto rt = detail::as_device_runtime(this->load_balancer_->runtime()); + auto device_data_ptr = + this->timer_.time_op("XCIntegrator.DeviceAlloc", + [&](){ return lwd->create_device_data(rt); }); + + const auto& mol = this->load_balancer_->molecule(); + const auto natoms = mol.size(); + if( this->reduction_driver_->takes_device_memory() ) { + GAUXC_GENERIC_EXCEPTION("Device Reduction + EXC Grad NYI"); + } else { + + // Compute local contributions to EXC Gradient and retrieve + // data from device + this->timer_.time_op("XCIntegrator.LocalWork", [&](){ + eval_exc_grad_local_work_( basis, Ps, ldps, Pz, ldpz, EXC_GRAD, tasks.begin(), + tasks.end(), *device_data_ptr, settings ); }); GAUXC_MPI_CODE( @@ -71,15 +132,25 @@ void IncoreReplicatedXCDeviceIntegrator:: template void IncoreReplicatedXCDeviceIntegrator:: eval_exc_grad_local_work_( const basis_type& basis, - const value_type* P, int64_t ldp, + const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, host_task_iterator task_begin, host_task_iterator task_end, - XCDeviceData& device_data ) { + XCDeviceData& device_data, const IntegratorSettingsXC& settings ) { + + const bool is_uks = Pz != nullptr; + const bool is_rks = not is_uks; auto* lwd = dynamic_cast(this->local_work_driver_.get() ); // Setup Aliases const auto& func = *this->func_; const auto& mol = this->load_balancer_->molecule(); + const auto& meta = this->load_balancer_->molmeta(); + + // Sanity gates + if(func.needs_laplacian()) { + GAUXC_GENERIC_EXCEPTION("Device EXC Gradients + Laplacian Dependent MGGAs Not Yet Implemented"); + } // Get basis map BasisSetMap basis_map(basis,mol); @@ -93,33 +164,47 @@ void 
IncoreReplicatedXCDeviceIntegrator:: }; std::sort( task_begin, task_end, task_comparator ); - - + // Misc KS settings + IntegratorSettingsEXC_GRAD exc_grad_settings; + if( auto* tmp = dynamic_cast(&settings) ) { + exc_grad_settings = *tmp; + } // Check that Partition Weights have been calculated auto& lb_state = this->load_balancer_->state(); if( not lb_state.modified_weights_are_stored ) { GAUXC_GENERIC_EXCEPTION("Weights Have Not Been Modified"); } + XCWeightAlg& weight_alg = lb_state.weight_alg; + + + // Processes batches in groups that saturadate available device memory + integrator_term_tracker enabled_terms; + enabled_terms.exc_grad = true; + enabled_terms.weights = true; + + if (is_rks) enabled_terms.ks_scheme = RKS; + else if (is_uks) enabled_terms.ks_scheme = UKS; + + if( func.is_lda() ) enabled_terms.xc_approx = integrator_xc_approx::LDA; + else if( func.is_gga() ) enabled_terms.xc_approx = integrator_xc_approx::GGA; + else if( func.needs_laplacian() ) enabled_terms.xc_approx = integrator_xc_approx::MGGA_LAPL; + else enabled_terms.xc_approx = integrator_xc_approx::MGGA_TAU; // Do XC integration in task batches const auto nbf = basis.nbf(); const auto nshells = basis.nshells(); const auto natoms = mol.size(); device_data.reset_allocations(); - device_data.allocate_static_data_exc_grad( nbf, nshells, natoms ); - device_data.send_static_data_density_basis( P, ldp, nullptr, 0, nullptr, 0, nullptr, 0, basis ); + device_data.allocate_static_data_exc_grad( nbf, nshells, natoms, enabled_terms ); + device_data.send_static_data_density_basis( Ps, ldps, Pz, ldpz, nullptr, 0, nullptr, 0, basis ); + // for weight contribution + device_data.allocate_static_data_weights( natoms ); + device_data.send_static_data_weights( mol, meta ); // Zero integrands device_data.zero_exc_grad_integrands(); - // Processes batches in groups that saturadate available device memory - integrator_term_tracker enabled_terms; - enabled_terms.exc_grad = true; - enabled_terms.ks_scheme = RKS; - if( func.is_lda() ) enabled_terms.xc_approx = integrator_xc_approx::LDA; - else if( func.is_gga() ) enabled_terms.xc_approx = integrator_xc_approx::GGA; - else GAUXC_GENERIC_EXCEPTION("XC Approx NYI"); auto task_it = task_begin; while( task_it != task_end ) { @@ -131,30 +216,51 @@ void IncoreReplicatedXCDeviceIntegrator:: /*** Process the batches ***/ // Evaluate collocation - if( func.is_gga() ) lwd->eval_collocation_hessian ( &device_data ); - else lwd->eval_collocation_gradient( &device_data ); - - // Evaluate X matrix - const bool do_xmat_grad = func.is_gga(); - lwd->eval_xmat( 2.0, &device_data, do_xmat_grad, DEN_S ); - - // Evaluate V variable - lwd->eval_vvar( &device_data, DEN_S, do_xmat_grad ); + if( func.needs_laplacian() ) lwd->eval_collocation_lapgrad ( &device_data ); + else if( !func.is_lda() ) lwd->eval_collocation_hessian ( &device_data ); + else lwd->eval_collocation_gradient( &device_data ); + + // Evaluate X matrix and V vars + const auto xmat_fac = is_rks ? 
2.0 : 1.0; + const auto need_lapl = func.needs_laplacian(); + const auto need_xmat_grad = not func.is_lda(); + auto do_xmat_vvar = [&](density_id den_id) { + lwd->eval_xmat( xmat_fac, &device_data, need_xmat_grad, den_id ); + if(func.is_lda()) lwd->eval_vvars_lda( &device_data, den_id ); + else if(func.is_gga()) lwd->eval_vvars_gga( &device_data, den_id ); + else lwd->eval_vvars_mgga( &device_data, den_id, need_lapl ); + + // Save XMat for EXC gradient assembly + if(is_uks) lwd->save_xmat( &device_data, need_xmat_grad, den_id ); + }; + + do_xmat_vvar(DEN_S); + if (not is_rks) { + do_xmat_vvar(DEN_Z); + } // Evaluate U variables - if( func.is_gga() ) lwd->eval_uvars_gga( &device_data, enabled_terms.ks_scheme ); - else lwd->eval_uvars_lda( &device_data, enabled_terms.ks_scheme ); + if( func.is_mgga() ) lwd->eval_uvars_mgga( &device_data, enabled_terms.ks_scheme, need_lapl ); + else if( func.is_gga() ) lwd->eval_uvars_gga ( &device_data, enabled_terms.ks_scheme ); + else lwd->eval_uvars_lda ( &device_data, enabled_terms.ks_scheme ); // Evaluate XC functional (we need VXC for EXC Gradient) - if( func.is_gga() ) lwd->eval_kern_exc_vxc_gga( func, &device_data ); - else lwd->eval_kern_exc_vxc_lda( func, &device_data ); + if( func.is_mgga() ) lwd->eval_kern_exc_vxc_mgga( func, &device_data ); + else if( func.is_gga() ) lwd->eval_kern_exc_vxc_gga ( func, &device_data ); + else lwd->eval_kern_exc_vxc_lda ( func, &device_data ); - // Do scalar N_EL integration + + // Do scalar N_EL integration lwd->inc_nel( &device_data ); // Increment EXC Gradient - if( func.is_gga() ) lwd->inc_exc_grad_gga( &device_data ); - else lwd->inc_exc_grad_lda( &device_data ); + if( func.is_mgga() ) lwd->inc_exc_grad_mgga( &device_data, enabled_terms.ks_scheme, need_lapl, exc_grad_settings.include_weight_derivatives ); + else if( func.is_gga() ) lwd->inc_exc_grad_gga ( &device_data, enabled_terms.ks_scheme, exc_grad_settings.include_weight_derivatives ); + else lwd->inc_exc_grad_lda ( &device_data, enabled_terms.ks_scheme, exc_grad_settings.include_weight_derivatives ); + + // weight contribution + if(exc_grad_settings.include_weight_derivatives) + lwd->eval_weight_1st_deriv_contracted( &device_data, weight_alg ); } // Loop over batches of batches @@ -163,12 +269,14 @@ void IncoreReplicatedXCDeviceIntegrator:: template void IncoreReplicatedXCDeviceIntegrator:: eval_exc_grad_local_work_( const basis_type& basis, - const value_type* P, int64_t ldp, value_type* EXC_GRAD, + const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, + value_type* EXC_GRAD, host_task_iterator task_begin, host_task_iterator task_end, - XCDeviceData& device_data ) { + XCDeviceData& device_data, const IntegratorSettingsXC& settings ) { // Compute XC gradient and keep data on the device - eval_exc_grad_local_work_( basis, P, ldp, task_begin, task_end, device_data ); + eval_exc_grad_local_work_( basis, Ps, ldps, Pz, ldpz, task_begin, task_end, device_data, settings ); // Receive XC gradient from host double N_EL; diff --git a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_vxc.hpp b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_vxc.hpp index c8cae61d..416a49c5 100644 --- a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_vxc.hpp +++ b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_vxc.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley 
National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -237,7 +241,7 @@ void IncoreReplicatedXCDeviceIntegrator:: const auto& func = *this->func_; const auto& mol = this->load_balancer_->molecule(); - if( func.is_mgga() and (is_uks or is_gks) ) GAUXC_GENERIC_EXCEPTION("Device + Polarized mGGAs NYI!"); + if( func.is_mgga() and is_gks ) GAUXC_GENERIC_EXCEPTION("GKS mGGAs NYI!"); // Get basis map BasisSetMap basis_map(basis,mol); @@ -310,12 +314,13 @@ void IncoreReplicatedXCDeviceIntegrator:: const double xmat_fac = is_rks ? 2.0 : 1.0; const bool need_xmat_grad = func.is_mgga(); - const bool need_vvar_grad = func.is_mgga() or func.is_gga(); // Evaluate X matrix and V vars auto do_xmat_vvar = [&](density_id den_id) { lwd->eval_xmat( xmat_fac, &device_data, need_xmat_grad, den_id ); - lwd->eval_vvar( &device_data, den_id, need_vvar_grad ); + if(func.is_lda()) lwd->eval_vvars_lda( &device_data, den_id ); + else if(func.is_gga()) lwd->eval_vvars_gga( &device_data, den_id ); + else lwd->eval_vvars_mgga( &device_data, den_id, need_lapl ); }; do_xmat_vvar(DEN_S); @@ -329,25 +334,25 @@ void IncoreReplicatedXCDeviceIntegrator:: // Evaluate U variables - if( func.is_mgga() ) lwd->eval_uvars_mgga( &device_data, need_lapl ); //<<< TODO: Fn call is different because MGGA U/GKS NYI - else if( func.is_gga() ) lwd->eval_uvars_gga( &device_data, enabled_terms.ks_scheme ); - else lwd->eval_uvars_lda( &device_data, enabled_terms.ks_scheme ); + if( func.is_mgga() ) lwd->eval_uvars_mgga( &device_data, enabled_terms.ks_scheme, need_lapl ); + else if( func.is_gga() ) lwd->eval_uvars_gga ( &device_data, enabled_terms.ks_scheme ); + else lwd->eval_uvars_lda ( &device_data, enabled_terms.ks_scheme ); // Evaluate XC functional if( func.is_mgga() ) lwd->eval_kern_exc_vxc_mgga( func, &device_data ); - else if( func.is_gga() ) lwd->eval_kern_exc_vxc_gga( func, &device_data ); - else lwd->eval_kern_exc_vxc_lda( func, &device_data ); + else if( func.is_gga() ) lwd->eval_kern_exc_vxc_gga ( func, &device_data ); + else lwd->eval_kern_exc_vxc_lda ( func, &device_data ); // Do scalar EXC/N_EL integrations lwd->inc_exc( &device_data ); lwd->inc_nel( &device_data ); - if( not do_vxc ) continue; + if( not do_vxc) continue; auto do_zmat_vxc = [&](density_id den_id) { if( func.is_mgga() ) { - lwd->eval_zmat_mgga_vxc( &device_data, need_lapl); - lwd->eval_mmat_mgga_vxc( &device_data, need_lapl); + lwd->eval_zmat_mgga_vxc( &device_data, enabled_terms.ks_scheme, need_lapl, den_id); + lwd->eval_mmat_mgga_vxc( &device_data, enabled_terms.ks_scheme, need_lapl, den_id); } else if( func.is_gga() ) lwd->eval_zmat_gga_vxc( &device_data, enabled_terms.ks_scheme, den_id ); diff --git a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exx.hpp b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exx.hpp index 1aec0077..c19f5d5b 100644 --- a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exx.hpp +++ b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exx.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. 
+ * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -130,8 +134,6 @@ void IncoreReplicatedXCDeviceIntegrator:: // Get basis map and shell pairs - //BasisSetMap basis_map(basis,mol); - //ShellPairCollection shell_pairs(basis); auto& basis_map = this->load_balancer_->basis_map(); auto& shell_pairs = this->load_balancer_->shell_pairs(); @@ -257,8 +259,8 @@ void IncoreReplicatedXCDeviceIntegrator:: GAUXC_GENERIC_EXCEPTION("Weights Have Not Been Modified"); } - task_end = std::stable_partition( task_begin, task_end, - []( const auto& t ) { return t.cou_screening.shell_list.size() > 0; } ); + task_end = std::stable_partition( task_begin, task_end, + []( const auto& t ) { return t.cou_screening.shell_list.size() > 0; } ); #if 0 // Lexicographic ordering of tasks diff --git a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_fxc_contraction.hpp b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_fxc_contraction.hpp new file mode 100644 index 00000000..ffc0ca41 --- /dev/null +++ b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_fxc_contraction.hpp @@ -0,0 +1,343 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#pragma once +#include "incore_replicated_xc_device_integrator.hpp" +#include +#include + +namespace GauXC::detail { + + template + void IncoreReplicatedXCDeviceIntegrator:: + eval_fxc_contraction_( int64_t m, int64_t n, + const value_type* P, int64_t ldp, + const value_type* tP, int64_t ldtp, + value_type* FXC, int64_t ldfxc, + const IntegratorSettingsXC& ks_settings ) { + + eval_fxc_contraction_( m, n, P, ldp, nullptr, 0, tP, ldtp, nullptr, 0, + FXC, ldfxc, nullptr, 0, ks_settings ); + } + + + template + void IncoreReplicatedXCDeviceIntegrator:: + eval_fxc_contraction_( int64_t m, int64_t n, + const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, + const value_type* tPs, int64_t ldtps, + const value_type* tPz, int64_t ldtpz, + value_type* FXCs, int64_t ldfxcs, + value_type* FXCz, int64_t ldfxcz, + const IntegratorSettingsXC& ks_settings ) { + const bool is_uks = (Pz != nullptr); + const bool is_rks = !is_uks; + + const auto& basis = this->load_balancer_->basis(); + + // Check that P / FXC are sane + const int64_t nbf = basis.nbf(); + if( m != n ) + GAUXC_GENERIC_EXCEPTION("P/FXC Must Be Square"); + if( m != nbf ) + GAUXC_GENERIC_EXCEPTION("P/FXC Must Have Same Dimension as Basis"); + if( ldps < nbf ) + GAUXC_GENERIC_EXCEPTION("Invalid LDPs"); + if( ldtps < nbf ) + GAUXC_GENERIC_EXCEPTION("Invalid LDTps"); + if( ldfxcs < nbf ) + GAUXC_GENERIC_EXCEPTION("Invalid LDFXCs"); + + if( not is_rks ) { + if( ldpz < nbf ) + GAUXC_GENERIC_EXCEPTION("Invalid LDPz"); + if( ldtpz < nbf ) + GAUXC_GENERIC_EXCEPTION("Invalid LDTpz"); + if( ldfxcz < nbf ) + GAUXC_GENERIC_EXCEPTION("Invalid LDFXCz"); + } + + // Get Tasks + auto& tasks = this->load_balancer_->get_tasks(); + + // Allocate Device memory + auto* lwd = dynamic_cast(this->local_work_driver_.get() ); + auto rt = detail::as_device_runtime(this->load_balancer_->runtime()); + auto device_data_ptr = lwd->create_device_data(rt); + + GAUXC_MPI_CODE( 
MPI_Barrier(rt.comm());) + + // Temporary electron count to judge integrator accuracy + value_type N_EL; + + if( this->reduction_driver_->takes_device_memory() ) { + + // If we can do reductions on the device (e.g. NCCL) + // Don't communicate data back to the host before reduction + this->timer_.time_op("XCIntegrator.LocalWork_FXC", [&](){ + fxc_contraction_local_work_( basis, Ps, ldps, Pz, ldpz, tPs, ldtps, tPz, ldtpz, + tasks.begin(), tasks.end(), *device_data_ptr); + }); + + GAUXC_MPI_CODE( + this->timer_.time_op("XCIntegrator.ImbalanceWait_FXC",[&](){ + MPI_Barrier(this->load_balancer_->runtime().comm()); + }); + ) + + // Reduce results in device memory + double* fxc_s_device = device_data_ptr->fxc_s_device_data(); + double* fxc_z_device; + auto nel_device = device_data_ptr->nel_device_data(); + auto queue = device_data_ptr->queue(); + + if( not is_rks ) { + fxc_z_device = device_data_ptr->fxc_z_device_data(); + // UKS + this->timer_.time_op("XCIntegrator.Allreduce_FXC", [&](){ + this->reduction_driver_->allreduce_inplace( fxc_s_device, nbf*nbf, ReductionOp::Sum, queue ); + this->reduction_driver_->allreduce_inplace( fxc_z_device, nbf*nbf, ReductionOp::Sum, queue ); + this->reduction_driver_->allreduce_inplace( nel_device, 1, ReductionOp::Sum, queue ); + }); + } else { + // RKS + this->timer_.time_op("XCIntegrator.Allreduce_FXC", [&](){ + this->reduction_driver_->allreduce_inplace( fxc_s_device, nbf*nbf, ReductionOp::Sum, queue ); + this->reduction_driver_->allreduce_inplace( nel_device, 1, ReductionOp::Sum, queue ); + }); + } + + // Retrieve data to host + this->timer_.time_op("XCIntegrator.DeviceToHostCopy_FXC",[&](){ + device_data_ptr->retrieve_fxc_contraction_integrands(&N_EL, FXCs, ldfxcs, FXCz, ldfxcz, nullptr, 0, nullptr, 0); + }); + + } else { + + // Compute local contributions to FXC and retrieve + // data from device + this->timer_.time_op("XCIntegrator.LocalWork_FXC", [&](){ + fxc_contraction_local_work_( basis, Ps, ldps, Pz, ldpz, tPs, ldtps, tPz, ldtpz, &N_EL, + FXCs, ldfxcs, FXCz, ldfxcz, tasks.begin(), tasks.end(), *device_data_ptr); + }); + + GAUXC_MPI_CODE( + this->timer_.time_op("XCIntegrator.ImbalanceWait_FXC",[&](){ + MPI_Barrier(this->load_balancer_->runtime().comm()); + }); + ) + + // Reduce Results in host mem + if( is_rks ) { + this->timer_.time_op("XCIntegrator.Allreduce_FXC", [&](){ + this->reduction_driver_->allreduce_inplace( FXCs, nbf*nbf, ReductionOp::Sum ); + this->reduction_driver_->allreduce_inplace( &N_EL, 1, ReductionOp::Sum ); + }); + } else { + // UKS + this->timer_.time_op("XCIntegrator.Allreduce_FXC", [&](){ + this->reduction_driver_->allreduce_inplace( FXCs, nbf*nbf, ReductionOp::Sum ); + this->reduction_driver_->allreduce_inplace( FXCz, nbf*nbf, ReductionOp::Sum ); + this->reduction_driver_->allreduce_inplace( &N_EL, 1, ReductionOp::Sum ); + }); + } + } + } + + template + void IncoreReplicatedXCDeviceIntegrator:: + fxc_contraction_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, + const value_type* tPs, int64_t ldtps, + const value_type* tPz, int64_t ldtpz, + host_task_iterator task_begin, host_task_iterator task_end, + XCDeviceData& device_data) { + const bool is_uks = (Pz != nullptr); + const bool is_rks = !is_uks; + if (not is_rks and not is_uks) { + GAUXC_GENERIC_EXCEPTION("MUST BE EITHER RKS OR UKS!"); + } + + + // Cast LWD to LocalDeviceWorkDriver + auto* lwd = dynamic_cast(this->local_work_driver_.get() ); + + // Setup Aliases + const auto& func = *this->func_; + const auto& 
mol = this->load_balancer_->molecule(); + + // Get basis map + BasisSetMap basis_map(basis,mol); + + // Populate submat maps + device_data.populate_submat_maps( basis.nbf(), task_begin, task_end, basis_map ); + + + // Sort tasks + auto task_comparator = []( const XCTask& a, const XCTask& b ) { + return (a.points.size() * a.bfn_screening.nbe) > (b.points.size() * b.bfn_screening.nbe); + }; + std::sort( task_begin, task_end, task_comparator ); + + + // Check that Partition Weights have been calculated + auto& lb_state = this->load_balancer_->state(); + if( not lb_state.modified_weights_are_stored ) { + GAUXC_GENERIC_EXCEPTION("Weights Have Not Been Modified"); + } + + + integrator_term_tracker enabled_terms; + enabled_terms.fxc_contraction = true; + + if (is_rks) enabled_terms.ks_scheme = RKS; + else if (is_uks) enabled_terms.ks_scheme = UKS; + + if( func.is_lda() ) + enabled_terms.xc_approx = integrator_xc_approx::LDA; + else if( func.is_gga() ) + enabled_terms.xc_approx = integrator_xc_approx::GGA; + else if( func.needs_laplacian() ) + GAUXC_GENERIC_EXCEPTION("FXC contraction does not support MGGA with Laplacian"); + else + enabled_terms.xc_approx = integrator_xc_approx::MGGA_TAU; + + // Do XC integration in task batches + const auto nbf = basis.nbf(); + const auto nshells = basis.nshells(); + device_data.reset_allocations(); + device_data.allocate_static_data_fxc_contraction( nbf, nshells, enabled_terms); + + device_data.send_static_data_density_basis( Ps, ldps, Pz, ldpz, nullptr, 0, nullptr, 0, basis ); + device_data.send_static_data_trial_density( tPs, ldtps, tPz, ldtpz, nullptr, 0, nullptr, 0 ); + + + // Zero integrands + device_data.zero_fxc_contraction_integrands(); + + + auto task_it = task_begin; + while( task_it != task_end ) { + + // Determine next task batch, send relevant data to device (FXC only) + task_it = + device_data.generate_buffers( enabled_terms, basis_map, task_it, task_end ); + + /*** Process the batches ***/ + + const bool need_lapl = func.needs_laplacian(); + // Evaluate collocation + if( func.is_mgga() ) { + if(need_lapl) lwd->eval_collocation_laplacian( &device_data ); + else lwd->eval_collocation_gradient( &device_data ); + } + else if( func.is_gga() ) lwd->eval_collocation_gradient( &device_data ); + else lwd->eval_collocation( &device_data ); + + const double xmat_fac = is_rks ? 
2.0 : 1.0; + const bool need_xmat_grad = func.is_mgga(); + + // Evaluate X matrix and V vars + auto do_xmat_vvar = [&](density_id den_id) { + lwd->eval_xmat( xmat_fac, &device_data, need_xmat_grad, den_id ); + if(func.is_lda()) lwd->eval_vvars_lda( &device_data, den_id ); + else if(func.is_gga()) lwd->eval_vvars_gga( &device_data, den_id ); + else lwd->eval_vvars_mgga( &device_data, den_id, need_lapl ); + }; + + do_xmat_vvar(DEN_S); + if (not is_rks) { + do_xmat_vvar(DEN_Z); + } + + // Evaluate U variables + if( func.is_mgga() ) lwd->eval_uvars_mgga( &device_data, enabled_terms.ks_scheme, need_lapl ); + else if( func.is_gga() ) lwd->eval_uvars_gga ( &device_data, enabled_terms.ks_scheme ); + else lwd->eval_uvars_lda ( &device_data, enabled_terms.ks_scheme ); + + // Evaluate XC functional + if( func.is_mgga() ) lwd->eval_kern_vxc_fxc_mgga( func, &device_data ); + else if( func.is_gga() ) lwd->eval_kern_vxc_fxc_gga ( func, &device_data ); + else lwd->eval_kern_vxc_fxc_lda ( func, &device_data ); + + // Do scalar N_EL integrations + lwd->inc_nel( &device_data ); + + + // Evaluate X matrix and V vars from trial density + auto do_xmat_vvar_trial = [&](density_id den_id) { + lwd->eval_xmat_trial( xmat_fac, &device_data, need_xmat_grad, den_id ); + if(func.is_lda()) lwd->eval_vvars_lda_trial( &device_data, den_id ); + else if(func.is_gga()) lwd->eval_vvars_gga_trial( &device_data, den_id ); + else lwd->eval_vvars_mgga_trial( &device_data, den_id, need_lapl ); + }; + + do_xmat_vvar_trial(DEN_S); + if (not is_rks) { + do_xmat_vvar_trial(DEN_Z); + } + + // Evaluate tmat (it contains the trial u variable evaluation inside) + if( func.is_mgga() ) lwd->eval_tmat_mgga( &device_data, enabled_terms.ks_scheme, need_lapl ); + else if( func.is_gga() ) lwd->eval_tmat_gga ( &device_data, enabled_terms.ks_scheme ); + else lwd->eval_tmat_lda ( &device_data, enabled_terms.ks_scheme ); + + auto do_zmat_fxc = [&](density_id den_id) { + if( func.is_mgga() ) { + lwd->eval_zmat_mgga_fxc( &device_data, need_lapl, den_id); + lwd->eval_mmat_mgga_fxc( &device_data, need_lapl, den_id); + } + else if( func.is_gga() ) + lwd->eval_zmat_gga_fxc( &device_data, den_id ); + else + lwd->eval_zmat_lda_fxc( &device_data, den_id ); + lwd->inc_fxc( &device_data, den_id, func.is_mgga() ); + }; + + do_zmat_fxc(DEN_S); + if(not is_rks) { + do_zmat_fxc(DEN_Z); + } + + } // Loop over batches of batches + + // Symmetrize FXC in device memory + lwd->symmetrize_fxc( &device_data, DEN_S ); + if (not is_rks) { + lwd->symmetrize_fxc( &device_data, DEN_Z ); + } + } + + template + void IncoreReplicatedXCDeviceIntegrator:: + fxc_contraction_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, + const value_type* tPs, int64_t ldtps, + const value_type* tPz, int64_t ldtpz, + value_type *N_EL, + value_type* FXCs, int64_t ldfxcs, + value_type* FXCz, int64_t ldfxcz, + host_task_iterator task_begin, host_task_iterator task_end, + XCDeviceData& device_data ) { + + // Get integrate and keep data on device + fxc_contraction_local_work_( basis, Ps, ldps, Pz, ldpz, tPs, ldtps, tPz, ldtpz, + task_begin, task_end, device_data); + auto rt = detail::as_device_runtime(this->load_balancer_->runtime()); + rt.device_backend()->master_queue_synchronize(); + + // Receive FXC terms from host + this->timer_.time_op("XCIntegrator.DeviceToHostCopy_FXC",[&](){ + device_data.retrieve_fxc_contraction_integrands( N_EL, FXCs, ldfxcs, FXCz, ldfxcz, nullptr, 0, nullptr, 0 ); + }); + } +} diff --git 
a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_integrate_den.hpp b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_integrate_den.hpp index 764bf3c0..d7f224c1 100644 --- a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_integrate_den.hpp +++ b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_integrate_den.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -130,7 +134,7 @@ void IncoreReplicatedXCDeviceIntegrator:: // Evaluate the density const bool do_vvar_grad = false; - lwd->eval_vvar( &device_data, DEN_S, do_vvar_grad ); + lwd->eval_vvars_lda( &device_data, DEN_S ); // Do scalar N_EL integration lwd->inc_nel( &device_data ); diff --git a/src/xc_integrator/replicated/device/replicated_xc_device_integrator.cxx b/src/xc_integrator/replicated/device/replicated_xc_device_integrator.cxx index 605de1c6..082b74ea 100644 --- a/src/xc_integrator/replicated/device/replicated_xc_device_integrator.cxx +++ b/src/xc_integrator/replicated/device/replicated_xc_device_integrator.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/replicated/device/shell_batched_replicated_xc_device_integrator.cxx b/src/xc_integrator/replicated/device/shell_batched_replicated_xc_device_integrator.cxx index 1ff70ee9..febcd7aa 100644 --- a/src/xc_integrator/replicated/device/shell_batched_replicated_xc_device_integrator.cxx +++ b/src/xc_integrator/replicated/device/shell_batched_replicated_xc_device_integrator.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -11,6 +15,9 @@ #include "shell_batched_replicated_xc_integrator_exc_vxc.hpp" #include "shell_batched_replicated_xc_integrator_exc_grad.hpp" #include "shell_batched_replicated_xc_integrator_exx.hpp" +#include "shell_batched_replicated_xc_integrator_fxc_contraction.hpp" +#include "shell_batched_replicated_xc_integrator_dd_psi.hpp" +#include "shell_batched_replicated_xc_integrator_dd_psi_potential.hpp" namespace GauXC { namespace detail { diff --git a/src/xc_integrator/replicated/device/shell_batched_replicated_xc_device_integrator.hpp b/src/xc_integrator/replicated/device/shell_batched_replicated_xc_device_integrator.hpp index dd165f64..38c8efdc 100644 --- a/src/xc_integrator/replicated/device/shell_batched_replicated_xc_device_integrator.hpp +++ b/src/xc_integrator/replicated/device/shell_batched_replicated_xc_device_integrator.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/replicated/host/CMakeLists.txt b/src/xc_integrator/replicated/host/CMakeLists.txt index ae47dc6d..2b878b68 100644 --- a/src/xc_integrator/replicated/host/CMakeLists.txt +++ b/src/xc_integrator/replicated/host/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator.cxx b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator.cxx index 731eaf84..6695d912 100644 --- a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator.cxx +++ b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -10,7 +14,10 @@ #include "reference_replicated_xc_host_integrator_exc_vxc.hpp" #include "reference_replicated_xc_host_integrator_exc_grad.hpp" #include "reference_replicated_xc_host_integrator_exx.hpp" - +#include "reference_replicated_xc_host_integrator_fxc_contraction.hpp" +#include "reference_replicated_xc_host_integrator_dd_psi.hpp" +#include "reference_replicated_xc_host_integrator_dd_psi_potential.hpp" + namespace GauXC::detail { template diff --git a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator.hpp b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator.hpp index bf5d4d61..a32748eb 100644 --- a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator.hpp +++ b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -72,15 +76,40 @@ class ReferenceReplicatedXCHostIntegrator : /// RKS EXC Gradient - void eval_exc_grad_( int64_t m, int64_t n, const value_type* P, - int64_t ldp, value_type* EXC_GRAD ) override; + void eval_exc_grad_( int64_t m, int64_t n, const value_type* P, int64_t ldp, + value_type* EXC_GRAD, const IntegratorSettingsXC& settings ) override; + /// UKS EXC Gradient + void eval_exc_grad_( int64_t m, int64_t n, const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t lpdz, value_type* EXC_GRAD, const IntegratorSettingsXC& settings ) override; /// sn-LinK void eval_exx_( int64_t m, int64_t n, const value_type* P, int64_t ldp, value_type* K, int64_t ldk, const IntegratorSettingsEXX& settings ) override; - + /// RKS FXC contraction + void eval_fxc_contraction_( int64_t m, int64_t n, + const value_type* P, int64_t ldp, + const value_type* tP, int64_t ldtp, + value_type* FXC, int64_t ldfxc, + const IntegratorSettingsXC& ks_settings ) override; + + // UKS FXC contraction + void eval_fxc_contraction_( int64_t m, int64_t n, + const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, + const value_type* tPs, int64_t ldtps, + const value_type* tPz, int64_t ldtpz, + value_type* FXCs, int64_t ldfxcs, + value_type* FXCz, int64_t ldfxcz, + const IntegratorSettingsXC& ks_settings ) override; + + /// ddX PSi + void eval_dd_psi_( int64_t m, int64_t n, const value_type* P, + int64_t ldp, unsigned max_Ylm, value_type* ddPsi, int64_t ldPsi ) override; + + /// ddX PhiX + void eval_dd_psi_potential_( int64_t m, int64_t n, const value_type* X, unsigned max_Ylm, value_type* Vddx ) override; // Implementation details of integrate_den void integrate_den_local_work_( const value_type* P, int64_t ldp, @@ -99,12 +128,28 @@ class ReferenceReplicatedXCHostIntegrator : task_iterator task_begin, task_iterator task_end ); // Implemetation details of exc_grad - void exc_grad_local_work_( const value_type* P, int64_t ldp, value_type* EXC_GRAD ); + void exc_grad_local_work_( const value_type* Ps, int64_t ldps, const value_type* Pz, int64_t ldpz, + value_type* EXC_GRAD, const IntegratorSettingsXC& ks_settings ); // Implementation details of sn-LinK void exx_local_work_( const value_type* P, int64_t ldp, value_type* K, int64_t ldk, const 
IntegratorSettingsEXX& settings ); + // Implementation details of UKS FXC contraction + void fxc_contraction_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, + const value_type* tPs, int64_t ldtps, + const value_type* tPz, int64_t ldtpz, + value_type* FXCs, int64_t ldfxcs, + value_type* FXCz, int64_t ldfxcz, + value_type *N_EL, const IntegratorSettingsXC& ks_settings, + task_iterator task_begin, task_iterator task_end ); + + // Implementation details of ddX Psi + void dd_psi_local_work_( const value_type* P, int64_t ldp, unsigned max_Ylm, value_type* ddPsi, int64_t ldPsi ); + + void dd_psi_potential_local_work_( const value_type* X, value_type* Vddx, unsigned max_Ylm ); + public: template diff --git a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_dd_psi.hpp b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_dd_psi.hpp new file mode 100644 index 00000000..4a6b1138 --- /dev/null +++ b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_dd_psi.hpp @@ -0,0 +1,176 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#pragma once + +#include "reference_replicated_xc_host_integrator.hpp" +#include "integrator_util/integrator_common.hpp" +#include "integrator_util/spherical_harmonics.hpp" +#include "host/local_host_work_driver.hpp" +#include +#include +#include + +namespace GauXC::detail { +template +void ReferenceReplicatedXCHostIntegrator:: + eval_dd_psi_( int64_t m, int64_t n, const value_type* P, + int64_t ldp, unsigned max_Ylm, value_type* ddPsi, + int64_t ldPsi ) { + + const auto& basis = this->load_balancer_->basis(); + const auto& mol = this->load_balancer_->molecule(); + // Check that P / VXC are sane + const int64_t nbf = basis.nbf(); + if( m != n ) + GAUXC_GENERIC_EXCEPTION("P Must Be Square"); + if( m != nbf ) + GAUXC_GENERIC_EXCEPTION("P Must Have Same Dimension as Basis"); + if( ldp < nbf ) + GAUXC_GENERIC_EXCEPTION("Invalid LDP"); + + // Get Tasks + this->load_balancer_->get_tasks(); + // Compute Local contributions to ddPsi + this->timer_.time_op("XCIntegrator.LocalWork", [&](){ + dd_psi_local_work_( P, ldp, max_Ylm, ddPsi, ldPsi ); + }); + + + // Reduce Results + this->timer_.time_op("XCIntegrator.Allreduce", [&](){ + + if( not this->reduction_driver_->takes_host_memory() ) + GAUXC_GENERIC_EXCEPTION("This Module Only Works With Host Reductions"); + + this->reduction_driver_->allreduce_inplace( ddPsi, ldPsi * mol.size(), ReductionOp::Sum ); + + }); +} + +template +void ReferenceReplicatedXCHostIntegrator:: + dd_psi_local_work_( const value_type* P, int64_t ldp, unsigned max_Ylm, + value_type* dd_Psi, int64_t ldPsi) { + + // Cast LWD to LocalHostWorkDriver + auto* lwd = dynamic_cast(this->local_work_driver_.get()); + + // Setup Aliases + const auto& basis = this->load_balancer_->basis(); + const auto& mol = this->load_balancer_->molecule(); + + // Atom-specific data + int natom = mol.size(); + std::vector radii(natom); + for (int i = 0; i < natom; ++i) { + radii[i] = uff_radius_103(mol[i].Z); + } + // Get basis map + BasisSetMap basis_map(basis,mol); + + const int32_t nbf = basis.nbf(); + // Sort tasks on size (XXX: maybe doesnt matter?) 
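/*
 * A minimal, standalone sketch (assumed names and layouts, not the GauXC API) of the
 * per-atom contraction that the task loop below performs for ddX Psi:
 *
 *   ddPsi(A, lm) += -sum_i w_i * rho(r_i) * Y_lm_scaled(r_i; R_A, radius_A)
 *
 * where the sum runs over the grid points of a task whose parent atom is A. The real
 * code evaluates rho on the batch via the collocation/X-matrix path and performs this
 * contraction with blas::gemm; the plain loops here only illustrate the math.
 */
#include <cstddef>

// ylm[i*nharm + lm] : scaled real spherical harmonic Y_lm at grid point i (harmonic index fastest)
// rho[i], w[i]      : density and quadrature weight at grid point i
// dd_psi_atom[lm]   : accumulated ddPsi coefficients of the task's parent atom
inline void contract_dd_psi(std::size_t npts, std::size_t nharm,
                            const double* ylm, const double* rho,
                            const double* w, double* dd_psi_atom) {
  for (std::size_t lm = 0; lm < nharm; ++lm)
    for (std::size_t i = 0; i < npts; ++i)
      dd_psi_atom[lm] += -w[i] * rho[i] * ylm[i * nharm + lm];
}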
+ auto task_comparator = []( const XCTask& a, const XCTask& b ) { + return (a.points.size() * a.bfn_screening.nbe) > (b.points.size() * b.bfn_screening.nbe); + }; + + auto& tasks = this->load_balancer_->get_tasks(); + std::sort( tasks.begin(), tasks.end(), task_comparator ); + + + // Compute Partition Weights + auto& lb_state = this->load_balancer_->state(); + if( not lb_state.modified_weights_are_stored ) { + GAUXC_GENERIC_EXCEPTION("Weights Have Not Been Modified"); + } + + + // Loop over tasks + const size_t ntasks = tasks.size(); + #pragma omp parallel + { + + XCHostData host_data; // Thread local host data + + #pragma omp for schedule(dynamic) reduction(+:dd_Psi[:natom * ldPsi]) + for( size_t iT = 0; iT < ntasks; ++iT ) { + + // Alias current task + const auto& task = tasks[iT]; + + // Get tasks constants + const int32_t npts = task.points.size(); + const int32_t nbe = task.bfn_screening.nbe; + const int32_t nshells = task.bfn_screening.shell_list.size(); + + const auto* points = task.points.data()->data(); + const auto* weights = task.weights.data(); + const int32_t* shell_list = task.bfn_screening.shell_list.data(); + + // Allocate enough memory for batch + + host_data.nbe_scr .resize( nbe * nbe ); + host_data.zmat .resize( npts * nbe ); + + host_data.basis_eval .resize( npts * nbe ); + host_data.den_scr .resize( npts ); + + + // Alias/Partition out scratch memory + auto* basis_eval = host_data.basis_eval.data(); + auto* den_eval = host_data.den_scr.data(); + auto* nbe_scr = host_data.nbe_scr.data(); + auto* zmat = host_data.zmat.data(); + + int nharmonics = (max_Ylm + 1) * (max_Ylm + 1); + + // Get the submatrix map for batch + std::vector< std::array > submat_map; + std::tie(submat_map, std::ignore) = + gen_compressed_submat_map(basis_map, task.bfn_screening.shell_list, nbf, nbf); + + // Evaluate Collocation + lwd->eval_collocation( npts, nshells, nbe, points, basis, shell_list, + basis_eval ); + + // Evaluate X matrix (P * B) -> store in Z + lwd->eval_xmat( npts, nbf, nbe, submat_map, 1.0, P, ldp, basis_eval, nbe, + zmat, nbe, nbe_scr ); + + // Evaluate density on grid + lwd->eval_uvvar_lda_rks( npts, nbe, basis_eval, zmat, nbe, den_eval ); + + // Populate dd_Psi + const size_t atom_offset = task.iParent * ldPsi; + const double radius = radii[task.iParent]; + const std::array center = {mol[task.iParent].x, mol[task.iParent].y, mol[task.iParent].z}; + + std::vector ylm_matrix(npts * nharmonics); + scaled_ylm_matrix(max_Ylm, points, npts, center, radius, ylm_matrix.data()); + + for (int i = 0; i < npts; ++i) { + den_eval[i] *= -weights[i]; + } + std::vector offset_local_dd_psi(ldPsi, 0.0); + blas::gemm('N', 'N', ldPsi, 1, npts, + 1.0, ylm_matrix.data(), ldPsi, + den_eval, npts, + 0.0, offset_local_dd_psi.data(), ldPsi); + for (int j = 0; j < ldPsi; ++j) { + dd_Psi[atom_offset + j] += offset_local_dd_psi[j]; + } + + } // Loop over tasks + } // End OpenMP region +} +} // namespace GauXC::detail + diff --git a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_dd_psi_potential.hpp b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_dd_psi_potential.hpp new file mode 100644 index 00000000..fad35a29 --- /dev/null +++ b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_dd_psi_potential.hpp @@ -0,0 +1,171 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. 
of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#pragma once + +#include "reference_replicated_xc_host_integrator.hpp" +#include "integrator_util/integrator_common.hpp" +#include "integrator_util/spherical_harmonics.hpp" +#include "host/local_host_work_driver.hpp" +#include +#include +#include "host/blas.hpp" +#include "host/util.hpp" + +namespace GauXC::detail { +template +void ReferenceReplicatedXCHostIntegrator:: + eval_dd_psi_potential_( int64_t m, int64_t n, const value_type* X, unsigned max_Ylm, + value_type* Vddx ) { + + const auto& basis = this->load_balancer_->basis(); + const int32_t nbf = basis.nbf(); + + // Check that m is natom, n is nharmonics + const auto& mol = this->load_balancer_->molecule(); + const size_t natom = mol.size(); + const size_t nharmonics = (max_Ylm + 1) * (max_Ylm + 1); + if (m != nharmonics || n != natom) { + GAUXC_GENERIC_EXCEPTION("m must be nharmonics and n must be natom"); + } + // Get Tasks + this->load_balancer_->get_tasks(); + // Compute Local contributions to EXC / VXC + this->timer_.time_op("XCIntegrator.LocalWork", [&](){ + dd_psi_potential_local_work_( X, Vddx, max_Ylm ); + }); + + // Reduce Results + this->timer_.time_op("XCIntegrator.Allreduce", [&](){ + + if( not this->reduction_driver_->takes_host_memory() ) + GAUXC_GENERIC_EXCEPTION("This Module Only Works With Host Reductions"); + + this->reduction_driver_->allreduce_inplace( Vddx, nbf * nbf, ReductionOp::Sum ); + + }); +} + +template +void ReferenceReplicatedXCHostIntegrator:: + dd_psi_potential_local_work_( const value_type* X, value_type* Vddx, unsigned max_Ylm ) { + + // Cast LWD to LocalHostWorkDriver + auto* lwd = dynamic_cast(this->local_work_driver_.get()); + + // Setup Aliases + const auto& basis = this->load_balancer_->basis(); + const auto& mol = this->load_balancer_->molecule(); + + // Atom-specific data + std::vector radii(mol.size()); + for (int i = 0; i < mol.size(); ++i) { + radii[i] = uff_radius_103(mol[i].Z); + } + + // Get basis map + BasisSetMap basis_map(basis,mol); + + const int32_t nbf = basis.nbf(); + // Sort tasks on size (XXX: maybe doesnt matter?) 
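/*
 * A minimal, standalone sketch (assumed names and layouts, not the GauXC API) of the
 * potential assembly that the task loop below performs for ddX Psi potential:
 *
 *   eta(r_i)  = sum_lm X(A, lm) * Y_lm_scaled(r_i; R_A, radius_A)
 *   V(mu,nu) += sum_i w_i * eta(r_i) * phi_mu(r_i) * phi_nu(r_i)
 *
 * The real code forms eta and the AO contraction with blas::gemm over the screened
 * basis of each batch and scatters the result with inc_by_submat_atomic; the plain
 * loops here only illustrate the math over a full nbf x nbf block.
 */
#include <cstddef>

// bf[i*nbf + mu]    : basis function phi_mu at grid point i (basis index fastest)
// ylm[i*nharm + lm] : scaled real spherical harmonic Y_lm at grid point i
// x[lm]             : harmonic coefficients X of the task's parent atom
// V                 : nbf x nbf potential matrix being accumulated
inline void contract_dd_psi_potential(std::size_t npts, std::size_t nbf, std::size_t nharm,
                                      const double* bf, const double* ylm,
                                      const double* w, const double* x, double* V) {
  for (std::size_t i = 0; i < npts; ++i) {
    double eta = 0.0;
    for (std::size_t lm = 0; lm < nharm; ++lm) eta += x[lm] * ylm[i * nharm + lm];
    const double weta = w[i] * eta;
    for (std::size_t mu = 0; mu < nbf; ++mu)
      for (std::size_t nu = 0; nu < nbf; ++nu)
        V[mu * nbf + nu] += weta * bf[i * nbf + mu] * bf[i * nbf + nu];
  }
}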
+ auto task_comparator = []( const XCTask& a, const XCTask& b ) { + return (a.points.size() * a.bfn_screening.nbe) > (b.points.size() * b.bfn_screening.nbe); + }; + + auto& tasks = this->load_balancer_->get_tasks(); + std::sort( tasks.begin(), tasks.end(), task_comparator ); + + // Compute Partition Weights + auto& lb_state = this->load_balancer_->state(); + if( not lb_state.modified_weights_are_stored ) { + GAUXC_GENERIC_EXCEPTION("Weights Have Not Been Modified"); + } + + // Loop over tasks + const size_t ntasks = tasks.size(); + + #pragma omp parallel + { + + XCHostData host_data; // Thread local host data + + #pragma omp for schedule(dynamic) + for( size_t iT = 0; iT < ntasks; ++iT ) { + + // Alias current task + const auto& task = tasks[iT]; + + // Get tasks constants + const int32_t npts = task.points.size(); + const int32_t nbe = task.bfn_screening.nbe; + const int32_t nshells = task.bfn_screening.shell_list.size(); + + const auto* points = task.points.data()->data(); + const auto* weights = task.weights.data(); + const int32_t* shell_list = task.bfn_screening.shell_list.data(); + + // Allocate enough memory for batch + host_data.basis_eval .resize( npts * nbe ); + auto* basis_eval = host_data.basis_eval.data(); + + host_data.nbe_scr .resize( nbe * nbe ); + auto* vddx_scr = host_data.nbe_scr.data(); + + host_data.den_scr .resize( npts ); + auto etas = host_data.den_scr.data(); + + host_data.zmat .resize( npts * nbe ); + auto* zmat = host_data.zmat.data(); + + int nharmonics = (max_Ylm + 1) * (max_Ylm + 1); + + // Get the submatrix map for batch + std::vector< std::array > submat_map; + std::tie(submat_map, std::ignore) = + gen_compressed_submat_map(basis_map, task.bfn_screening.shell_list, nbf, nbf); + + // Evaluate Collocation + lwd->eval_collocation( npts, nshells, nbe, points, basis, shell_list, + basis_eval ); + + // Project X onto the spherical harmonics basis + const size_t atom_offset = task.iParent * nharmonics; + const double radius = radii[task.iParent]; + std::array center = {mol[task.iParent].x, mol[task.iParent].y, mol[task.iParent].z}; + const value_type* X_i = X + atom_offset; + + std::vector ylm_matrix(npts * nharmonics); + scaled_ylm_matrix(max_Ylm, points, npts, center, radius, ylm_matrix.data()); + + blas::gemm('T', 'N', npts, 1, nharmonics, + 1.0, ylm_matrix.data(), nharmonics, + X_i, nharmonics, + 0.0, etas, npts); + + // zmat = phi * etas + for (int ipt = 0; ipt < npts; ipt++) { + etas[ipt] *= weights[ipt]; + for (int ibe = 0; ibe < nbe; ibe++) { + zmat[ipt * nbe + ibe] = basis_eval[ipt * nbe + ibe] * etas[ipt]; // nbe is fastest, col in column-major + } + } + + // vddx_scr = phi^T * etas * weights * phi + blas::gemm('N', 'T', nbe, nbe, npts, 1.0, basis_eval, nbe, zmat, nbe, 0.0, vddx_scr, nbe); + + detail::inc_by_submat_atomic( nbf, nbf, nbe, nbe, Vddx, nbf, vddx_scr, nbe, + submat_map ); + } // Loop over tasks + } // End OpenMP region +} + +} // namespace GauXC::detail + diff --git a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc.hpp b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc.hpp index da6c3264..de1cb9ef 100644 --- a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc.hpp +++ b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. 
Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_grad.hpp b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_grad.hpp index d8472c13..f04ae24b 100644 --- a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_grad.hpp +++ b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_grad.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -18,7 +22,7 @@ namespace GauXC::detail { template void ReferenceReplicatedXCHostIntegrator:: eval_exc_grad_( int64_t m, int64_t n, const value_type* P, - int64_t ldp, value_type* EXC_GRAD ) { + int64_t ldp, value_type* EXC_GRAD, const IntegratorSettingsXC& ks_settings ) { const auto& basis = this->load_balancer_->basis(); @@ -38,7 +42,49 @@ void ReferenceReplicatedXCHostIntegrator:: // Compute Local contributions to EXC / VXC this->timer_.time_op("XCIntegrator.LocalWork", [&](){ - exc_grad_local_work_( P, ldp, EXC_GRAD ); + exc_grad_local_work_( P, ldp, nullptr, 0, EXC_GRAD, ks_settings ); + }); + + + // Reduce Results + this->timer_.time_op("XCIntegrator.Allreduce", [&](){ + + if( not this->reduction_driver_->takes_host_memory() ) + GAUXC_GENERIC_EXCEPTION("This Module Only Works With Host Reductions"); + + const int natoms = this->load_balancer_->molecule().natoms(); + this->reduction_driver_->allreduce_inplace( EXC_GRAD, 3*natoms, ReductionOp::Sum ); + }); + +} + + +template +void ReferenceReplicatedXCHostIntegrator:: + eval_exc_grad_( int64_t m, int64_t n, const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, value_type* EXC_GRAD, const IntegratorSettingsXC& ks_settings ) { + + + const auto& basis = this->load_balancer_->basis(); + + // Check that P is sane + const int64_t nbf = basis.nbf(); + if( m != n ) + GAUXC_GENERIC_EXCEPTION("P Must Be Square"); + if( m != nbf ) + GAUXC_GENERIC_EXCEPTION("P Must Have Same Dimension as Basis"); + if( ldps < nbf ) + GAUXC_GENERIC_EXCEPTION("Invalid LDPS"); + if( ldpz < nbf ) + GAUXC_GENERIC_EXCEPTION("Invalid LDPZ"); + + + // Get Tasks + this->load_balancer_->get_tasks(); + + // Compute Local contributions to EXC / VXC + this->timer_.time_op("XCIntegrator.LocalWork", [&](){ + exc_grad_local_work_( Ps, ldps, Pz, ldpz, EXC_GRAD, ks_settings ); }); @@ -56,7 +102,10 @@ void ReferenceReplicatedXCHostIntegrator:: template void ReferenceReplicatedXCHostIntegrator:: - exc_grad_local_work_( const value_type* P, int64_t ldp, value_type* EXC_GRAD ) { + exc_grad_local_work_( const value_type* Ps, int64_t ldps, const value_type* Pz, int64_t ldpz, value_type* EXC_GRAD, const IntegratorSettingsXC& settings ) { + + const bool is_uks = Pz != nullptr; + const bool is_rks = not is_uks; // Cast LWD to LocalHostWorkDriver auto* lwd = dynamic_cast(this->local_work_driver_.get()); @@ -65,11 +114,20 @@ void ReferenceReplicatedXCHostIntegrator:: const auto& func = *this->func_; const auto& basis = this->load_balancer_->basis(); const 
auto& mol = this->load_balancer_->molecule(); + const auto& molmeta = this->load_balancer_->molmeta(); // MGGA constants - const size_t mmga_dim_scal = func.is_mgga() ? 4 : 1; - const bool needs_laplacian = func.is_mgga() ? true : false; // TODO: Check for Laplacian dependence - // + const bool needs_laplacian = func.needs_laplacian(); + if(needs_laplacian and is_uks) { + GAUXC_GENERIC_EXCEPTION("UKS Gradients + Laplacian Dependent MGGAs is Not Yet Implemented"); + } + + // Misc KS settings + IntegratorSettingsEXC_GRAD exc_grad_settings; + if( auto* tmp = dynamic_cast(&settings) ) { + exc_grad_settings = *tmp; + } + // Get basis map BasisSetMap basis_map(basis,mol); @@ -90,6 +148,7 @@ void ReferenceReplicatedXCHostIntegrator:: if( not lb_state.modified_weights_are_stored ) { GAUXC_GENERIC_EXCEPTION("Weights Have Not Been Modified"); } + XCWeightAlg& weight_alg = lb_state.weight_alg; // Zero out integrands for( auto i = 0; i < 3*natoms; ++i ) { @@ -107,12 +166,14 @@ void ReferenceReplicatedXCHostIntegrator:: for( size_t iT = 0; iT < ntasks; ++iT ) { // Alias current task - const auto& task = tasks[iT]; + auto& task = tasks[iT]; // Get tasks constants const int32_t npts = task.points.size(); const int32_t nbe = task.bfn_screening.nbe; const int32_t nshells = task.bfn_screening.shell_list.size(); + const size_t spin_dim_scal = is_rks ? 1 : 2; // last case is_uks + const size_t gga_dim_scal = is_rks ? 1 : 3; const auto* points = task.points.data()->data(); const auto* weights = task.weights.data(); @@ -123,69 +184,75 @@ void ReferenceReplicatedXCHostIntegrator:: // Things that every calc needs host_data.nbe_scr .resize( nbe * nbe ); host_data.eps .resize( npts ); - host_data.vrho .resize( npts ); - host_data.den_scr .resize( 4 * npts ); + host_data.vrho .resize( spin_dim_scal * npts ); + host_data.den_scr .resize( 4 * spin_dim_scal * npts ); if( func.is_lda() ) { host_data.basis_eval .resize( 4 * npts * nbe ); - host_data.zmat .resize( npts * nbe ); + host_data.zmat .resize( spin_dim_scal * npts * nbe ); } - if( func.is_gga() ){ + if( func.is_gga() or func.is_mgga() ) { host_data.basis_eval .resize( 10 * npts * nbe ); - host_data.zmat .resize( 4 * npts * nbe ); - host_data.gamma .resize( npts ); - host_data.vgamma .resize( npts ); + host_data.zmat .resize( 4 * spin_dim_scal * npts * nbe ); + host_data.gamma .resize( gga_dim_scal * npts ); + host_data.vgamma .resize( gga_dim_scal * npts ); } -#if 0 if( func.is_mgga() ) { - host_data.basis_eval .resize( 11 * npts * nbe ); // basis + grad(3) + hess(6) + lapl - host_data.zmat .resize( 7 * npts * nbe ); // basis + grad(3) + grad(3) - host_data.mmat .resize( npts * nbe ); - host_data.gamma .resize( npts ); - host_data.vgamma .resize( npts ); - host_data.tau .resize( npts ); - host_data.vtau .resize( npts ); + host_data.tau .resize( spin_dim_scal * npts ); + host_data.vtau.resize( spin_dim_scal * npts ); if ( needs_laplacian ) { - host_data.basis_eval.resize( 24 * npts * nbe ); - host_data.lapl .resize( npts ); - host_data.vlapl .resize( npts ); + host_data.basis_eval.resize( 24 * npts * nbe ); // 11 + lapl_grad(3) + der3(10) + host_data.lapl .resize( spin_dim_scal * npts ); + host_data.vlapl.resize( spin_dim_scal * npts ); } } -#endif // Alias/Partition out scratch memory auto* basis_eval = host_data.basis_eval.data(); auto* den_eval = host_data.den_scr.data(); auto* nbe_scr = host_data.nbe_scr.data(); - auto* zmat = host_data.zmat.data(); - auto* zmat_x = zmat + npts*nbe; - auto* zmat_y = zmat_x + npts*nbe; - auto* zmat_z = zmat_y + npts*nbe; + 
double* xNmat = nullptr; + double* xNmat_x = nullptr; + double* xNmat_y = nullptr; + double* xNmat_z = nullptr; + double* xZmat = nullptr; + double* xZmat_x = nullptr; + double* xZmat_y = nullptr; + double* xZmat_z = nullptr; auto* eps = host_data.eps.data(); auto* gamma = host_data.gamma.data(); auto* vrho = host_data.vrho.data(); auto* vgamma = host_data.vgamma.data(); -#if 0 auto* tau = host_data.tau.data(); auto* lapl = host_data.lapl.data(); auto* vtau = host_data.vtau.data(); auto* vlapl = host_data.vlapl.data(); - auto* mmat_x = mmat; - auto* mmat_y = mmat_x + npts * nbe; - auto* mmat_z = mmat_y + npts * nbe; -#endif auto* dbasis_x_eval = basis_eval + npts * nbe; auto* dbasis_y_eval = dbasis_x_eval + npts * nbe; auto* dbasis_z_eval = dbasis_y_eval + npts * nbe; - auto* dden_x_eval = den_eval + npts; - auto* dden_y_eval = dden_x_eval + npts; - auto* dden_z_eval = dden_y_eval + npts; + auto* dden_x_eval = den_eval + spin_dim_scal * npts; + auto* dden_y_eval = dden_x_eval + spin_dim_scal * npts; + auto* dden_z_eval = dden_y_eval + spin_dim_scal * npts; + + + xNmat = host_data.zmat.data(); + if(func.is_lda()) { + xZmat = xNmat + npts*nbe; + } else { + xNmat_x = xNmat + npts*nbe; + xNmat_y = xNmat_x + npts*nbe; + xNmat_z = xNmat_y + npts*nbe; + xZmat = xNmat_z + npts*nbe; + xZmat_x = xZmat + npts*nbe; + xZmat_y = xZmat_x + npts*nbe; + xZmat_z = xZmat_y + npts*nbe; + } value_type* d2basis_xx_eval = nullptr; value_type* d2basis_xy_eval = nullptr; @@ -193,24 +260,23 @@ void ReferenceReplicatedXCHostIntegrator:: value_type* d2basis_yy_eval = nullptr; value_type* d2basis_yz_eval = nullptr; value_type* d2basis_zz_eval = nullptr; -#if 0 - value_type* lbasis_eval = nullptr; - value_type* d3basis_xxx_eval = nullptr; - value_type* d3basis_xxy_eval = nullptr; - value_type* d3basis_xxz_eval = nullptr; - value_type* d3basis_xyy_eval = nullptr; - value_type* d3basis_xyz_eval = nullptr; - value_type* d3basis_xzz_eval = nullptr; - value_type* d3basis_yyy_eval = nullptr; - value_type* d3basis_yyz_eval = nullptr; - value_type* d3basis_yzz_eval = nullptr; - value_type* d3basis_zzz_eval = nullptr; - value_type* dlbasis_x_eval = nullptr; - value_type* dlbasis_y_eval = nullptr; - value_type* dlbasis_z_eval = nullptr; -#endif - - if( func.is_gga() ) { + + value_type* lbasis_eval = nullptr; + value_type* d3basis_xxx_eval = nullptr; + value_type* d3basis_xxy_eval = nullptr; + value_type* d3basis_xxz_eval = nullptr; + value_type* d3basis_xyy_eval = nullptr; + value_type* d3basis_xyz_eval = nullptr; + value_type* d3basis_xzz_eval = nullptr; + value_type* d3basis_yyy_eval = nullptr; + value_type* d3basis_yyz_eval = nullptr; + value_type* d3basis_yzz_eval = nullptr; + value_type* d3basis_zzz_eval = nullptr; + value_type* dlgradbasis_x_eval = nullptr; + value_type* dlgradbasis_y_eval = nullptr; + value_type* dlgradbasis_z_eval = nullptr; + + if( func.is_gga() or func.is_mgga() ) { d2basis_xx_eval = dbasis_z_eval + npts * nbe; d2basis_xy_eval = d2basis_xx_eval + npts * nbe; d2basis_xz_eval = d2basis_xy_eval + npts * nbe; @@ -219,32 +285,24 @@ void ReferenceReplicatedXCHostIntegrator:: d2basis_zz_eval = d2basis_yz_eval + npts * nbe; } -#if 0 - if( func.is_mgga() ) { - d2basis_xx_eval = dbasis_z_eval + npts * nbe; - d2basis_xy_eval = d2basis_xx_eval + npts * nbe; - d2basis_xz_eval = d2basis_xy_eval + npts * nbe; - d2basis_yy_eval = d2basis_xz_eval + npts * nbe; - d2basis_yz_eval = d2basis_yy_eval + npts * nbe; - d2basis_zz_eval = d2basis_yz_eval + npts * nbe; - if ( true ) { - lbasis_eval = d2basis_zz_eval + npts * nbe; 
- d3basis_xxx_eval = lbasis_eval + npts * nbe; - d3basis_xxy_eval = d3basis_xxx_eval + npts * nbe; - d3basis_xxz_eval = d3basis_xxy_eval + npts * nbe; - d3basis_xyy_eval = d3basis_xxz_eval + npts * nbe; - d3basis_xyz_eval = d3basis_xyy_eval + npts * nbe; - d3basis_xzz_eval = d3basis_xyz_eval + npts * nbe; - d3basis_yyy_eval = d3basis_xzz_eval + npts * nbe; - d3basis_yyz_eval = d3basis_yyy_eval + npts * nbe; - d3basis_yzz_eval = d3basis_yyz_eval + npts * nbe; - d3basis_zzz_eval = d3basis_yzz_eval + npts * nbe; - dlbasis_x_eval = d3basis_zzz_eval + npts * nbe; - dlbasis_y_eval = dlbasis_x_eval + npts * nbe; - dlbasis_z_eval = dlbasis_y_eval + npts * nbe; - } + if( needs_laplacian ) { + lbasis_eval = d2basis_zz_eval + npts * nbe; + // TODO - this should not be needed once Gau2Grid + // can evaluate the laplacian gradients directly. + d3basis_xxx_eval = lbasis_eval + npts * nbe; + d3basis_xxy_eval = d3basis_xxx_eval + npts * nbe; + d3basis_xxz_eval = d3basis_xxy_eval + npts * nbe; + d3basis_xyy_eval = d3basis_xxz_eval + npts * nbe; + d3basis_xyz_eval = d3basis_xyy_eval + npts * nbe; + d3basis_xzz_eval = d3basis_xyz_eval + npts * nbe; + d3basis_yyy_eval = d3basis_xzz_eval + npts * nbe; + d3basis_yyz_eval = d3basis_yyy_eval + npts * nbe; + d3basis_yzz_eval = d3basis_yyz_eval + npts * nbe; + d3basis_zzz_eval = d3basis_yzz_eval + npts * nbe; + dlgradbasis_x_eval = d3basis_zzz_eval + npts * nbe; + dlgradbasis_y_eval = dlgradbasis_x_eval + npts * nbe; + dlgradbasis_z_eval = dlgradbasis_y_eval + npts * nbe; } -#endif // Get the submatrix map for batch @@ -252,84 +310,97 @@ void ReferenceReplicatedXCHostIntegrator:: gen_compressed_submat_map( basis_map, task.bfn_screening.shell_list, nbf, nbf ); // Evaluate Collocation Gradient (+ Hessian) -#if 0 - if( func.is_mgga() ) { + if( needs_laplacian ) { lwd->eval_collocation_der3( npts, nshells, nbe, points, basis, shell_list, basis_eval, dbasis_x_eval, dbasis_y_eval, dbasis_z_eval, d2basis_xx_eval, d2basis_xy_eval, d2basis_xz_eval, d2basis_yy_eval, d2basis_yz_eval, d2basis_zz_eval, d3basis_xxx_eval, d3basis_xxy_eval, d3basis_xxz_eval, d3basis_xyy_eval, d3basis_xyz_eval, d3basis_xzz_eval, d3basis_yyy_eval, d3basis_yyz_eval, d3basis_yzz_eval, d3basis_zzz_eval); - - } - else if( func.is_gga() ) -#endif - if( func.is_gga() ) + } else if( func.is_gga() or func.is_mgga() ) { lwd->eval_collocation_hessian( npts, nshells, nbe, points, basis, shell_list, basis_eval, dbasis_x_eval, dbasis_y_eval, dbasis_z_eval, d2basis_xx_eval, d2basis_xy_eval, d2basis_xz_eval, d2basis_yy_eval, d2basis_yz_eval, d2basis_zz_eval ); - else + } else { lwd->eval_collocation_gradient( npts, nshells, nbe, points, basis, shell_list, basis_eval, dbasis_x_eval, dbasis_y_eval, dbasis_z_eval ); + } // Evaluate X matrix (2 * P * B/Bx/By/Bz) -> store in Z // XXX: This assumes that bfn + gradients are contiguous in memory - if( func.is_gga() or func.is_mgga() ) { - lwd->eval_xmat( 4*npts, nbf, nbe, submat_map, 2.0, P, ldp, basis_eval, nbe, - zmat, nbe, nbe_scr ); - } else { - lwd->eval_xmat( npts, nbf, nbe, submat_map, 2.0, P, ldp, basis_eval, nbe, - zmat, nbe, nbe_scr ); + const auto xmat_fac = is_rks ? 2.0 : 1.0; + const int xmat_len = func.is_lda() ? 
1 : 4; + lwd->eval_xmat( xmat_len*npts, nbf, nbe, submat_map, xmat_fac, Ps, ldps, basis_eval, nbe, + xNmat, nbe, nbe_scr ); + if(is_uks) { + lwd->eval_xmat( xmat_len*npts, nbf, nbe, submat_map, xmat_fac, Pz, ldpz, basis_eval, nbe, + xZmat, nbe, nbe_scr ); } // Evaluate U and V variables -#if 0 if( func.is_mgga() ) { if ( needs_laplacian ) { blas::lacpy( 'A', nbe, npts, d2basis_xx_eval, nbe, lbasis_eval, nbe ); blas::axpy( nbe * npts, 1., d2basis_yy_eval, 1, lbasis_eval, 1); blas::axpy( nbe * npts, 1., d2basis_zz_eval, 1, lbasis_eval, 1); - blas::lacpy( 'A', nbe, npts, d3basis_xxx_eval, nbe, dlbasis_x_eval, nbe ); - blas::axpy( nbe * npts, 1., d3basis_xyy_eval, 1, dlbasis_x_eval, 1); - blas::axpy( nbe * npts, 1., d3basis_xzz_eval, 1, dlbasis_x_eval, 1); + // TODO - this should be done directly in Gau2Grid + blas::lacpy( 'A', nbe, npts, d3basis_xxx_eval, nbe, dlgradbasis_x_eval, nbe ); + blas::axpy( nbe * npts, 1., d3basis_xyy_eval, 1, dlgradbasis_x_eval, 1); + blas::axpy( nbe * npts, 1., d3basis_xzz_eval, 1, dlgradbasis_x_eval, 1); - blas::lacpy( 'A', nbe, npts, d3basis_xxy_eval, nbe, dlbasis_y_eval, nbe ); - blas::axpy( nbe * npts, 1., d3basis_yyy_eval, 1, dlbasis_y_eval, 1); - blas::axpy( nbe * npts, 1., d3basis_yzz_eval, 1, dlbasis_y_eval, 1); + blas::lacpy( 'A', nbe, npts, d3basis_xxy_eval, nbe, dlgradbasis_y_eval, nbe ); + blas::axpy( nbe * npts, 1., d3basis_yyy_eval, 1, dlgradbasis_y_eval, 1); + blas::axpy( nbe * npts, 1., d3basis_yzz_eval, 1, dlgradbasis_y_eval, 1); - blas::lacpy( 'A', nbe, npts, d3basis_xxz_eval, nbe, dlbasis_z_eval, nbe ); - blas::axpy( nbe * npts, 1., d3basis_yyz_eval, 1, dlbasis_z_eval, 1); - blas::axpy( nbe * npts, 1., d3basis_zzz_eval, 1, dlbasis_z_eval, 1); + blas::lacpy( 'A', nbe, npts, d3basis_xxz_eval, nbe, dlgradbasis_z_eval, nbe ); + blas::axpy( nbe * npts, 1., d3basis_yyz_eval, 1, dlgradbasis_z_eval, 1); + blas::axpy( nbe * npts, 1., d3basis_zzz_eval, 1, dlgradbasis_z_eval, 1); } - lwd->eval_uvvar_mgga_rks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, - dbasis_z_eval, lbasis_eval, zmat, nbe, mmat_x, mmat_y, mmat_z, nbe, - den_eval, dden_x_eval, dden_y_eval, dden_z_eval, - gamma, tau, lapl ); + if(is_rks) + lwd->eval_uvvar_mgga_rks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, + dbasis_z_eval, lbasis_eval, xNmat, nbe, xNmat_x, xNmat_y, xNmat_z, nbe, + den_eval, dden_x_eval, dden_y_eval, dden_z_eval, gamma, tau, lapl ); + else + lwd->eval_uvvar_mgga_uks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, + dbasis_z_eval, lbasis_eval, xNmat, nbe, xZmat, nbe, xNmat_x, xNmat_y, xNmat_z, nbe, + xZmat_x, xZmat_y, xZmat_z, nbe, + den_eval, dden_x_eval, dden_y_eval, dden_z_eval, gamma, tau, lapl ); + } else if( func.is_gga() ) { + if(is_rks) + lwd->eval_uvvar_gga_rks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, + dbasis_z_eval, xNmat, nbe, den_eval, dden_x_eval, dden_y_eval, dden_z_eval, + gamma ); + else + lwd->eval_uvvar_gga_uks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, + dbasis_z_eval, xNmat, nbe, xZmat, nbe, den_eval, dden_x_eval, dden_y_eval, + dden_z_eval, gamma ); + } else { + if(is_rks) lwd->eval_uvvar_lda_rks( npts, nbe, basis_eval, xNmat, nbe, den_eval ); + else lwd->eval_uvvar_lda_uks( npts, nbe, basis_eval, xNmat, nbe, xZmat, nbe, den_eval ); } - else if( func.is_gga() ) -#endif - if( func.is_gga() ) - lwd->eval_uvvar_gga_rks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, - dbasis_z_eval, zmat, nbe, den_eval, dden_x_eval, dden_y_eval, dden_z_eval, - gamma ); - else - lwd->eval_uvvar_lda_rks( npts, nbe, 
basis_eval, zmat, nbe, den_eval ); - + // Evaluate XC functional -#if 0 if( func.is_mgga() ) func.eval_exc_vxc( npts, den_eval, gamma, lapl, tau, eps, vrho, vgamma, vlapl, vtau ); else if(func.is_gga() ) -#endif - if( func.is_gga() ) func.eval_exc_vxc( npts, den_eval, gamma, eps, vrho, vgamma ); else func.eval_exc_vxc( npts, den_eval, eps, vrho ); + if(exc_grad_settings.include_weight_derivatives){ + // grid weight contribution to exc grad + for( int ipt = 0; ipt < npts; ++ipt ) { + const auto den = is_rks ? den_eval[ipt] : (den_eval[2*ipt] + den_eval[2*ipt+1]); + eps[ipt] *= den * weights[ipt]; + } + lwd->eval_weight_1st_deriv_contracted( weight_alg, mol, molmeta, + task, eps, EXC_GRAD); + } + // Increment EXC Gradient size_t bf_off = 0; @@ -337,97 +408,169 @@ void ReferenceReplicatedXCHostIntegrator:: const int sh_idx = shell_list[ish]; const int sh_sz = basis[sh_idx].size(); const int iAt = basis_map.shell_to_center( sh_idx ); + if(iAt == task.iParent and exc_grad_settings.include_weight_derivatives) { + bf_off += sh_sz; // Increment basis offset + continue; + } double g_acc_x(0), g_acc_y(0), g_acc_z(0); for( int ibf = 0, mu = bf_off; ibf < sh_sz; ++ibf, ++mu ) for( int ipt = 0; ipt < npts; ++ipt ) { - const int32_t mu_i = mu + ipt*nbe; - - // LDA Contributions - const double vrho_ipt = weights[ipt] * vrho[ipt]; - - const double z = zmat[mu_i]; // Z = N * B - - const double dbx = dbasis_x_eval[mu_i]; // B_x - const double dby = dbasis_y_eval[mu_i]; // B_y - const double dbz = dbasis_z_eval[mu_i]; // B_z - - g_acc_x += vrho_ipt * z * dbx; - g_acc_y += vrho_ipt * z * dby; - g_acc_z += vrho_ipt * z * dbz; - - if( func.is_gga() or func.is_mgga() ) { - // GGA Contributions - const double vgamma_ipt = weights[ipt] * vgamma[ipt]; - - const double dden_x = dden_x_eval[ipt]; - const double dden_y = dden_y_eval[ipt]; - const double dden_z = dden_z_eval[ipt]; - - const double zx = zmat_x[mu_i]; // Z_x = N * B_x - const double zy = zmat_y[mu_i]; // Z_y = N * B_y - const double zz = zmat_z[mu_i]; // Z_z = N * B_z - - const double d2bxx = d2basis_xx_eval[mu_i]; // B^2_xx - const double d2bxy = d2basis_xy_eval[mu_i]; // B^2_xy - const double d2bxz = d2basis_xz_eval[mu_i]; // B^2_xz - const double d2byy = d2basis_yy_eval[mu_i]; // B^2_yy - const double d2byz = d2basis_yz_eval[mu_i]; // B^2_yz - const double d2bzz = d2basis_zz_eval[mu_i]; // B^2_zz - - // sum_j B^2_{ij} * d_j n - double d2_term_x = d2bxx * dden_x + d2bxy * dden_y + d2bxz * dden_z; - double d2_term_y = d2bxy * dden_x + d2byy * dden_y + d2byz * dden_z; - double d2_term_z = d2bxz * dden_x + d2byz * dden_y + d2bzz * dden_z; - - // sum_j (d_j n) * Z^j - double d11_zmat_term = dden_x * zx + dden_y * zy + dden_z * zz; - - g_acc_x += 2 * vgamma_ipt * ( z * d2_term_x + dbx * d11_zmat_term ); - g_acc_y += 2 * vgamma_ipt * ( z * d2_term_y + dby * d11_zmat_term ); - g_acc_z += 2 * vgamma_ipt * ( z * d2_term_z + dbz * d11_zmat_term ); - } -#if 0 - if( func.is_mgga() ) { - - const double vtau_ipt = 0.5 * weights[ipt] * vtau[ipt]; - const double zx = zmat_x[mu_i]; // Z_x = N * B_x - const double zy = zmat_y[mu_i]; // Z_y = N * B_y - const double zz = zmat_z[mu_i]; // Z_z = N * B_z - const double d2bxx = d2basis_xx_eval[mu_i]; // B^2_xx - const double d2bxy = d2basis_xy_eval[mu_i]; // B^2_xy - const double d2bxz = d2basis_xz_eval[mu_i]; // B^2_xz - const double d2byy = d2basis_yy_eval[mu_i]; // B^2_yy - const double d2byz = d2basis_yz_eval[mu_i]; // B^2_yz - const double d2bzz = d2basis_zz_eval[mu_i]; // B^2_zz - double d2_term_x = d2bxx * zx + d2bxy * 
zy + d2bxz * zz; - double d2_term_y = d2bxy * zx + d2byy * zy + d2byz * zz; - double d2_term_z = d2bxz * zx + d2byz * zy + d2bzz * zz; - - g_acc_x += vtau_ipt * d2_term_x; - g_acc_y += vtau_ipt * d2_term_y; - g_acc_z += vtau_ipt * d2_term_z; - - if ( needs_laplacian ) { - const double vlapl_ipt = weights[ipt] * vlapl[ipt]; - const double lbf = lbasis_eval[mu_i]; - const double dlbx = dlbasis_x_eval[mu_i]; - const double dlby = dlbasis_y_eval[mu_i]; - const double dlbz = dlbasis_z_eval[mu_i]; - d2_term_x = z * dlbx + zx * lbf + 2.0*d2_term_x; - d2_term_y = z * dlby + zy * lbf + 2.0*d2_term_y; - d2_term_z = z * dlbz + zz * lbf + 2.0*d2_term_z; - - g_acc_x += vlapl_ipt * d2_term_x; - g_acc_y += vlapl_ipt * d2_term_y; - g_acc_z += vlapl_ipt * d2_term_z; - - } - - } -#endif - + const int32_t mu_i = mu + ipt*nbe; + + // LDA Contributions + // vrhop is actually vrhon for RKS + const double vrhop_ipt = weights[ipt] * vrho[spin_dim_scal * ipt]; + const double vrhom_ipt = is_uks ? weights[ipt] * vrho[spin_dim_scal * ipt + 1] : 0.0; + + const double xN = xNmat[mu_i]; // X = N * B + const double xZ = is_uks ? xZmat[mu_i] : 0.0; + + const double dbx = dbasis_x_eval[mu_i]; // B_x + const double dby = dbasis_y_eval[mu_i]; // B_y + const double dbz = dbasis_z_eval[mu_i]; // B_z + + if(is_rks) { + g_acc_x += vrhop_ipt * xN * dbx; + g_acc_y += vrhop_ipt * xN * dby; + g_acc_z += vrhop_ipt * xN * dbz; + } else { + const auto vrhon_ipt = vrhop_ipt + vrhom_ipt; + const auto vrhoz_ipt = vrhop_ipt - vrhom_ipt; + g_acc_x += 0.5 * vrhon_ipt * xN * dbx; + g_acc_y += 0.5 * vrhon_ipt * xN * dby; + g_acc_z += 0.5 * vrhon_ipt * xN * dbz; + + g_acc_x += 0.5 * vrhoz_ipt * xZ * dbx; + g_acc_y += 0.5 * vrhoz_ipt * xZ * dby; + g_acc_z += 0.5 * vrhoz_ipt * xZ * dbz; + } + + + if( func.is_gga() or func.is_mgga() ) { + // GGA Contributions + const double vgammapp_ipt = weights[ipt] * vgamma[gga_dim_scal * ipt + 0]; + const double vgammapm_ipt = is_uks ? weights[ipt] * vgamma[gga_dim_scal * ipt + 1] : 0.0; + const double vgammamm_ipt = is_uks ? weights[ipt] * vgamma[gga_dim_scal * ipt + 2] : 0.0; + + const double ddenn_x = dden_x_eval[spin_dim_scal * ipt]; + const double ddenn_y = dden_y_eval[spin_dim_scal * ipt]; + const double ddenn_z = dden_z_eval[spin_dim_scal * ipt]; + const double ddenz_x = is_uks ? dden_x_eval[spin_dim_scal * ipt + 1] : 0.0; + const double ddenz_y = is_uks ? dden_y_eval[spin_dim_scal * ipt + 1] : 0.0; + const double ddenz_z = is_uks ? dden_z_eval[spin_dim_scal * ipt + 1] : 0.0; + + const double xNx = xNmat_x[mu_i]; // XN_x = N * B_x + const double xNy = xNmat_y[mu_i]; // XN_y = N * B_y + const double xNz = xNmat_z[mu_i]; // XN_z = N * B_z + + const double xZx = is_uks ? xZmat_x[mu_i] : 0.0; + const double xZy = is_uks ? xZmat_y[mu_i] : 0.0; + const double xZz = is_uks ? 
xZmat_z[mu_i] : 0.0; + + const double d2bxx = d2basis_xx_eval[mu_i]; // B^2_xx + const double d2bxy = d2basis_xy_eval[mu_i]; // B^2_xy + const double d2bxz = d2basis_xz_eval[mu_i]; // B^2_xz + const double d2byy = d2basis_yy_eval[mu_i]; // B^2_yy + const double d2byz = d2basis_yz_eval[mu_i]; // B^2_yz + const double d2bzz = d2basis_zz_eval[mu_i]; // B^2_zz + + if(is_rks) { + // sum_j B^2_{ij} * d_j n + const auto d2_term_x = d2bxx * ddenn_x + d2bxy * ddenn_y + d2bxz * ddenn_z; + const auto d2_term_y = d2bxy * ddenn_x + d2byy * ddenn_y + d2byz * ddenn_z; + const auto d2_term_z = d2bxz * ddenn_x + d2byz * ddenn_y + d2bzz * ddenn_z; + + // sum_j (d_j n) * xN^j + const double d11_xmat_term = ddenn_x * xNx + ddenn_y * xNy + ddenn_z * xNz; + + g_acc_x += 2 * vgammapp_ipt * ( xN * d2_term_x + dbx * d11_xmat_term ); + g_acc_y += 2 * vgammapp_ipt * ( xN * d2_term_y + dby * d11_xmat_term ); + g_acc_z += 2 * vgammapp_ipt * ( xN * d2_term_z + dbz * d11_xmat_term ); + } else { + // sum_j B^2_{ij} * d_j n + const auto d2n_term_x = d2bxx * ddenn_x + d2bxy * ddenn_y + d2bxz * ddenn_z; + const auto d2n_term_y = d2bxy * ddenn_x + d2byy * ddenn_y + d2byz * ddenn_z; + const auto d2n_term_z = d2bxz * ddenn_x + d2byz * ddenn_y + d2bzz * ddenn_z; + + // sum_j B^2_{ij} * d_j m_z + const auto d2z_term_x = d2bxx * ddenz_x + d2bxy * ddenz_y + d2bxz * ddenz_z; + const auto d2z_term_y = d2bxy * ddenz_x + d2byy * ddenz_y + d2byz * ddenz_z; + const auto d2z_term_z = d2bxz * ddenz_x + d2byz * ddenz_y + d2bzz * ddenz_z; + + // sum_j (d_j n) * xN^j + const double d11nn_xmat_term = ddenn_x * xNx + ddenn_y * xNy + ddenn_z * xNz; + // sum_j (d_j n) * xZ^j + const double d11nz_xmat_term = ddenn_x * xZx + ddenn_y * xZy + ddenn_z * xZz; + // sum_j (d_j m_z) * xN^j + const double d11zn_xmat_term = ddenz_x * xNx + ddenz_y * xNy + ddenz_z * xNz; + // sum_j (d_j m_z) * xZ^j + const double d11zz_xmat_term = ddenz_x * xZx + ddenz_y * xZy + ddenz_z * xZz; + + + g_acc_x += 0.5 * (vgammapp_ipt + vgammapm_ipt + vgammamm_ipt) * (d2n_term_x * xN + d11nn_xmat_term * dbx); + g_acc_x += 0.5 * (vgammapp_ipt - vgammamm_ipt) * (d2z_term_x * xN + d11zn_xmat_term * dbx); + g_acc_x += 0.5 * (vgammapp_ipt - vgammamm_ipt) * (d2n_term_x * xZ + d11nz_xmat_term * dbx); + g_acc_x += 0.5 * (vgammapp_ipt - vgammapm_ipt + vgammamm_ipt) * (d2z_term_x * xZ + d11zz_xmat_term * dbx); + + g_acc_y += 0.5 * (vgammapp_ipt + vgammapm_ipt + vgammamm_ipt) * (d2n_term_y * xN + d11nn_xmat_term * dby); + g_acc_y += 0.5 * (vgammapp_ipt - vgammamm_ipt) * (d2z_term_y * xN + d11zn_xmat_term * dby); + g_acc_y += 0.5 * (vgammapp_ipt - vgammamm_ipt) * (d2n_term_y * xZ + d11nz_xmat_term * dby); + g_acc_y += 0.5 * (vgammapp_ipt - vgammapm_ipt + vgammamm_ipt) * (d2z_term_y * xZ + d11zz_xmat_term * dby); + + g_acc_z += 0.5 * (vgammapp_ipt + vgammapm_ipt + vgammamm_ipt) * (d2n_term_z * xN + d11nn_xmat_term * dbz); + g_acc_z += 0.5 * (vgammapp_ipt - vgammamm_ipt) * (d2z_term_z * xN + d11zn_xmat_term * dbz); + g_acc_z += 0.5 * (vgammapp_ipt - vgammamm_ipt) * (d2n_term_z * xZ + d11nz_xmat_term * dbz); + g_acc_z += 0.5 * (vgammapp_ipt - vgammapm_ipt + vgammamm_ipt) * (d2z_term_z * xZ + d11zz_xmat_term * dbz); + + } + + if( func.is_mgga() ) { + // vtaup is actually vtaun for RKS + const double vtaup_ipt = 0.5 * weights[ipt] * vtau[spin_dim_scal * ipt + 0]; + const double vtaum_ipt = is_uks ? 
0.5 * weights[ipt] * vtau[spin_dim_scal * ipt + 1] : 0.0; + + auto d2_term_x = d2bxx * xNx + d2bxy * xNy + d2bxz * xNz; + auto d2_term_y = d2bxy * xNx + d2byy * xNy + d2byz * xNz; + auto d2_term_z = d2bxz * xNx + d2byz * xNy + d2bzz * xNz; + + if(is_rks) { + g_acc_x += vtaup_ipt * d2_term_x; + g_acc_y += vtaup_ipt * d2_term_y; + g_acc_z += vtaup_ipt * d2_term_z; + } else { + const auto vtaun_ipt = vtaup_ipt + vtaum_ipt; + const auto vtauz_ipt = vtaup_ipt - vtaum_ipt; + g_acc_x += 0.5 * vtaun_ipt * d2_term_x; + g_acc_y += 0.5 * vtaun_ipt * d2_term_y; + g_acc_z += 0.5 * vtaun_ipt * d2_term_z; + + d2_term_x = d2bxx * xZx + d2bxy * xZy + d2bxz * xZz; + d2_term_y = d2bxy * xZx + d2byy * xZy + d2byz * xZz; + d2_term_z = d2bxz * xZx + d2byz * xZy + d2bzz * xZz; + + g_acc_x += 0.5 * vtauz_ipt * d2_term_x; + g_acc_y += 0.5 * vtauz_ipt * d2_term_y; + g_acc_z += 0.5 * vtauz_ipt * d2_term_z; + } + + if( needs_laplacian ) { + const double vlapl_ipt = weights[ipt] * vlapl[ipt]; + const double lbf = lbasis_eval[mu_i]; + const double dlbx = dlgradbasis_x_eval[mu_i]; + const double dlby = dlgradbasis_y_eval[mu_i]; + const double dlbz = dlgradbasis_z_eval[mu_i]; + d2_term_x = xN * dlbx + xNx * lbf + 2.0*d2_term_x; + d2_term_y = xN * dlby + xNy * lbf + 2.0*d2_term_y; + d2_term_z = xN * dlbz + xNz * lbf + 2.0*d2_term_z; + + g_acc_x += vlapl_ipt * d2_term_x; + g_acc_y += vlapl_ipt * d2_term_y; + g_acc_z += vlapl_ipt * d2_term_z; + } + } + } } // loop over bfns + grid points #pragma omp atomic @@ -437,10 +580,19 @@ void ReferenceReplicatedXCHostIntegrator:: #pragma omp atomic EXC_GRAD[3*iAt + 2] += -2 * g_acc_z; + if(exc_grad_settings.include_weight_derivatives){ + #pragma omp atomic + EXC_GRAD[3*task.iParent + 0] -= -2 * g_acc_x; + #pragma omp atomic + EXC_GRAD[3*task.iParent + 1] -= -2 * g_acc_y; + #pragma omp atomic + EXC_GRAD[3*task.iParent + 2] -= -2 * g_acc_z; + } + bf_off += sh_sz; // Increment basis offset } // End loop over shells - + } // End loop over tasks } // OpenMP Region diff --git a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_vxc.hpp b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_vxc.hpp index e62ae760..141085c9 100644 --- a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_vxc.hpp +++ b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_vxc.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exx.hpp b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exx.hpp index 117bbb5c..7cce12de 100644 --- a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exx.hpp +++ b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exx.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). 
+ *
+ * (c) 2024-2025, Microsoft Corporation
+ *
+ * All rights reserved.
  *
  * See LICENSE.txt for details
  */
diff --git a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_fxc_contraction.hpp b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_fxc_contraction.hpp
new file mode 100644
index 00000000..192fe0f8
--- /dev/null
+++ b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_fxc_contraction.hpp
@@ -0,0 +1,620 @@
+/**
+ * GauXC Copyright (c) 2020-2024, The Regents of the University of California,
+ * through Lawrence Berkeley National Laboratory (subject to receipt of
+ * any required approvals from the U.S. Dept. of Energy).
+ *
+ * (c) 2024-2025, Microsoft Corporation
+ *
+ * All rights reserved.
+ *
+ * See LICENSE.txt for details
+ */
+#pragma once
+
+#include "reference_replicated_xc_host_integrator.hpp"
+#include "integrator_util/integrator_common.hpp"
+#include "host/local_host_work_driver.hpp"
+#include "host/blas.hpp"
+#include 
+
+namespace GauXC::detail {
+
+/**
+ * Generic implementation of FXC contraction for RKS/UKS/GKS
+ *
+ */
+template 
+void ReferenceReplicatedXCHostIntegrator::
+  eval_fxc_contraction_( int64_t m, int64_t n,
+                         const value_type* Ps, int64_t ldps,
+                         const value_type* Pz, int64_t ldpz,
+                         const value_type* tPs, int64_t ldtps,
+                         const value_type* tPz, int64_t ldtpz,
+                         value_type* FXCs, int64_t ldfxcs,
+                         value_type* FXCz, int64_t ldfxcz,
+                         const IntegratorSettingsXC& ks_settings ){
+
+  const auto& basis = this->load_balancer_->basis();
+
+  // Check that P / FXC are sane
+  const int64_t nbf = basis.nbf();
+  if( m != n )
+    GAUXC_GENERIC_EXCEPTION("P/FXC Must Be Square");
+  if( m != nbf )
+    GAUXC_GENERIC_EXCEPTION("P/FXC Must Have Same Dimension as Basis");
+
+  if( ldps < nbf )
+    GAUXC_GENERIC_EXCEPTION("Invalid LDPS");
+  if( ldpz and ldpz < nbf )
+    GAUXC_GENERIC_EXCEPTION("Invalid LDPZ");
+  if( ldtps and ldtps < nbf )
+    GAUXC_GENERIC_EXCEPTION("Invalid LDTPS");
+  if( ldtpz and ldtpz < nbf )
+    GAUXC_GENERIC_EXCEPTION("Invalid LDTPZ");
+  if( ldfxcs < nbf )
+    GAUXC_GENERIC_EXCEPTION("Invalid LDFXCS");
+  if( ldfxcz and ldfxcz < nbf )
+    GAUXC_GENERIC_EXCEPTION("Invalid LDFXCZ");
+
+
+  // Get Tasks
+  auto& tasks = this->load_balancer_->get_tasks();
+
+  // Temporary electron count to judge integrator accuracy
+  value_type N_EL;
+
+  // Compute Local contributions to FXC contraction
+  this->timer_.time_op("XCIntegrator.LocalWork", [&](){
+    fxc_contraction_local_work_( basis, Ps, ldps, Pz, ldpz,
+                                 tPs, ldtps, tPz, ldtpz,
+                                 FXCs, ldfxcs, FXCz, ldfxcz,
+                                 &N_EL, ks_settings,
+                                 tasks.begin(), tasks.end() );
+  });
+
+
+  // Reduce Results
+  this->timer_.time_op("XCIntegrator.Allreduce", [&](){
+
+    if( not this->reduction_driver_->takes_host_memory() )
+      GAUXC_GENERIC_EXCEPTION("This Module Only Works With Host Reductions");
+
+    this->reduction_driver_->allreduce_inplace( FXCs, nbf*nbf, ReductionOp::Sum );
+    if( FXCz ) this->reduction_driver_->allreduce_inplace( FXCz, nbf*nbf, ReductionOp::Sum );
+
+    this->reduction_driver_->allreduce_inplace( &N_EL, 1 , ReductionOp::Sum );
+
+  });
+
+
+}
+
+template 
+void ReferenceReplicatedXCHostIntegrator::
+  fxc_contraction_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps,
+                               const value_type* Pz, int64_t ldpz,
+                               const value_type* tPs, int64_t ldtps,
+                               const value_type* tPz, int64_t ldtpz,
+                               value_type* FXCs, int64_t ldfxcs,
+                               value_type* FXCz, int64_t ldfxcz,
+                               value_type *N_EL, const IntegratorSettingsXC& settings,
+                               task_iterator task_begin, task_iterator task_end ) {
+
+  const bool is_uks = Pz != nullptr;
+  const bool is_rks = not is_uks;
+
+  // Misc KS settings
+  IntegratorSettingsKS ks_settings;
+  if( auto* tmp = dynamic_cast(&settings) ) {
+    ks_settings = *tmp;
+  }
+
+  // Cast LWD to LocalHostWorkDriver
+  auto* lwd = dynamic_cast(this->local_work_driver_.get());
+
+  // Setup Aliases
+  const auto& func = *this->func_;
+  const auto& mol  = this->load_balancer_->molecule();
+
+  const bool needs_laplacian = func.needs_laplacian();
+  // Laplacian is not supported yet
+  if( needs_laplacian ) {
+    GAUXC_GENERIC_EXCEPTION("Laplacian Not Supported Yet for FXC Contraction");
+  }
+
+  // Get basis map
+  BasisSetMap basis_map(basis,mol);
+
+  const int32_t nbf = basis.nbf();
+
+  // Sort tasks on size (XXX: maybe doesn't matter?)
+  auto task_comparator = []( const XCTask& a, const XCTask& b ) {
+    return (a.points.size() * a.bfn_screening.nbe) > (b.points.size() * b.bfn_screening.nbe);
+  };
+
+  auto& tasks = this->load_balancer_->get_tasks();
+  std::sort( task_begin, task_end, task_comparator );
+
+  // Check that Partition Weights have been calculated
+  auto& lb_state = this->load_balancer_->state();
+  if( not lb_state.modified_weights_are_stored ) {
+    GAUXC_GENERIC_EXCEPTION("Weights Have Not Been Modified");
+  }
+
+
+  // Zero out integrands
+  for( auto j = 0; j < nbf; ++j )
+    for( auto i = 0; i < nbf; ++i )
+      FXCs[i + j*ldfxcs] = 0.;
+
+  if(FXCz)
+    for( auto j = 0; j < nbf; ++j )
+      for( auto i = 0; i < nbf; ++i )
+        FXCz[i + j*ldfxcz] = 0.;
+
+
+  // Use FXCs and FXCz to store FXCa and FXCb temporarily
+  value_type* FXCa = FXCs;
+  value_type* FXCb = FXCz;
+  int64_t ldfxca = ldfxcs;
+  int64_t ldfxcb = ldfxcz;
+
+  double NEL_WORK = 0.0;
+
+  // Loop over tasks
+  const size_t ntasks = std::distance(task_begin, task_end);
+
+  #pragma omp parallel
+  {
+
+  XCHostData host_data; // Thread local host data
+
+  #pragma omp for schedule(dynamic)
+  for( size_t iT = 0; iT < ntasks; ++iT ) {
+
+    //std::cout << iT << "/" << ntasks << std::endl;
+    //if(is_exc_only) printf("%lu / %lu\n", iT, ntasks);
+    // Alias current task
+    const auto& task = *(task_begin + iT);
+
+    // Get task constants
+    const int32_t npts    = task.points.size();
+    const int32_t nbe     = task.bfn_screening.nbe;
+    const int32_t nshells = task.bfn_screening.shell_list.size();
+
+    const auto* points  = task.points.data()->data();
+    const auto* weights = task.weights.data();
+    const int32_t* shell_list = task.bfn_screening.shell_list.data();
+
+    // Allocate enough memory for batch
+
+    const size_t spin_dim_scal = is_rks ? 1 : 2;
+    const size_t sds           = is_rks ? 1 : 2;
+    const size_t mgga_dim_scal = func.is_mgga() ? 4 : 1; // basis + d1basis
+    // for second derivatives
+    const size_t spin_dim_rhorho     = is_rks ? 1 : 3;
+    const size_t spin_dim_gammagamma = is_rks ? 1 : 6;
+    const size_t spin_dim_rhogamma   = is_rks ? 1 : 6;
+    const size_t spin_dim_rhotau     = is_rks ? 1 : 4;
+
+    // Things that every calc needs
+    host_data.nbe_scr .resize(nbe * nbe);
+    host_data.zmat    .resize(npts * nbe * spin_dim_scal * mgga_dim_scal);
+    host_data.vrho    .resize(npts * spin_dim_scal);
+    host_data.v2rho2  .resize(npts * spin_dim_rhorho);
+    host_data.FXC_A   .resize(npts * spin_dim_scal);
+
+    // LDA data requirements
+    if( func.is_lda() ){
+      host_data.basis_eval .resize( npts * nbe );
+      host_data.den_scr    .resize( npts * spin_dim_scal);
+      host_data.tden_scr   .resize( npts * spin_dim_scal);
+    }
+
+    // GGA data requirements
+    const size_t gga_dim_scal = is_rks ? 
1 : 3; + if( func.is_gga() ){ + host_data.basis_eval .resize( 4 * npts * nbe ); + host_data.den_scr .resize( spin_dim_scal * 4 * npts ); + host_data.tden_scr .resize( spin_dim_scal * 4 * npts ); + host_data.gamma .resize( gga_dim_scal * npts ); + host_data.vgamma .resize( gga_dim_scal * npts ); + + // second derivatives + host_data.v2rhogamma .resize(npts * spin_dim_rhogamma); + host_data.v2gamma2 .resize(npts * spin_dim_gammagamma); + host_data.FXC_B .resize(npts * 3 * spin_dim_scal); + } + + if( func.is_mgga() ){ + + host_data.den_scr .resize( spin_dim_scal * 4 * npts ); + host_data.tden_scr .resize( spin_dim_scal * 4 * npts ); + host_data.gamma .resize( gga_dim_scal * npts ); + host_data.vgamma .resize( gga_dim_scal * npts ); + host_data.tau .resize( npts * spin_dim_scal ); + host_data.vtau .resize( npts * spin_dim_scal ); + + // second derivatives + host_data.v2rhogamma .resize(npts * spin_dim_rhogamma); + host_data.v2rhotau .resize(npts * spin_dim_rhotau); + host_data.v2gamma2 .resize(npts * spin_dim_gammagamma); + host_data.v2gammatau .resize(npts * spin_dim_rhogamma); + host_data.v2tau2 .resize(npts * spin_dim_rhorho); + host_data.ttau .resize(npts * spin_dim_scal); + host_data.FXC_B .resize(npts * 3 * spin_dim_scal); + host_data.FXC_C .resize(npts * spin_dim_scal); + + if ( needs_laplacian ) { + host_data.basis_eval .resize( 11 * npts * nbe ); // basis + grad (3) + hess (6) + lapl + host_data.lapl .resize( spin_dim_scal * npts ); + host_data.vlapl .resize( spin_dim_scal * npts ); + host_data.v2lapl2 .resize(npts * spin_dim_rhorho); + host_data.v2rholapl .resize(npts * spin_dim_rhotau); + host_data.v2gammalapl.resize(npts * spin_dim_rhogamma); + host_data.v2lapltau .resize(npts * spin_dim_rhotau); + host_data.tlapl .resize(npts * spin_dim_scal); + + } else { + host_data.basis_eval .resize( 4 * npts * nbe ); // basis + grad (3) + } + } + + + // Alias/Partition out scratch memory + auto* basis_eval = host_data.basis_eval.data(); + auto* den_eval = host_data.den_scr.data(); + auto* tden_eval = host_data.tden_scr.data(); // trial density and gradient + auto* nbe_scr = host_data.nbe_scr.data(); + auto* zmat = host_data.zmat.data(); + + decltype(zmat) zmat_z = nullptr; + if(!is_rks) { + zmat_z = zmat + mgga_dim_scal * nbe * npts; + } + + auto* eps = host_data.eps.data(); + auto* gamma = host_data.gamma.data(); + auto* tau = host_data.tau.data(); + auto* lapl = host_data.lapl.data(); + auto* vrho = host_data.vrho.data(); + auto* vgamma = host_data.vgamma.data(); + auto* vtau = host_data.vtau.data(); + auto* vlapl = host_data.vlapl.data(); + + // second derivatives + auto* v2rho2 = host_data.v2rho2.data(); + auto* v2rhogamma = host_data.v2rhogamma.data(); + auto* v2gamma2 = host_data.v2gamma2.data(); + auto* v2gammatau = host_data.v2gammatau.data(); + auto* v2rhotau = host_data.v2rhotau.data(); + auto* v2lapl2 = host_data.v2lapl2.data(); + auto* v2rholapl = host_data.v2rholapl.data(); + auto* v2gammalapl= host_data.v2gammalapl.data(); + auto* v2lapltau = host_data.v2lapltau.data(); + auto* v2tau2 = host_data.v2tau2.data(); + auto* ttau = host_data.ttau.data(); + auto* tlapl = host_data.tlapl.data(); + auto* FXC_A = host_data.FXC_A.data(); + auto* FXC_B = host_data.FXC_B.data(); + auto* FXC_C = host_data.FXC_C.data(); + + + value_type* dbasis_x_eval = nullptr; + value_type* dbasis_y_eval = nullptr; + value_type* dbasis_z_eval = nullptr; + value_type* d2basis_xx_eval = nullptr; + value_type* d2basis_xy_eval = nullptr; + value_type* d2basis_xz_eval = nullptr; + value_type* d2basis_yy_eval = 
nullptr; + value_type* d2basis_yz_eval = nullptr; + value_type* d2basis_zz_eval = nullptr; + value_type* lbasis_eval = nullptr; + value_type* dden_x_eval = nullptr; + value_type* dden_y_eval = nullptr; + value_type* dden_z_eval = nullptr; + value_type* tdden_x_eval = nullptr; + value_type* tdden_y_eval = nullptr; + value_type* tdden_z_eval = nullptr; + value_type* mmat_x = nullptr; + value_type* mmat_y = nullptr; + value_type* mmat_z = nullptr; + value_type* mmat_x_z = nullptr; + value_type* mmat_y_z = nullptr; + value_type* mmat_z_z = nullptr; + + if( func.is_gga() || func.is_mgga() ) { + dbasis_x_eval = basis_eval + npts * nbe; + dbasis_y_eval = dbasis_x_eval + npts * nbe; + dbasis_z_eval = dbasis_y_eval + npts * nbe; + dden_x_eval = den_eval + spin_dim_scal * npts; + dden_y_eval = dden_x_eval + spin_dim_scal * npts; + dden_z_eval = dden_y_eval + spin_dim_scal * npts; + tdden_x_eval = tden_eval + spin_dim_scal * npts; + tdden_y_eval = tdden_x_eval+ spin_dim_scal * npts; + tdden_z_eval = tdden_y_eval+ spin_dim_scal * npts; + } + + if ( func.is_mgga() ) { + mmat_x = zmat + npts * nbe; + mmat_y = mmat_x + npts * nbe; + mmat_z = mmat_y + npts * nbe; + if ( needs_laplacian ) { + d2basis_xx_eval = dbasis_z_eval + npts * nbe; + d2basis_xy_eval = d2basis_xx_eval + npts * nbe; + d2basis_xz_eval = d2basis_xy_eval + npts * nbe; + d2basis_yy_eval = d2basis_xz_eval + npts * nbe; + d2basis_yz_eval = d2basis_yy_eval + npts * nbe; + d2basis_zz_eval = d2basis_yz_eval + npts * nbe; + lbasis_eval = d2basis_zz_eval + npts * nbe; + } + if(is_uks) { + mmat_x_z = zmat_z + npts * nbe; + mmat_y_z = mmat_x_z + npts * nbe; + mmat_z_z = mmat_y_z + npts * nbe; + } + } + + + // Get the submatrix map for batch + std::vector< std::array > submat_map; + std::tie(submat_map, std::ignore) = + gen_compressed_submat_map(basis_map, task.bfn_screening.shell_list, nbf, nbf); + + // Evaluate Collocation (+ Grad and Hessian) + if( func.is_mgga() ) { + if ( needs_laplacian ) { + // TODO: Modify gau2grid to compute Laplacian instead of full hessian + lwd->eval_collocation_hessian( npts, nshells, nbe, points, basis, shell_list, + basis_eval, dbasis_x_eval, dbasis_y_eval, dbasis_z_eval, d2basis_xx_eval, + d2basis_xy_eval, d2basis_xz_eval, d2basis_yy_eval, d2basis_yz_eval, + d2basis_zz_eval); + blas::lacpy( 'A', nbe, npts, d2basis_xx_eval, nbe, lbasis_eval, nbe ); + blas::axpy( nbe * npts, 1., d2basis_yy_eval, 1, lbasis_eval, 1); + blas::axpy( nbe * npts, 1., d2basis_zz_eval, 1, lbasis_eval, 1); + } else { + lwd->eval_collocation_gradient( npts, nshells, nbe, points, basis, shell_list, + basis_eval, dbasis_x_eval, dbasis_y_eval, dbasis_z_eval ); + } + } + // Evaluate Collocation (+ Grad) + else if( func.is_gga() ) + lwd->eval_collocation_gradient( npts, nshells, nbe, points, basis, shell_list, + basis_eval, dbasis_x_eval, dbasis_y_eval, dbasis_z_eval ); + else + lwd->eval_collocation( npts, nshells, nbe, points, basis, shell_list, + basis_eval ); + + + // Evaluate X matrix (fac * P * B) -> store in Z + const auto xmat_fac = is_rks ? 
2.0 : 1.0; // TODO Fix for spinor RKS input + lwd->eval_xmat( mgga_dim_scal * npts, nbf, nbe, submat_map, xmat_fac, Ps, ldps, basis_eval, nbe, + zmat, nbe, nbe_scr ); + // X matrix for Pz + if(not is_rks) { + lwd->eval_xmat( mgga_dim_scal * npts, nbf, nbe, submat_map, 1.0, Pz, ldpz, basis_eval, nbe, + zmat_z, nbe, nbe_scr); + } + + // Evaluate U and V variables + if( func.is_mgga() ) { + if (is_rks) { + lwd->eval_uvvar_mgga_rks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, + dbasis_z_eval, lbasis_eval, zmat, nbe, mmat_x, mmat_y, mmat_z, + nbe, den_eval, dden_x_eval, dden_y_eval, dden_z_eval, gamma, tau, lapl); + } else if (is_uks) { + lwd->eval_uvvar_mgga_uks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, + dbasis_z_eval, lbasis_eval, zmat, nbe, zmat_z, nbe, + mmat_x, mmat_y, mmat_z, nbe, mmat_x_z, mmat_y_z, mmat_z_z, nbe, + den_eval, dden_x_eval, dden_y_eval, dden_z_eval, gamma, tau, lapl); + } + } else if ( func.is_gga() ) { + if(is_rks) { + lwd->eval_uvvar_gga_rks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, + dbasis_z_eval, zmat, nbe, den_eval, dden_x_eval, dden_y_eval, dden_z_eval, + gamma ); + } else if(is_uks) { + lwd->eval_uvvar_gga_uks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, + dbasis_z_eval, zmat, nbe, zmat_z, nbe, den_eval, dden_x_eval, + dden_y_eval, dden_z_eval, gamma ); + } + + } else { + if(is_rks) { + lwd->eval_uvvar_lda_rks( npts, nbe, basis_eval, zmat, nbe, den_eval ); + } else if(is_uks) { + lwd->eval_uvvar_lda_uks( npts, nbe, basis_eval, zmat, nbe, zmat_z, nbe, + den_eval ); + } + } + + // Evaluate XC functional + if( func.is_mgga() ) + func.eval_vxc_fxc( npts, den_eval, gamma, lapl, tau, vrho, vgamma, vlapl, vtau, + v2rho2, v2rhogamma, v2rholapl, v2rhotau, v2gamma2, + v2gammalapl, v2gammatau, v2lapl2, v2lapltau, v2tau2); + else if( func.is_gga() ) + func.eval_vxc_fxc( npts, den_eval, gamma, vrho, vgamma, v2rho2, v2rhogamma, v2gamma2 ); + else + func.eval_vxc_fxc( npts, den_eval, vrho, v2rho2 ); + + //calculate the trial density variables + // Evaluate X matrix (fac * tP * B) -> store in Z + lwd->eval_xmat( mgga_dim_scal * npts, nbf, nbe, submat_map, xmat_fac, tPs, ldps, basis_eval, nbe, + zmat, nbe, nbe_scr ); + // X matrix for tPz + if(not is_rks) { + lwd->eval_xmat( mgga_dim_scal * npts, nbf, nbe, submat_map, 1.0, tPz, ldpz, basis_eval, nbe, + zmat_z, nbe, nbe_scr); + } + // Evaluate U and V trial variables + if( func.is_mgga() ) { + if (is_rks) { + lwd->eval_uvvar_mgga_rks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, + dbasis_z_eval, lbasis_eval, zmat, nbe, mmat_x, mmat_y, mmat_z, + nbe, tden_eval, tdden_x_eval, tdden_y_eval, tdden_z_eval, gamma, ttau, tlapl); + lwd->eval_tmat_mgga_vxc_rks( npts, vgamma, v2rho2, v2rhogamma, v2rholapl, v2rhotau, v2gamma2, + v2gammalapl, v2gammatau, v2lapl2, v2lapltau, v2tau2, tden_eval, tdden_x_eval, + tdden_y_eval, tdden_z_eval, ttau, dden_x_eval, dden_y_eval, dden_z_eval, FXC_A, FXC_B, FXC_C ); + } else if (is_uks) { + // tgamma is not needed since it has different definitions than gamma + // gamma = nabla rho * nabla rho, but tgamma = nabla trho * nabla rho, not both trho + lwd->eval_uvvar_mgga_uks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, + dbasis_z_eval, lbasis_eval, zmat, nbe, zmat_z, nbe, + mmat_x, mmat_y, mmat_z, nbe, mmat_x_z, mmat_y_z, mmat_z_z, nbe, + tden_eval, tdden_x_eval, tdden_y_eval, tdden_z_eval, gamma, ttau, tlapl); + lwd->eval_tmat_mgga_vxc_uks( npts, vgamma, v2rho2, v2rhogamma, v2rholapl, v2rhotau, v2gamma2, + v2gammalapl, v2gammatau, v2lapl2, v2lapltau, 
v2tau2, tden_eval, tdden_x_eval, + tdden_y_eval, tdden_z_eval, ttau, dden_x_eval, dden_y_eval, dden_z_eval, FXC_A, FXC_B, FXC_C ); + } + } else if ( func.is_gga() ) { + if(is_rks) { + lwd->eval_uvvar_gga_rks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, + dbasis_z_eval, zmat, nbe, tden_eval, tdden_x_eval, tdden_y_eval, tdden_z_eval, + gamma ); + lwd->eval_tmat_gga_vxc_rks( npts, vgamma, v2rho2, v2rhogamma, v2gamma2, tden_eval, tdden_x_eval, + tdden_y_eval, tdden_z_eval, dden_x_eval, dden_y_eval, dden_z_eval, FXC_A, FXC_B ); + } else if(is_uks) { + // tgamma is not needed since it has quite different definitions than gamma + lwd->eval_uvvar_gga_uks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, + dbasis_z_eval, zmat, nbe, zmat_z, nbe, tden_eval, tdden_x_eval, + tdden_y_eval, tdden_z_eval, gamma ); + lwd->eval_tmat_gga_vxc_uks( npts, vgamma, v2rho2, v2rhogamma, v2gamma2, tden_eval, tdden_x_eval, + tdden_y_eval, tdden_z_eval, dden_x_eval, dden_y_eval, dden_z_eval, FXC_A, FXC_B ); + } + } else { + // LDA + if(is_rks) { + lwd->eval_uvvar_lda_rks( npts, nbe, basis_eval, zmat, nbe, tden_eval ); + lwd->eval_tmat_lda_vxc_rks( npts, v2rho2, tden_eval, FXC_A); + } else if(is_uks) { + lwd->eval_uvvar_lda_uks( npts, nbe, basis_eval, zmat, nbe, zmat_z, nbe, + tden_eval ); + lwd->eval_tmat_lda_vxc_uks( npts, v2rho2, tden_eval, FXC_A); + } + } + + // Factor weights into XC results + for( int32_t i = 0; i < npts; ++i ) { + FXC_A[sds*i] *= weights[i]; + if(not is_rks) FXC_A[sds*i+1] *= weights[i]; + } + if( func.is_gga() || func.is_mgga()){ + for( int32_t i = 0; i < npts; ++i ) { + FXC_B[3*sds*i] *= weights[i]; + FXC_B[3*sds*i+1] *= weights[i]; + FXC_B[3*sds*i+2] *= weights[i]; + if(not is_rks) { + FXC_B[3*sds*i+3] *= weights[i]; + FXC_B[3*sds*i+4] *= weights[i]; + FXC_B[3*sds*i+5] *= weights[i]; + } + } + } + if( func.is_mgga() ){ + for( int32_t i = 0; i < npts; ++i) { + FXC_C[sds*i] *= weights[i]; + if(not is_rks) FXC_C[sds*i+1] *= weights[i]; + } + } + + // Scalar integrations + double NEL_local = 0.0; + for( int32_t i = 0; i < npts; ++i ) { + const auto den = is_rks ? 
den_eval[i] : (den_eval[2*i] + den_eval[2*i+1]);
+      NEL_local += weights[i] * den;
+    }
+
+
+    // Atomic updates
+    #pragma omp atomic
+    NEL_WORK += NEL_local;
+    // Evaluate Z matrix for VXC
+    if( func.is_mgga() ) {
+      if(is_rks) {
+        // Since the Laplacian is not supported, mGGA uses the same operation as GGA
+        lwd->eval_zmat_gga_vxc_rks_ts( npts, nbe, FXC_A, FXC_B, basis_eval, dbasis_x_eval,
+          dbasis_y_eval, dbasis_z_eval, zmat, nbe);
+        lwd->eval_mmat_mgga_vxc_rks( npts, nbe, FXC_C, vlapl, dbasis_x_eval, dbasis_y_eval, dbasis_z_eval,
+          mmat_x, mmat_y, mmat_z, nbe);
+      } else if (is_uks) {
+        // Since the Laplacian is not supported, mGGA uses the same operation as GGA
+        lwd->eval_zmat_gga_vxc_uks_ts( npts, nbe, FXC_A, FXC_B, basis_eval, dbasis_x_eval,
+          dbasis_y_eval, dbasis_z_eval, zmat, nbe, zmat_z, nbe);
+        lwd->eval_mmat_mgga_vxc_uks_ts( npts, nbe, FXC_C, vlapl, dbasis_x_eval, dbasis_y_eval, dbasis_z_eval,
+          mmat_x, mmat_y, mmat_z, nbe, mmat_x_z, mmat_y_z, mmat_z_z, nbe);
+      }
+    }
+    else if( func.is_gga() ) {
+      if(is_rks) {
+        lwd->eval_zmat_gga_vxc_rks_ts( npts, nbe, FXC_A, FXC_B, basis_eval, dbasis_x_eval,
+          dbasis_y_eval, dbasis_z_eval, zmat, nbe);
+      } else if(is_uks) {
+        lwd->eval_zmat_gga_vxc_uks_ts( npts, nbe, FXC_A, FXC_B, basis_eval, dbasis_x_eval,
+          dbasis_y_eval, dbasis_z_eval, zmat, nbe, zmat_z, nbe);
+      }
+
+    } else {
+      if(is_rks) {
+        lwd->eval_zmat_lda_vxc_rks( npts, nbe, FXC_A, basis_eval, zmat, nbe );
+      } else if(is_uks) {
+        lwd->eval_zmat_lda_vxc_uks_ts( npts, nbe, FXC_A, basis_eval, zmat, nbe, zmat_z, nbe );
+      }
+    }
+
+    // Increment LT of VXC
+    {
+
+      // Increment VXC
+      lwd->inc_vxc( mgga_dim_scal * npts, nbf, nbe, basis_eval, submat_map, zmat, nbe, FXCa, ldfxca, nbe_scr );
+      if( not is_rks )
+        lwd->inc_vxc( mgga_dim_scal * npts, nbf, nbe, basis_eval, submat_map, zmat_z, nbe, FXCb, ldfxcb, nbe_scr);
+    }
+
+  } // Loop over tasks
+
+  } // End OpenMP region
+
+
+  // Set scalar return values
+  *N_EL = NEL_WORK;
+
+  // Symmetrize VXC
+  for( int32_t j = 0; j < nbf; ++j )
+    for( int32_t i = j+1; i < nbf; ++i )
+      FXCa[ j + i*ldfxca ] = FXCa[ i + j*ldfxca ];
+
+  if ( FXCz )
+    for( int32_t j = 0; j < nbf; ++j )
+      for( int32_t i = j+1; i < nbf; ++i )
+        FXCb[ j + i*ldfxcb ] = FXCb[ i + j*ldfxcb ];
+
+  if( FXCz )
+    // now convert to the final form of FXCs and FXCz
+    for ( int32_t j = 0; j < nbf; ++j )
+      for( int32_t i = 0; i < nbf; ++i ) {
+        value_type tmp_a = FXCa[ i + j*ldfxca ];
+        value_type tmp_b = FXCb[ i + j*ldfxcb ];
+        FXCs[ i + j*ldfxcs ] = 0.5 * ( tmp_a + tmp_b );
+        FXCz[ i + j*ldfxcz ] = 0.5 * ( tmp_a - tmp_b );
+      }
+
+}
+
+
+  /// RKS FXC contraction
+template 
+void ReferenceReplicatedXCHostIntegrator::
+eval_fxc_contraction_( int64_t m, int64_t n,
+                       const value_type* P, int64_t ldp,
+                       const value_type* tP, int64_t ldtp,
+                       value_type* FXC, int64_t ldfxc,
+                       const IntegratorSettingsXC& ks_settings ){
+
+  eval_fxc_contraction_( m, n, P, ldp, nullptr, 0, tP, ldtp, nullptr, 0,
+                         FXC, ldfxc, nullptr, 0, ks_settings );
+}
+
+
+
+} // namespace GauXC::detail
diff --git a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_integrate_den.hpp b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_integrate_den.hpp
index d327a4ea..e0ad145f 100644
--- a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_integrate_den.hpp
+++ b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_integrate_den.hpp
@@ -1,7 +1,11 @@
 /**
  * GauXC Copyright (c) 2020-2024, The Regents of the University of California,
  * 
through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/replicated/host/replicated_xc_host_integrator.cxx b/src/xc_integrator/replicated/host/replicated_xc_host_integrator.cxx index 4fd53aef..72ef87b8 100644 --- a/src/xc_integrator/replicated/host/replicated_xc_host_integrator.cxx +++ b/src/xc_integrator/replicated/host/replicated_xc_host_integrator.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/replicated/host/shell_batched_replicated_xc_host_integrator.cxx b/src/xc_integrator/replicated/host/shell_batched_replicated_xc_host_integrator.cxx index 4bdd2c66..c972d30a 100644 --- a/src/xc_integrator/replicated/host/shell_batched_replicated_xc_host_integrator.cxx +++ b/src/xc_integrator/replicated/host/shell_batched_replicated_xc_host_integrator.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -11,6 +15,9 @@ #include "shell_batched_replicated_xc_integrator_exc_vxc.hpp" #include "shell_batched_replicated_xc_integrator_exc_grad.hpp" #include "shell_batched_replicated_xc_integrator_exx.hpp" +#include "shell_batched_replicated_xc_integrator_fxc_contraction.hpp" +#include "shell_batched_replicated_xc_integrator_dd_psi.hpp" +#include "shell_batched_replicated_xc_integrator_dd_psi_potential.hpp" namespace GauXC { namespace detail { diff --git a/src/xc_integrator/replicated/host/shell_batched_replicated_xc_host_integrator.hpp b/src/xc_integrator/replicated/host/shell_batched_replicated_xc_host_integrator.hpp index 3c3db085..a8f1f488 100644 --- a/src/xc_integrator/replicated/host/shell_batched_replicated_xc_host_integrator.hpp +++ b/src/xc_integrator/replicated/host/shell_batched_replicated_xc_host_integrator.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/replicated/host/xc_host_data.hpp b/src/xc_integrator/replicated/host/xc_host_data.hpp index 5649d523..1c7fc9a2 100644 --- a/src/xc_integrator/replicated/host/xc_host_data.hpp +++ b/src/xc_integrator/replicated/host/xc_host_data.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -30,6 +34,27 @@ struct XCHostData { std::vector nbe_scr; std::vector den_scr; std::vector basis_eval; + + // Second order derivatives + std::vector v2rho2; + std::vector v2rhogamma; + std::vector v2rholapl; + std::vector v2rhotau; + std::vector v2gamma2; + std::vector v2gammalapl; + std::vector v2gammatau; + std::vector v2lapl2; + std::vector v2lapltau; + std::vector v2tau2; + + // For Fxc contraction + std::vector FXC_A; + std::vector FXC_B; + std::vector FXC_C; + std::vector tden_scr; + std::vector ttau; + std::vector tlapl; + inline XCHostData() {} diff --git a/src/xc_integrator/replicated/replicated_xc_integrator_impl.cxx b/src/xc_integrator/replicated/replicated_xc_integrator_impl.cxx index b1d50523..071afe31 100644 --- a/src/xc_integrator/replicated/replicated_xc_integrator_impl.cxx +++ b/src/xc_integrator/replicated/replicated_xc_integrator_impl.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -120,9 +124,19 @@ void ReplicatedXCIntegratorImpl:: template void ReplicatedXCIntegratorImpl:: eval_exc_grad( int64_t m, int64_t n, const value_type* P, - int64_t ldp, value_type* EXC_GRAD ) { + int64_t ldp, value_type* EXC_GRAD, const IntegratorSettingsXC& ks_settings ) { - eval_exc_grad_(m,n,P,ldp,EXC_GRAD); + eval_exc_grad_(m,n,P,ldp,EXC_GRAD, ks_settings); + +} + + +template +void ReplicatedXCIntegratorImpl:: + eval_exc_grad( int64_t m, int64_t n, const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, value_type* EXC_GRAD, const IntegratorSettingsXC& ks_settings ) { + + eval_exc_grad_(m,n,Ps,ldps,Pz,ldpz,EXC_GRAD, ks_settings); } @@ -136,6 +150,66 @@ void ReplicatedXCIntegratorImpl:: } +template +void ReplicatedXCIntegratorImpl:: +eval_fxc_contraction( int64_t m, int64_t n, const value_type* P, + int64_t ldp, + const value_type* tP, int64_t ldtp, + value_type* FXC, int64_t ldfxc, + const IntegratorSettingsXC& ks_settings ) { + + // For RKS, we can reuse the UKS implementation with Pz=0, tPz=0 + // Create temporary buffers to store the z-component results + std::vector temp_fxcz(m * n, 0.0); + value_type* FXCz = temp_fxcz.data(); + int64_t ldfxcz = m; + + eval_fxc_contraction_(m, n, P, ldp, + tP, ldtp, + FXC, ldfxc, + ks_settings); + +} + +template +void ReplicatedXCIntegratorImpl:: +eval_fxc_contraction( int64_t m, int64_t n, const value_type* Ps, + int64_t ldps, + const value_type* Pz, int64_t ldpz, + const value_type* tPs, int64_t ldtps, + const value_type* tPz, int64_t ldtpz, + value_type* FXCs, int64_t ldfxcs, + value_type* FXCz, int64_t ldfxcz, + const IntegratorSettingsXC& ks_settings ) { + + eval_fxc_contraction_(m,n,Ps,ldps, + Pz,ldpz, + tPs,ldtps, + tPz,ldtpz, + FXCs,ldfxcs, + FXCz,ldfxcz, + ks_settings); + +} + +template +void ReplicatedXCIntegratorImpl:: + eval_dd_psi( int64_t m, int64_t n, const value_type* P, + int64_t ldp, unsigned max_Ylm, value_type* ddPsi, int64_t ldPsi ) { + + eval_dd_psi_(m, n, P, ldp, max_Ylm, ddPsi, ldPsi); + +} + +template +void ReplicatedXCIntegratorImpl:: + eval_dd_psi_potential( int64_t m, int64_t n, const value_type* X, unsigned max_Ylm, value_type* Vddx) { + + eval_dd_psi_potential_(m, n, X, max_Ylm, Vddx); + +} + + template class ReplicatedXCIntegratorImpl; } diff --git a/src/xc_integrator/shell_batched/CMakeLists.txt b/src/xc_integrator/shell_batched/CMakeLists.txt index 636666c4..771124a4 100644 --- a/src/xc_integrator/shell_batched/CMakeLists.txt +++ b/src/xc_integrator/shell_batched/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator.hpp b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator.hpp index c6201a73..5c1d4a94 100644 --- a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator.hpp +++ b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). 
All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -83,15 +87,40 @@ class ShellBatchedReplicatedXCIntegrator : /// RKS EXC Gradient - void eval_exc_grad_( int64_t m, int64_t n, const value_type* P, - int64_t ldp, value_type* EXC_GRAD ) override; + void eval_exc_grad_( int64_t m, int64_t n, const value_type* P, int64_t ldp, + value_type* EXC_GRAD, const IntegratorSettingsXC& settings ) override; + /// UKS EXC Gradient + void eval_exc_grad_( int64_t m, int64_t n, const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, value_type* EXC_GRAD, const IntegratorSettingsXC& settings ) override; /// sn-LinK void eval_exx_( int64_t m, int64_t n, const value_type* P, int64_t ldp, value_type* K, int64_t ldk, const IntegratorSettingsEXX& settings ) override; - + // RKS FXC contraction + void eval_fxc_contraction_( int64_t m, int64_t n, + const value_type* P, int64_t ldp, + const value_type* tP, int64_t ldtp, + value_type* FXC, int64_t ldfxc, + const IntegratorSettingsXC& ks_settings ) override; + + // UKS FXC contraction + void eval_fxc_contraction_( int64_t m, int64_t n, + const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, + const value_type* tPs, int64_t ldtps, + const value_type* tPz, int64_t ldtpz, + value_type* FXCs, int64_t ldfxcs, + value_type* FXCz, int64_t ldfxcz, + const IntegratorSettingsXC& ks_settings ) override; + + /// ddX PSi + void eval_dd_psi_( int64_t m, int64_t n, const value_type* P, + int64_t ldp, unsigned max_Ylm, value_type* ddPsi, int64_t ldPsi ) override; + + /// ddX PhiX + void eval_dd_psi_potential_( int64_t m, int64_t n, const value_type* X, unsigned max_Ylm, value_type* Vddx ) override; // Implementation details of exc_vxc (for RKS/UKS/GKS deduced from input character) diff --git a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_dd_psi.hpp b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_dd_psi.hpp new file mode 100644 index 00000000..689e16a7 --- /dev/null +++ b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_dd_psi.hpp @@ -0,0 +1,30 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
+ * + * See LICENSE.txt for details + */ +#pragma once +#include "shell_batched_replicated_xc_integrator.hpp" +#include +#include + +namespace GauXC { +namespace detail { + +template +void ShellBatchedReplicatedXCIntegrator:: + eval_dd_psi_( int64_t m, int64_t n, const value_type* P, + int64_t ldp, unsigned max_Ylm, + value_type* ddPsi, int64_t ldPsi ) { + GAUXC_GENERIC_EXCEPTION("ShellBatched DD-PSI NYI"); + util::unused(m,n,P,ldp, max_Ylm, ddPsi,ldPsi); +} + +} +} diff --git a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_dd_psi_potential.hpp b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_dd_psi_potential.hpp new file mode 100644 index 00000000..639508b2 --- /dev/null +++ b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_dd_psi_potential.hpp @@ -0,0 +1,28 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#pragma once +#include "shell_batched_replicated_xc_integrator.hpp" +#include +#include + +namespace GauXC { +namespace detail { + +template +void ShellBatchedReplicatedXCIntegrator:: + eval_dd_psi_potential_( int64_t m, int64_t n, const value_type* X, unsigned max_Ylm, value_type* Vddx ) { + GAUXC_GENERIC_EXCEPTION("ShellBatched DD-PSI-DERIV NYI"); + util::unused(m,n,X,max_Ylm, Vddx); +} + +} +} diff --git a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc.hpp b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc.hpp index 4635336a..2a5565c9 100644 --- a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc.hpp +++ b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc_grad.hpp b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc_grad.hpp index dde98bdd..f329bc02 100644 --- a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc_grad.hpp +++ b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc_grad.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -15,12 +19,20 @@ namespace detail { template void ShellBatchedReplicatedXCIntegrator:: - eval_exc_grad_( int64_t m, int64_t n, const value_type* P, - int64_t ldp, value_type* EXC_GRAD ) { + eval_exc_grad_( int64_t m, int64_t n, const value_type* P, int64_t ldp, value_type* EXC_GRAD, const IntegratorSettingsXC& settings ) { GAUXC_GENERIC_EXCEPTION("ShellBatched exc_grad NYI" ); util::unused(m,n,P,ldp,EXC_GRAD); } +template +void ShellBatchedReplicatedXCIntegrator:: + eval_exc_grad_( int64_t m, int64_t n, const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t lpdz, value_type* EXC_GRAD, const IntegratorSettingsXC& settings ) { + + GAUXC_GENERIC_EXCEPTION("ShellBatched exc_grad NYI" ); + util::unused(m,n,Ps,ldps,Pz,lpdz,EXC_GRAD); +} + } } diff --git a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc_vxc.hpp b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc_vxc.hpp index 5a65be8e..3dd43f4d 100644 --- a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc_vxc.hpp +++ b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc_vxc.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exx.hpp b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exx.hpp index 0db24197..e6e90f8d 100644 --- a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exx.hpp +++ b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exx.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_fxc_contraction.hpp b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_fxc_contraction.hpp new file mode 100644 index 00000000..289de960 --- /dev/null +++ b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_fxc_contraction.hpp @@ -0,0 +1,50 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
+ * + * See LICENSE.txt for details + */ +#pragma once +#include "shell_batched_replicated_xc_integrator.hpp" +#include +#include + +namespace GauXC { +namespace detail { + +template +void ShellBatchedReplicatedXCIntegrator:: + eval_fxc_contraction_( int64_t m, int64_t n, + const value_type* P, int64_t ldp, + const value_type* tP, int64_t ldtp, + value_type* FXC, int64_t ldfxc, + const IntegratorSettingsXC& ks_settings ) { + GAUXC_GENERIC_EXCEPTION("ShellBatched FXC contraction NYI"); + util::unused(m,n,P,ldp,tP,ldtp,FXC,ldfxc,ks_settings); + +} + +template +void ShellBatchedReplicatedXCIntegrator:: + eval_fxc_contraction_( int64_t m, int64_t n, + const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, + const value_type* tPs, int64_t ldtps, + const value_type* tPz, int64_t ldtpz, + value_type* FXCs, int64_t ldfxcs, + value_type* FXCz, int64_t ldfxcz, + const IntegratorSettingsXC& ks_settings ) { + GAUXC_GENERIC_EXCEPTION("ShellBatched FXC contraction NYI"); + util::unused(m,n,Ps,ldps,Pz,ldpz,tPs,ldtps,tPz,ldtpz, + FXCs,ldfxcs,FXCz,ldfxcz); + +} + + +} +} diff --git a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_integrate_den.hpp b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_integrate_den.hpp index ce9194d1..e0a24504 100644 --- a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_integrate_den.hpp +++ b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_integrate_den.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/shell_batched/shell_batched_xc_integrator.cxx b/src/xc_integrator/shell_batched/shell_batched_xc_integrator.cxx index 314f0027..4d5a3156 100644 --- a/src/xc_integrator/shell_batched/shell_batched_xc_integrator.cxx +++ b/src/xc_integrator/shell_batched/shell_batched_xc_integrator.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/shell_batched/shell_batched_xc_integrator.hpp b/src/xc_integrator/shell_batched/shell_batched_xc_integrator.hpp index 1d04169d..c528e067 100644 --- a/src/xc_integrator/shell_batched/shell_batched_xc_integrator.hpp +++ b/src/xc_integrator/shell_batched/shell_batched_xc_integrator.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/xc_data/CMakeLists.txt b/src/xc_integrator/xc_data/CMakeLists.txt index 711dac33..f06826e1 100644 --- a/src/xc_integrator/xc_data/CMakeLists.txt +++ b/src/xc_integrator/xc_data/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/xc_integrator/xc_data/buffer_adaptor.hpp b/src/xc_integrator/xc_data/buffer_adaptor.hpp index 49179886..741aaaec 100644 --- a/src/xc_integrator/xc_data/buffer_adaptor.hpp +++ b/src/xc_integrator/xc_data/buffer_adaptor.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/xc_data/device/CMakeLists.txt b/src/xc_integrator/xc_data/device/CMakeLists.txt index d2c79570..571a7cf6 100644 --- a/src/xc_integrator/xc_data/device/CMakeLists.txt +++ b/src/xc_integrator/xc_data/device/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/xc_integrator/xc_data/device/xc_device_aos_data.cxx b/src/xc_integrator/xc_data/device/xc_device_aos_data.cxx index af985115..2e043842 100644 --- a/src/xc_integrator/xc_data/device/xc_device_aos_data.cxx +++ b/src/xc_integrator/xc_data/device/xc_device_aos_data.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -51,10 +55,11 @@ size_t XCDeviceAoSData::get_mem_req( integrator_term_tracker terms, return base_size + // Collocation + Derivatives - reqt.task_bfn_size ( nbe_bfn, npts ) * sizeof(double) + - reqt.task_bfn_grad_size( nbe_bfn, npts ) * sizeof(double) + - reqt.task_bfn_hess_size( nbe_bfn, npts ) * sizeof(double) + - reqt.task_bfn_lapl_size( nbe_bfn, npts ) * sizeof(double) + + reqt.task_bfn_size ( nbe_bfn, npts ) * sizeof(double) + + reqt.task_bfn_grad_size( nbe_bfn, npts ) * sizeof(double) + + reqt.task_bfn_hess_size( nbe_bfn, npts ) * sizeof(double) + + reqt.task_bfn_lapl_size( nbe_bfn, npts ) * sizeof(double) + + reqt.task_bfn_lapgrad_size( nbe_bfn, npts ) * sizeof(double) + // LDA/GGA Z Matrix reqt.task_zmat_size( nbe_bfn, npts ) * sizeof(double) + @@ -62,6 +67,9 @@ size_t XCDeviceAoSData::get_mem_req( integrator_term_tracker terms, // X Matrix Gradient reqt.task_xmat_grad_size( nbe_bfn, npts ) * sizeof(double) + + // Persistent X Mat + reqt.task_xmat_persist_size( nbe_bfn, npts ) * sizeof(double) + + // EXX Intermediates reqt.task_fmat_size( nbe_cou, npts ) * sizeof(double) + reqt.task_gmat_size( nbe_cou, npts ) * sizeof(double) + @@ -191,6 +199,12 @@ XCDeviceAoSData::device_buffer_t XCDeviceAoSData::allocate_dynamic_stack( aos_stack.d2bf_lapl_eval_device = mem.aligned_alloc( bfn_msz, csl ); } + if(reqt.task_bfn_lapgrad) { + aos_stack.d3bf_lapgrad_x_eval_device = mem.aligned_alloc( bfn_msz, csl ); + aos_stack.d3bf_lapgrad_y_eval_device = mem.aligned_alloc( bfn_msz, csl ); + aos_stack.d3bf_lapgrad_z_eval_device = mem.aligned_alloc( bfn_msz, csl ); + } + // VXC Z Matrix if(reqt.task_zmat) { aos_stack.zmat_vxc_device = @@ -203,6 +217,20 @@ XCDeviceAoSData::device_buffer_t XCDeviceAoSData::allocate_dynamic_stack( aos_stack.xmat_dz_device = mem.aligned_alloc( bfn_msz, csl); } + // Persistent X Matrix Gradient + if(reqt.task_xmat_persist) { + aos_stack.xmatS_device = mem.aligned_alloc( bfn_msz, csl); + aos_stack.xmatZ_device = mem.aligned_alloc( bfn_msz, csl); + if(reqt.task_xmat_grad) { + aos_stack.xmatS_dx_device = mem.aligned_alloc( bfn_msz, csl); + aos_stack.xmatS_dy_device = mem.aligned_alloc( bfn_msz, csl); + aos_stack.xmatS_dz_device = mem.aligned_alloc( bfn_msz, csl); + aos_stack.xmatZ_dx_device = mem.aligned_alloc( bfn_msz, csl); + aos_stack.xmatZ_dy_device = mem.aligned_alloc( bfn_msz, csl); + aos_stack.xmatZ_dz_device = mem.aligned_alloc( bfn_msz, csl); + } + } + // EXX Intermediates if(reqt.task_fmat) { aos_stack.fmat_exx_device = @@ -466,9 +494,26 @@ void XCDeviceAoSData::pack_and_send( buffer_adaptor d2bf_lapl_mem( aos_stack.d2bf_lapl_eval_device, total_nbe_bfn_npts ); + buffer_adaptor d3bf_lapgrad_x_mem( aos_stack.d3bf_lapgrad_x_eval_device, + total_nbe_bfn_npts ); + buffer_adaptor d3bf_lapgrad_y_mem( aos_stack.d3bf_lapgrad_y_eval_device, + total_nbe_bfn_npts ); + buffer_adaptor d3bf_lapgrad_z_mem( aos_stack.d3bf_lapgrad_z_eval_device, + total_nbe_bfn_npts ); + buffer_adaptor xmat_dx_mem( aos_stack.xmat_dx_device, total_nbe_bfn_npts ); buffer_adaptor xmat_dy_mem( aos_stack.xmat_dy_device, total_nbe_bfn_npts ); buffer_adaptor xmat_dz_mem( aos_stack.xmat_dz_device, total_nbe_bfn_npts ); + + buffer_adaptor xmatS_mem( aos_stack.xmatS_device, total_nbe_bfn_npts ); + buffer_adaptor xmatS_dx_mem( aos_stack.xmatS_dx_device, total_nbe_bfn_npts ); + buffer_adaptor xmatS_dy_mem( aos_stack.xmatS_dy_device, total_nbe_bfn_npts ); + buffer_adaptor xmatS_dz_mem( aos_stack.xmatS_dz_device, total_nbe_bfn_npts ); + + buffer_adaptor xmatZ_mem( 
aos_stack.xmatZ_device, total_nbe_bfn_npts ); + buffer_adaptor xmatZ_dx_mem( aos_stack.xmatZ_dx_device, total_nbe_bfn_npts ); + buffer_adaptor xmatZ_dy_mem( aos_stack.xmatZ_dy_device, total_nbe_bfn_npts ); + buffer_adaptor xmatZ_dz_mem( aos_stack.xmatZ_dz_device, total_nbe_bfn_npts ); const bool is_rks = terms.ks_scheme == RKS; const bool is_uks = terms.ks_scheme == UKS; @@ -477,38 +522,52 @@ void XCDeviceAoSData::pack_and_send( const bool is_gga = terms.xc_approx == GGA; const int den_fac = is_pol ? 2 : 1; const int gamma_fac = is_pol ? 3 : 1; - + // second derivative + const int rhorho_fac = is_pol ? 3 : 1; + const int rhogamma_fac = is_pol ? 6 : 1; + const int rhotau_fac = is_pol ? 4 : 1; buffer_adaptor eps_mem ( base_stack.eps_eval_device, total_npts ); // RKS - buffer_adaptor den_s_mem ( base_stack.den_s_eval_device, total_npts ); - buffer_adaptor gamma_mem ( base_stack.gamma_eval_device, total_npts * gamma_fac ); - buffer_adaptor vrho_mem ( base_stack.vrho_eval_device, total_npts * den_fac ); - buffer_adaptor vgamma_mem ( base_stack.vgamma_eval_device, total_npts * gamma_fac ); - - buffer_adaptor den_mem ( base_stack.den_eval_device, total_npts * den_fac ); - + buffer_adaptor den_s_mem ( base_stack.den_s_eval_device, total_npts ); + buffer_adaptor tau_s_mem ( base_stack.tau_s_eval_device, total_npts ); + buffer_adaptor lapl_s_mem ( base_stack.lapl_s_eval_device, total_npts ); + buffer_adaptor gamma_mem ( base_stack.gamma_eval_device, total_npts * gamma_fac ); + buffer_adaptor vrho_mem ( base_stack.vrho_eval_device, total_npts * den_fac ); + buffer_adaptor vgamma_mem ( base_stack.vgamma_eval_device, total_npts * gamma_fac ); + buffer_adaptor vtau_mem ( base_stack.vtau_eval_device, total_npts * den_fac ); + buffer_adaptor vlapl_mem ( base_stack.vlapl_eval_device, total_npts * den_fac ); // Polarized KS - buffer_adaptor den_z_mem ( base_stack.den_z_eval_device, total_npts ); - buffer_adaptor den_y_mem ( base_stack.den_y_eval_device, total_npts ); - buffer_adaptor den_x_mem ( base_stack.den_x_eval_device, total_npts ); + buffer_adaptor den_interleaved_mem ( base_stack.den_interleaved_device, total_npts * den_fac ); + buffer_adaptor tau_interleaved_mem ( base_stack.tau_interleaved_device, total_npts * den_fac ); + buffer_adaptor lapl_interleaved_mem ( base_stack.lapl_interleaved_device, total_npts * den_fac ); + buffer_adaptor den_z_mem ( base_stack.den_z_eval_device, total_npts ); + buffer_adaptor den_y_mem ( base_stack.den_y_eval_device, total_npts ); + buffer_adaptor den_x_mem ( base_stack.den_x_eval_device, total_npts ); + buffer_adaptor tau_z_mem ( base_stack.tau_z_eval_device, total_npts ); + buffer_adaptor lapl_z_mem ( base_stack.lapl_z_eval_device, total_npts ); + buffer_adaptor vrho_pos_mem( base_stack.vrho_pos_eval_device, total_npts ); buffer_adaptor vrho_neg_mem( base_stack.vrho_neg_eval_device, total_npts ); - buffer_adaptor K_z_mem ( base_stack.K_z_eval_device, total_npts ); - buffer_adaptor K_y_mem ( base_stack.K_y_eval_device, total_npts ); - buffer_adaptor K_x_mem ( base_stack.K_x_eval_device, total_npts ); - buffer_adaptor H_z_mem ( base_stack.H_z_eval_device, total_npts ); - buffer_adaptor H_y_mem ( base_stack.H_y_eval_device, total_npts ); - buffer_adaptor H_x_mem ( base_stack.H_x_eval_device, total_npts ); + buffer_adaptor vtau_pos_mem( base_stack.vtau_pos_eval_device, total_npts ); + buffer_adaptor vtau_neg_mem( base_stack.vtau_neg_eval_device, total_npts ); + buffer_adaptor vlapl_pos_mem( base_stack.vlapl_pos_eval_device, total_npts ); + buffer_adaptor 
vlapl_neg_mem( base_stack.vlapl_neg_eval_device, total_npts ); buffer_adaptor gamma_pp_mem( base_stack.gamma_pp_eval_device, total_npts ); buffer_adaptor gamma_pm_mem( base_stack.gamma_pm_eval_device, total_npts ); buffer_adaptor gamma_mm_mem( base_stack.gamma_mm_eval_device, total_npts ); buffer_adaptor vgamma_pp_mem( base_stack.vgamma_pp_eval_device, total_npts ); buffer_adaptor vgamma_pm_mem( base_stack.vgamma_pm_eval_device, total_npts ); buffer_adaptor vgamma_mm_mem( base_stack.vgamma_mm_eval_device, total_npts ); + buffer_adaptor K_z_mem ( base_stack.K_z_eval_device, total_npts ); + buffer_adaptor K_y_mem ( base_stack.K_y_eval_device, total_npts ); + buffer_adaptor K_x_mem ( base_stack.K_x_eval_device, total_npts ); + buffer_adaptor H_z_mem ( base_stack.H_z_eval_device, total_npts ); + buffer_adaptor H_y_mem ( base_stack.H_y_eval_device, total_npts ); + buffer_adaptor H_x_mem ( base_stack.H_x_eval_device, total_npts ); // Gradients buffer_adaptor dden_sx_mem( base_stack.dden_sx_eval_device, total_npts ); @@ -523,12 +582,101 @@ void XCDeviceAoSData::pack_and_send( buffer_adaptor dden_xx_mem( base_stack.dden_xx_eval_device, total_npts ); buffer_adaptor dden_xy_mem( base_stack.dden_xy_eval_device, total_npts ); buffer_adaptor dden_xz_mem( base_stack.dden_xz_eval_device, total_npts ); - - // MGGA - buffer_adaptor dden_lapl_mem( base_stack.den_lapl_eval_device, total_npts ); - buffer_adaptor vlapl_mem( base_stack.vlapl_eval_device, total_npts ); - buffer_adaptor tau_mem( base_stack.tau_eval_device, total_npts ); - buffer_adaptor vtau_mem( base_stack.vtau_eval_device, total_npts ); + + // second derivative + // RKS + buffer_adaptor tden_s_mem( base_stack.tden_s_eval_device, total_npts ); + buffer_adaptor ttau_s_mem( base_stack.ttau_s_eval_device, total_npts ); + buffer_adaptor tlapl_s_mem( base_stack.tlapl_s_eval_device, total_npts ); + buffer_adaptor v2rho2_mem( base_stack.v2rho2_eval_device, total_npts * rhorho_fac ); + buffer_adaptor v2rhogamma_mem( base_stack.v2rhogamma_eval_device, total_npts * rhogamma_fac ); + buffer_adaptor v2rholapl_mem( base_stack.v2rholapl_eval_device, total_npts * rhotau_fac ); + buffer_adaptor v2rhotau_mem( base_stack.v2rhotau_eval_device, total_npts * rhotau_fac ); + buffer_adaptor v2gamma2_mem( base_stack.v2gamma2_eval_device, total_npts * rhogamma_fac ); + buffer_adaptor v2gammalapl_mem( base_stack.v2gammalapl_eval_device, total_npts * rhogamma_fac ); + buffer_adaptor v2gammatau_mem( base_stack.v2gammatau_eval_device, total_npts * rhogamma_fac ); + buffer_adaptor v2lapl2_mem( base_stack.v2lapl2_eval_device, total_npts * rhorho_fac ); + buffer_adaptor v2lapltau_mem( base_stack.v2lapltau_eval_device, total_npts * rhotau_fac ); + buffer_adaptor v2tau2_mem( base_stack.v2tau2_eval_device, total_npts * rhorho_fac ); + + // Polarized KS + buffer_adaptor tden_z_mem( base_stack.tden_z_eval_device, total_npts ); + buffer_adaptor tden_y_mem( base_stack.tden_y_eval_device, total_npts ); + buffer_adaptor tden_x_mem( base_stack.tden_x_eval_device, total_npts ); + buffer_adaptor ttau_z_mem( base_stack.ttau_z_eval_device, total_npts ); + buffer_adaptor tlapl_z_mem( base_stack.tlapl_z_eval_device, total_npts ); + + buffer_adaptor v2rho2_a_a_mem( base_stack.v2rho2_a_a_eval_device, total_npts ); + buffer_adaptor v2rho2_a_b_mem( base_stack.v2rho2_a_b_eval_device, total_npts ); + buffer_adaptor v2rho2_b_b_mem( base_stack.v2rho2_b_b_eval_device, total_npts ); + buffer_adaptor v2rhogamma_a_aa_mem( base_stack.v2rhogamma_a_aa_eval_device, total_npts ); + buffer_adaptor 
v2rhogamma_a_ab_mem( base_stack.v2rhogamma_a_ab_eval_device, total_npts ); + buffer_adaptor v2rhogamma_a_bb_mem( base_stack.v2rhogamma_a_bb_eval_device, total_npts ); + buffer_adaptor v2rhogamma_b_aa_mem( base_stack.v2rhogamma_b_aa_eval_device, total_npts ); + buffer_adaptor v2rhogamma_b_ab_mem( base_stack.v2rhogamma_b_ab_eval_device, total_npts ); + buffer_adaptor v2rhogamma_b_bb_mem( base_stack.v2rhogamma_b_bb_eval_device, total_npts ); + buffer_adaptor v2rholapl_a_a_mem( base_stack.v2rholapl_a_a_eval_device, total_npts ); + buffer_adaptor v2rholapl_a_b_mem( base_stack.v2rholapl_a_b_eval_device, total_npts ); + buffer_adaptor v2rholapl_b_a_mem( base_stack.v2rholapl_b_a_eval_device, total_npts ); + buffer_adaptor v2rholapl_b_b_mem( base_stack.v2rholapl_b_b_eval_device, total_npts ); + buffer_adaptor v2rhotau_a_a_mem( base_stack.v2rhotau_a_a_eval_device, total_npts ); + buffer_adaptor v2rhotau_a_b_mem( base_stack.v2rhotau_a_b_eval_device, total_npts ); + buffer_adaptor v2rhotau_b_a_mem( base_stack.v2rhotau_b_a_eval_device, total_npts ); + buffer_adaptor v2rhotau_b_b_mem( base_stack.v2rhotau_b_b_eval_device, total_npts ); + buffer_adaptor v2gamma2_aa_aa_mem( base_stack.v2gamma2_aa_aa_eval_device, total_npts ); + buffer_adaptor v2gamma2_aa_ab_mem( base_stack.v2gamma2_aa_ab_eval_device, total_npts ); + buffer_adaptor v2gamma2_aa_bb_mem( base_stack.v2gamma2_aa_bb_eval_device, total_npts ); + buffer_adaptor v2gamma2_ab_ab_mem( base_stack.v2gamma2_ab_ab_eval_device, total_npts ); + buffer_adaptor v2gamma2_ab_bb_mem( base_stack.v2gamma2_ab_bb_eval_device, total_npts ); + buffer_adaptor v2gamma2_bb_bb_mem( base_stack.v2gamma2_bb_bb_eval_device, total_npts ); + buffer_adaptor v2gammalapl_aa_a_mem( base_stack.v2gammalapl_aa_a_eval_device, total_npts ); + buffer_adaptor v2gammalapl_aa_b_mem( base_stack.v2gammalapl_aa_b_eval_device, total_npts ); + buffer_adaptor v2gammalapl_ab_a_mem( base_stack.v2gammalapl_ab_a_eval_device, total_npts ); + buffer_adaptor v2gammalapl_ab_b_mem( base_stack.v2gammalapl_ab_b_eval_device, total_npts ); + buffer_adaptor v2gammalapl_bb_a_mem( base_stack.v2gammalapl_bb_a_eval_device, total_npts ); + buffer_adaptor v2gammalapl_bb_b_mem( base_stack.v2gammalapl_bb_b_eval_device, total_npts ); + buffer_adaptor v2gammatau_aa_a_mem( base_stack.v2gammatau_aa_a_eval_device, total_npts ); + buffer_adaptor v2gammatau_aa_b_mem( base_stack.v2gammatau_aa_b_eval_device, total_npts ); + buffer_adaptor v2gammatau_ab_a_mem( base_stack.v2gammatau_ab_a_eval_device, total_npts ); + buffer_adaptor v2gammatau_ab_b_mem( base_stack.v2gammatau_ab_b_eval_device, total_npts ); + buffer_adaptor v2gammatau_bb_a_mem( base_stack.v2gammatau_bb_a_eval_device, total_npts ); + buffer_adaptor v2gammatau_bb_b_mem( base_stack.v2gammatau_bb_b_eval_device, total_npts ); + buffer_adaptor v2lapl2_a_a_mem( base_stack.v2lapl2_a_a_eval_device, total_npts ); + buffer_adaptor v2lapl2_a_b_mem( base_stack.v2lapl2_a_b_eval_device, total_npts ); + buffer_adaptor v2lapl2_b_b_mem( base_stack.v2lapl2_b_b_eval_device, total_npts ); + buffer_adaptor v2lapltau_a_a_mem( base_stack.v2lapltau_a_a_eval_device, total_npts ); + buffer_adaptor v2lapltau_a_b_mem( base_stack.v2lapltau_a_b_eval_device, total_npts ); + buffer_adaptor v2lapltau_b_a_mem( base_stack.v2lapltau_b_a_eval_device, total_npts ); + buffer_adaptor v2lapltau_b_b_mem( base_stack.v2lapltau_b_b_eval_device, total_npts ); + buffer_adaptor v2tau2_a_a_mem( base_stack.v2tau2_a_a_eval_device, total_npts ); + buffer_adaptor v2tau2_a_b_mem( base_stack.v2tau2_a_b_eval_device, 
total_npts ); + buffer_adaptor v2tau2_b_b_mem( base_stack.v2tau2_b_b_eval_device, total_npts ); + + // Trial density gradient + buffer_adaptor tdden_sx_mem( base_stack.tdden_sx_eval_device, total_npts ); + buffer_adaptor tdden_sy_mem( base_stack.tdden_sy_eval_device, total_npts ); + buffer_adaptor tdden_sz_mem( base_stack.tdden_sz_eval_device, total_npts ); + buffer_adaptor tdden_zx_mem( base_stack.tdden_zx_eval_device, total_npts ); + buffer_adaptor tdden_zy_mem( base_stack.tdden_zy_eval_device, total_npts ); + buffer_adaptor tdden_zz_mem( base_stack.tdden_zz_eval_device, total_npts ); + buffer_adaptor tdden_yx_mem( base_stack.tdden_yx_eval_device, total_npts ); + buffer_adaptor tdden_yy_mem( base_stack.tdden_yy_eval_device, total_npts ); + buffer_adaptor tdden_yz_mem( base_stack.tdden_yz_eval_device, total_npts ); + buffer_adaptor tdden_xx_mem( base_stack.tdden_xx_eval_device, total_npts ); + buffer_adaptor tdden_xy_mem( base_stack.tdden_xy_eval_device, total_npts ); + buffer_adaptor tdden_xz_mem( base_stack.tdden_xz_eval_device, total_npts ); + + // Intermediate matrices for contraction + buffer_adaptor FXC_A_s_mem( base_stack.FXC_A_s_eval_device, total_npts); + buffer_adaptor FXC_Bx_s_mem( base_stack.FXC_Bx_s_eval_device, total_npts); + buffer_adaptor FXC_By_s_mem( base_stack.FXC_By_s_eval_device, total_npts); + buffer_adaptor FXC_Bz_s_mem( base_stack.FXC_Bz_s_eval_device, total_npts); + buffer_adaptor FXC_C_s_mem( base_stack.FXC_C_s_eval_device, total_npts); + buffer_adaptor FXC_A_z_mem( base_stack.FXC_A_z_eval_device, total_npts); + buffer_adaptor FXC_Bx_z_mem( base_stack.FXC_Bx_z_eval_device, total_npts); + buffer_adaptor FXC_By_z_mem( base_stack.FXC_By_z_eval_device, total_npts); + buffer_adaptor FXC_Bz_z_mem( base_stack.FXC_Bz_z_eval_device, total_npts); + buffer_adaptor FXC_C_z_mem( base_stack.FXC_C_z_eval_device, total_npts); for( auto& task : host_device_tasks ) { const auto npts = task.npts; @@ -594,6 +742,11 @@ void XCDeviceAoSData::pack_and_send( if( reqt.task_bfn_lapl ) { task.d2bflapl = d2bf_lapl_mem.aligned_alloc( nbe_bfn * npts, csl); } + if( reqt.task_bfn_lapgrad ) { + task.d3bflapl_x = d3bf_lapgrad_x_mem.aligned_alloc( nbe_bfn * npts, csl); + task.d3bflapl_y = d3bf_lapgrad_y_mem.aligned_alloc( nbe_bfn * npts, csl); + task.d3bflapl_z = d3bf_lapgrad_z_mem.aligned_alloc( nbe_bfn * npts, csl); + } // X Matrix gradient if( reqt.task_xmat_grad ) { @@ -602,12 +755,27 @@ void XCDeviceAoSData::pack_and_send( task.xmat_z = xmat_dz_mem.aligned_alloc( nbe_bfn * npts, csl); } + // Persistent X matrix + if( reqt.task_xmat_persist ) { + task.xmatS = xmatS_mem.aligned_alloc( nbe_bfn * npts, csl); + task.xmatZ = xmatZ_mem.aligned_alloc( nbe_bfn * npts, csl); + + if( reqt.task_xmat_grad ) { + task.xmatS_x = xmatS_dx_mem.aligned_alloc( nbe_bfn * npts, csl); + task.xmatS_y = xmatS_dy_mem.aligned_alloc( nbe_bfn * npts, csl); + task.xmatS_z = xmatS_dz_mem.aligned_alloc( nbe_bfn * npts, csl); + task.xmatZ_x = xmatZ_dx_mem.aligned_alloc( nbe_bfn * npts, csl); + task.xmatZ_y = xmatZ_dy_mem.aligned_alloc( nbe_bfn * npts, csl); + task.xmatZ_z = xmatZ_dz_mem.aligned_alloc( nbe_bfn * npts, csl); + } + } + // Grid function evaluations if (reqt.grid_den) { task.den_s = den_s_mem.aligned_alloc( npts, csl ); if(is_pol) { - task.den = den_mem.aligned_alloc(npts*2, csl); //Interleaved memory + task.den = den_interleaved_mem.aligned_alloc(npts*2, csl); //Interleaved memory task.den_z = den_z_mem.aligned_alloc( npts, csl); if ( is_gks ) { task.den_y = den_y_mem.aligned_alloc( npts, csl); @@ -616,6 +784,55 
@@ void XCDeviceAoSData::pack_and_send( } } + if(reqt.grid_den_grad) { + task.dden_sx = dden_sx_mem.aligned_alloc(npts, csl); + task.dden_sy = dden_sy_mem.aligned_alloc(npts, csl); + task.dden_sz = dden_sz_mem.aligned_alloc(npts, csl); + if( is_pol ) { + task.dden_zx = dden_zx_mem.aligned_alloc( npts, csl ); + task.dden_zy = dden_zy_mem.aligned_alloc( npts, csl ); + task.dden_zz = dden_zz_mem.aligned_alloc( npts, csl ); + if( is_gks ) { + task.dden_yx = dden_yx_mem.aligned_alloc( npts, csl ); + task.dden_yy = dden_yy_mem.aligned_alloc( npts, csl ); + task.dden_yz = dden_yz_mem.aligned_alloc( npts, csl ); + task.dden_xx = dden_xx_mem.aligned_alloc( npts, csl ); + task.dden_xy = dden_xy_mem.aligned_alloc( npts, csl ); + task.dden_xz = dden_xz_mem.aligned_alloc( npts, csl ); + } + } + } + + if( reqt.grid_gamma ) { + task.gamma = gamma_mem.aligned_alloc( npts*gamma_fac, csl); + if( is_pol ) { + task.gamma_pp = gamma_pp_mem.aligned_alloc( npts, csl); + task.gamma_pm = gamma_pm_mem.aligned_alloc( npts, csl); + task.gamma_mm = gamma_mm_mem.aligned_alloc( npts, csl); + } + } + + if (reqt.grid_tau) { + task.tau_s = tau_s_mem.aligned_alloc( npts, csl ); + if(is_pol) { + task.tau = tau_interleaved_mem.aligned_alloc(npts*2, csl); //Interleaved memory + task.tau_z = tau_z_mem.aligned_alloc( npts, csl); + } + } + + if (reqt.grid_lapl) { + task.lapl_s = lapl_s_mem.aligned_alloc( npts, csl ); + if(is_pol) { + task.lapl = lapl_interleaved_mem.aligned_alloc(npts*2, csl); //Interleaved memory + task.lapl_z = lapl_z_mem.aligned_alloc( npts, csl); + } + } + + + + if(reqt.grid_eps) + task.eps = eps_mem.aligned_alloc( reqt.grid_eps_size(npts), csl); + if( reqt.grid_vrho ) { task.vrho = vrho_mem.aligned_alloc( npts*den_fac, csl); if( is_pol ) { @@ -632,33 +849,23 @@ void XCDeviceAoSData::pack_and_send( task.vgamma_mm = vgamma_mm_mem.aligned_alloc( npts, csl); } } - if( reqt.grid_gamma ) { - task.gamma = gamma_mem.aligned_alloc( npts*gamma_fac, csl); + + if( reqt.grid_vtau ) { + task.vtau = vtau_mem.aligned_alloc( npts*den_fac, csl); if( is_pol ) { - task.gamma_pp = gamma_pp_mem.aligned_alloc( npts, csl); - task.gamma_pm = gamma_pm_mem.aligned_alloc( npts, csl); - task.gamma_mm = gamma_mm_mem.aligned_alloc( npts, csl); + task.vtau_pos = vtau_pos_mem.aligned_alloc( npts, csl); + task.vtau_neg = vtau_neg_mem.aligned_alloc( npts, csl); } } - if(reqt.grid_den_grad) { - task.dden_sx = dden_sx_mem.aligned_alloc(npts, csl); - task.dden_sy = dden_sy_mem.aligned_alloc(npts, csl); - task.dden_sz = dden_sz_mem.aligned_alloc(npts, csl); + if( reqt.grid_vlapl ) { + task.vlapl = vlapl_mem.aligned_alloc( npts*den_fac, csl); if( is_pol ) { - task.dden_zx = dden_zx_mem.aligned_alloc( npts, csl ); - task.dden_zy = dden_zy_mem.aligned_alloc( npts, csl ); - task.dden_zz = dden_zz_mem.aligned_alloc( npts, csl ); - if( is_gks ) { - task.dden_yx = dden_yx_mem.aligned_alloc( npts, csl ); - task.dden_yy = dden_yy_mem.aligned_alloc( npts, csl ); - task.dden_yz = dden_yz_mem.aligned_alloc( npts, csl ); - task.dden_xx = dden_xx_mem.aligned_alloc( npts, csl ); - task.dden_xy = dden_xy_mem.aligned_alloc( npts, csl ); - task.dden_xz = dden_xz_mem.aligned_alloc( npts, csl ); - } + task.vlapl_pos = vlapl_pos_mem.aligned_alloc( npts, csl); + task.vlapl_neg = vlapl_neg_mem.aligned_alloc( npts, csl); } } + // H, K terms (GKS) if( is_gks ) { @@ -671,21 +878,6 @@ void XCDeviceAoSData::pack_and_send( task.H_z = H_z_mem.aligned_alloc( npts, csl ); } } - - task.eps = eps_mem.aligned_alloc( reqt.grid_eps_size(npts), csl); - - - if(reqt.grid_den_lapl) { - 
task.denlapl = dden_lapl_mem.aligned_alloc(npts, csl); - } - - task.tau = - tau_mem.aligned_alloc( reqt.grid_tau_size(npts), csl); - - task.vtau = - vtau_mem.aligned_alloc( reqt.grid_vtau_size(npts), csl); - task.vlapl = - vlapl_mem.aligned_alloc( reqt.grid_vlapl_size(npts), csl); // EXX Specific task.fmat = fmat_mem.aligned_alloc( @@ -699,6 +891,185 @@ void XCDeviceAoSData::pack_and_send( reqt.task_bfn_shell_indirection_size(nbe_bfn), csl ); + // Second derivative + if( terms.fxc_contraction ) { + // Trial density + if(reqt.grid_tden) { + task.tden_s = tden_s_mem.aligned_alloc( npts, csl ); + if(is_pol) { + task.tden_z = tden_z_mem.aligned_alloc( npts, csl ); + if(is_gks) { + task.tden_y = tden_y_mem.aligned_alloc( npts, csl ); + task.tden_x = tden_x_mem.aligned_alloc( npts, csl ); + } + } + } + + if(reqt.grid_tden_grad) { + task.tdden_sx = tdden_sx_mem.aligned_alloc( npts, csl ); + task.tdden_sy = tdden_sy_mem.aligned_alloc( npts, csl ); + task.tdden_sz = tdden_sz_mem.aligned_alloc( npts, csl ); + if(is_pol) { + task.tdden_zx = tdden_zx_mem.aligned_alloc( npts, csl ); + task.tdden_zy = tdden_zy_mem.aligned_alloc( npts, csl ); + task.tdden_zz = tdden_zz_mem.aligned_alloc( npts, csl ); + if(is_gks) { + task.tdden_yx = tdden_yx_mem.aligned_alloc( npts, csl ); + task.tdden_yy = tdden_yy_mem.aligned_alloc( npts, csl ); + task.tdden_yz = tdden_yz_mem.aligned_alloc( npts, csl ); + task.tdden_xx = tdden_xx_mem.aligned_alloc( npts, csl ); + task.tdden_xy = tdden_xy_mem.aligned_alloc( npts, csl ); + task.tdden_xz = tdden_xz_mem.aligned_alloc( npts, csl ); + } + } + } + + + if(reqt.grid_ttau) { + task.ttau_s = ttau_s_mem.aligned_alloc( npts, csl ); + if(is_pol) { + task.ttau_z = ttau_z_mem.aligned_alloc( npts, csl ); + } + } + + if(reqt.grid_tlapl) { + task.tlapl_s = tlapl_s_mem.aligned_alloc( npts, csl ); + if(is_pol) { + task.tlapl_z = tlapl_z_mem.aligned_alloc( npts, csl ); + } + } + + // Second derivatives of XC functional + if(reqt.grid_v2rho2) { + task.v2rho2 = v2rho2_mem.aligned_alloc( npts*rhorho_fac, csl ); + if(is_pol) { + task.v2rho2_a_a = v2rho2_a_a_mem.aligned_alloc( npts, csl ); + task.v2rho2_a_b = v2rho2_a_b_mem.aligned_alloc( npts, csl ); + task.v2rho2_b_b = v2rho2_b_b_mem.aligned_alloc( npts, csl ); + } + } + + if(reqt.grid_v2rhogamma) { + task.v2rhogamma = v2rhogamma_mem.aligned_alloc( npts*rhogamma_fac, csl ); + if(is_pol) { + task.v2rhogamma_a_aa = v2rhogamma_a_aa_mem.aligned_alloc( npts, csl ); + task.v2rhogamma_a_ab = v2rhogamma_a_ab_mem.aligned_alloc( npts, csl ); + task.v2rhogamma_a_bb = v2rhogamma_a_bb_mem.aligned_alloc( npts, csl ); + task.v2rhogamma_b_aa = v2rhogamma_b_aa_mem.aligned_alloc( npts, csl ); + task.v2rhogamma_b_ab = v2rhogamma_b_ab_mem.aligned_alloc( npts, csl ); + task.v2rhogamma_b_bb = v2rhogamma_b_bb_mem.aligned_alloc( npts, csl ); + } + } + + if(reqt.grid_v2rholapl) { + task.v2rholapl = v2rholapl_mem.aligned_alloc( npts*rhotau_fac, csl ); + if(is_pol) { + task.v2rholapl_a_a = v2rholapl_a_a_mem.aligned_alloc( npts, csl ); + task.v2rholapl_a_b = v2rholapl_a_b_mem.aligned_alloc( npts, csl ); + task.v2rholapl_b_a = v2rholapl_b_a_mem.aligned_alloc( npts, csl ); + task.v2rholapl_b_b = v2rholapl_b_b_mem.aligned_alloc( npts, csl ); + } + } + + if(reqt.grid_v2rhotau) { + task.v2rhotau = v2rhotau_mem.aligned_alloc( npts*rhotau_fac, csl ); + if(is_pol) { + task.v2rhotau_a_a = v2rhotau_a_a_mem.aligned_alloc( npts, csl ); + task.v2rhotau_a_b = v2rhotau_a_b_mem.aligned_alloc( npts, csl ); + task.v2rhotau_b_a = v2rhotau_b_a_mem.aligned_alloc( npts, csl ); + 
task.v2rhotau_b_b = v2rhotau_b_b_mem.aligned_alloc( npts, csl ); + } + } + + if(reqt.grid_v2gamma2) { + task.v2gamma2 = v2gamma2_mem.aligned_alloc( npts*rhogamma_fac, csl ); + if(is_pol) { + task.v2gamma2_aa_aa = v2gamma2_aa_aa_mem.aligned_alloc( npts, csl ); + task.v2gamma2_aa_ab = v2gamma2_aa_ab_mem.aligned_alloc( npts, csl ); + task.v2gamma2_aa_bb = v2gamma2_aa_bb_mem.aligned_alloc( npts, csl ); + task.v2gamma2_ab_ab = v2gamma2_ab_ab_mem.aligned_alloc( npts, csl ); + task.v2gamma2_ab_bb = v2gamma2_ab_bb_mem.aligned_alloc( npts, csl ); + task.v2gamma2_bb_bb = v2gamma2_bb_bb_mem.aligned_alloc( npts, csl ); + } + } + + if(reqt.grid_v2gammalapl) { + task.v2gammalapl = v2gammalapl_mem.aligned_alloc( npts*rhogamma_fac, csl ); + if(is_pol) { + task.v2gammalapl_aa_a = v2gammalapl_aa_a_mem.aligned_alloc( npts, csl ); + task.v2gammalapl_aa_b = v2gammalapl_aa_b_mem.aligned_alloc( npts, csl ); + task.v2gammalapl_ab_a = v2gammalapl_ab_a_mem.aligned_alloc( npts, csl ); + task.v2gammalapl_ab_b = v2gammalapl_ab_b_mem.aligned_alloc( npts, csl ); + task.v2gammalapl_bb_a = v2gammalapl_bb_a_mem.aligned_alloc( npts, csl ); + task.v2gammalapl_bb_b = v2gammalapl_bb_b_mem.aligned_alloc( npts, csl ); + } + } + + if(reqt.grid_v2gammatau) { + task.v2gammatau = v2gammatau_mem.aligned_alloc( npts*rhogamma_fac, csl ); + if(is_pol) { + task.v2gammatau_aa_a = v2gammatau_aa_a_mem.aligned_alloc( npts, csl ); + task.v2gammatau_aa_b = v2gammatau_aa_b_mem.aligned_alloc( npts, csl ); + task.v2gammatau_ab_a = v2gammatau_ab_a_mem.aligned_alloc( npts, csl ); + task.v2gammatau_ab_b = v2gammatau_ab_b_mem.aligned_alloc( npts, csl ); + task.v2gammatau_bb_a = v2gammatau_bb_a_mem.aligned_alloc( npts, csl ); + task.v2gammatau_bb_b = v2gammatau_bb_b_mem.aligned_alloc( npts, csl ); + } + } + + if(reqt.grid_v2lapl2) { + task.v2lapl2 = v2lapl2_mem.aligned_alloc( npts*rhorho_fac, csl ); + if(is_pol) { + task.v2lapl2_a_a = v2lapl2_a_a_mem.aligned_alloc( npts, csl ); + task.v2lapl2_a_b = v2lapl2_a_b_mem.aligned_alloc( npts, csl ); + task.v2lapl2_b_b = v2lapl2_b_b_mem.aligned_alloc( npts, csl ); + } + } + + if(reqt.grid_v2lapltau) { + task.v2lapltau = v2lapltau_mem.aligned_alloc( npts*rhotau_fac, csl ); + if(is_pol) { + task.v2lapltau_a_a = v2lapltau_a_a_mem.aligned_alloc( npts, csl ); + task.v2lapltau_a_b = v2lapltau_a_b_mem.aligned_alloc( npts, csl ); + task.v2lapltau_b_a = v2lapltau_b_a_mem.aligned_alloc( npts, csl ); + task.v2lapltau_b_b = v2lapltau_b_b_mem.aligned_alloc( npts, csl ); + } + } + + if(reqt.grid_v2tau2) { + task.v2tau2 = v2tau2_mem.aligned_alloc( npts*rhorho_fac, csl ); + if(is_pol) { + task.v2tau2_a_a = v2tau2_a_a_mem.aligned_alloc( npts, csl ); + task.v2tau2_a_b = v2tau2_a_b_mem.aligned_alloc( npts, csl ); + task.v2tau2_b_b = v2tau2_b_b_mem.aligned_alloc( npts, csl ); + } + } + + // Intermediate matrices for contraction + if(reqt.grid_FXC_A) { + task.FXC_A_s = FXC_A_s_mem.aligned_alloc( npts, csl ); + if (is_pol) + task.FXC_A_z = FXC_A_z_mem.aligned_alloc( npts, csl ); + } + + if(reqt.grid_FXC_B) { + task.FXC_Bx_s = FXC_Bx_s_mem.aligned_alloc( npts, csl ); + task.FXC_By_s = FXC_By_s_mem.aligned_alloc( npts, csl ); + task.FXC_Bz_s = FXC_Bz_s_mem.aligned_alloc( npts, csl ); + if (is_pol) { + task.FXC_Bx_z = FXC_Bx_z_mem.aligned_alloc( npts, csl ); + task.FXC_By_z = FXC_By_z_mem.aligned_alloc( npts, csl ); + task.FXC_Bz_z = FXC_Bz_z_mem.aligned_alloc( npts, csl ); + } + } + + if(reqt.grid_FXC_C) { + task.FXC_C_s = FXC_C_s_mem.aligned_alloc( npts, csl ); + if (is_pol) + task.FXC_C_z = FXC_C_z_mem.aligned_alloc( npts, csl ); + 
} + } + } // Loop over device tasks } // Setup indirection diff --git a/src/xc_integrator/xc_data/device/xc_device_aos_data.hpp b/src/xc_integrator/xc_data/device/xc_device_aos_data.hpp index db399d07..d1c3b782 100644 --- a/src/xc_integrator/xc_data/device/xc_device_aos_data.hpp +++ b/src/xc_integrator/xc_data/device/xc_device_aos_data.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -41,6 +45,9 @@ struct XCDeviceAoSData : public XCDeviceStackData { double* d2bf_zz_eval_device = nullptr; ///< 2nd Derivative of `bf_eval_device` wrt z+z double* d2bf_lapl_eval_device = nullptr; ///< Laplacian of `bf_eval_device` + double* d3bf_lapgrad_x_eval_device = nullptr; ///< Laplacian derivative of bf_eval_device wrt x + double* d3bf_lapgrad_y_eval_device = nullptr; ///< Laplacian derivative of bf_eval_device wrt y + double* d3bf_lapgrad_z_eval_device = nullptr; ///< Laplacian derivative of bf_eval_device wrt z // VXC Z Matrix double* zmat_vxc_device = nullptr; @@ -51,6 +58,16 @@ struct XCDeviceAoSData : public XCDeviceStackData { double* xmat_dy_device = nullptr; double* xmat_dz_device = nullptr; + // Persistent X mat + double* xmatS_device = nullptr; + double* xmatS_dx_device = nullptr; + double* xmatS_dy_device = nullptr; + double* xmatS_dz_device = nullptr; + double* xmatZ_device = nullptr; + double* xmatZ_dx_device = nullptr; + double* xmatZ_dy_device = nullptr; + double* xmatZ_dz_device = nullptr; + // EXX Intermediates double* fmat_exx_device = nullptr; double* gmat_exx_device = nullptr; diff --git a/src/xc_integrator/xc_data/device/xc_device_data.hpp b/src/xc_integrator/xc_data/device/xc_device_data.hpp index fde8158b..781e2372 100644 --- a/src/xc_integrator/xc_data/device/xc_device_data.hpp +++ b/src/xc_integrator/xc_data/device/xc_device_data.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
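The per-task FXC allocations above all reuse one pattern: each grid quantity gets a single device region sized for the whole batch (total_npts, or nbe*npts for matrix quantities), wrapped in a buffer_adaptor, and the task loop then carves per-task windows out of it. A simplified host-side mock of that carving, assuming only the (pointer, size) construction and sequential allocation behavior visible in this file (the real buffer_adaptor also handles alignment and the csl argument):

// Simplified illustration of the buffer_adaptor carving pattern in pack_and_send.
#include <cstddef>
#include <stdexcept>
#include <vector>
#include <iostream>

struct mock_buffer_adaptor {
  double* ptr; std::size_t left;
  mock_buffer_adaptor( double* p, std::size_t n ) : ptr(p), left(n) {}
  double* aligned_alloc( std::size_t count ) {
    if( count > left ) throw std::runtime_error("buffer exhausted");
    double* window = ptr; ptr += count; left -= count;
    return window;
  }
};

int main() {
  const std::size_t total_npts = 1000;
  std::vector<double> fxc_a_batch( total_npts );              // stands in for FXC_A_s_eval_device
  mock_buffer_adaptor FXC_A_s_mem( fxc_a_batch.data(), total_npts );

  for( std::size_t npts : { 300u, 450u, 250u } ) {            // per-task point counts
    double* task_FXC_A_s = FXC_A_s_mem.aligned_alloc( npts ); // per-task window
    std::cout << "task window offset: " << (task_FXC_A_s - fxc_a_batch.data()) << "\n";
  }
}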
* * See LICENSE.txt for details */ @@ -49,6 +53,7 @@ struct integrator_term_tracker { bool exc_grad = false; bool exx = false; bool exx_ek_screening = false; + bool fxc_contraction = false; integrator_xc_approx xc_approx = _UNDEF_APPROX; integrator_ks_scheme ks_scheme = _UNDEF_SCHEME; inline void reset() { @@ -72,7 +77,7 @@ struct required_term_storage { // Evaluation of functions on the grid (linear storage) bool grid_den = false; bool grid_den_grad = false; - bool grid_den_lapl = false; + bool grid_lapl = false; bool grid_gamma = false; bool grid_tau = false; bool grid_eps = false; @@ -80,6 +85,25 @@ struct required_term_storage { bool grid_vgamma = false; bool grid_vtau = false; bool grid_vlapl = false; + + // Second derivative variables + bool grid_tden = false; + bool grid_tden_grad = false; + bool grid_ttau = false; + bool grid_tlapl = false; + bool grid_v2rho2 = false; + bool grid_v2rhogamma = false; + bool grid_v2rholapl = false; + bool grid_v2rhotau = false; + bool grid_v2gamma2 = false; + bool grid_v2gammalapl = false; + bool grid_v2gammatau = false; + bool grid_v2lapl2 = false; + bool grid_v2lapltau = false; + bool grid_v2tau2 = false; + bool grid_FXC_A = false; + bool grid_FXC_B = false; + bool grid_FXC_C = false; // Reference flags for memory management use @@ -114,11 +138,29 @@ struct required_term_storage { } return 0ul; } - inline size_t grid_den_lapl_size(size_t npts){ - return PRDVL(grid_den_lapl, npts); + inline size_t grid_lapl_size(size_t npts){ + if(grid_lapl) { + switch(ref_tracker.ks_scheme) { + case UKS: + case GKS: + return 4 * npts; + default: + return npts; + } + } + return 0ul; } inline size_t grid_tau_size(size_t npts){ - return PRDVL(grid_tau, npts); + if(grid_tau) { + switch(ref_tracker.ks_scheme) { + case UKS: + case GKS: + return 4 * npts; + default: + return npts; + } + } + return 0ul; } inline size_t grid_eps_size(size_t npts){ return PRDVL(grid_eps, npts); @@ -147,10 +189,175 @@ struct required_term_storage { return 0ul; } inline size_t grid_vtau_size(size_t npts){ - return PRDVL(grid_vtau, npts); + if(grid_vtau) { + switch(ref_tracker.ks_scheme) { + case UKS: + case GKS: + return 4 * npts; + default: + return npts; + } + } + return 0ul; } inline size_t grid_vlapl_size(size_t npts){ - return PRDVL(grid_vlapl, npts); + if(grid_vlapl) { + switch(ref_tracker.ks_scheme) { + case UKS: + case GKS: + return 4 * npts; + default: + return npts; + } + } + return 0ul; + } + + // Size calculators for second derivative variables + inline size_t grid_tden_size(size_t npts){ + if( grid_tden ) { + if( ref_tracker.ks_scheme == RKS ) return npts; + // 2*npts for S,Z densities, 2*npts for interleaved density + if( ref_tracker.ks_scheme == UKS ) return 2*npts; + // Same as above, but also X,Y densities + if( ref_tracker.ks_scheme == GKS ) return 4*npts; + } + return 0ul; + } + + inline size_t grid_tden_grad_size(size_t npts){ + if( grid_tden_grad ) { + // 3*npts for each density in play + if( ref_tracker.ks_scheme == RKS ) return 3*npts; + if( ref_tracker.ks_scheme == UKS ) return 6*npts; + if( ref_tracker.ks_scheme == GKS ) return 12*npts; + } + return 0ul; + } + + inline size_t grid_tlapl_size(size_t npts){ + if(grid_tlapl) { + switch(ref_tracker.ks_scheme) { + case UKS: + case GKS: + return 2 * npts; + default: + return npts; + } + } + return 0ul; + } + + inline size_t grid_ttau_size(size_t npts){ + if(grid_ttau) { + switch(ref_tracker.ks_scheme) { + case UKS: + case GKS: + return 2 * npts; + default: + return npts; + } + } + return 0ul; + } + + inline size_t 
grid_v2rho2_size(size_t npts){ + if(grid_v2rho2) { + if( ref_tracker.ks_scheme == RKS ) return npts; + if( ref_tracker.ks_scheme == UKS or ref_tracker.ks_scheme == GKS ) return 6*npts; + } + return 0ul; + } + + inline size_t grid_v2rhogamma_size(size_t npts){ + if(grid_v2rhogamma) { + if( ref_tracker.ks_scheme == RKS ) return npts; + if( ref_tracker.ks_scheme == UKS or ref_tracker.ks_scheme == GKS ) return 12*npts; + } + return 0ul; + } + + inline size_t grid_v2rholapl_size(size_t npts){ + if(grid_v2rholapl) { + if( ref_tracker.ks_scheme == RKS ) return npts; + if( ref_tracker.ks_scheme == UKS or ref_tracker.ks_scheme == GKS ) return 8*npts; + } + return 0ul; + } + + inline size_t grid_v2rhotau_size(size_t npts){ + if(grid_v2rhotau) { + if( ref_tracker.ks_scheme == RKS ) return npts; + if( ref_tracker.ks_scheme == UKS or ref_tracker.ks_scheme == GKS ) return 8*npts; + } + return 0ul; + } + + inline size_t grid_v2gamma2_size(size_t npts){ + if(grid_v2gamma2) { + if( ref_tracker.ks_scheme == RKS ) return npts; + if( ref_tracker.ks_scheme == UKS or ref_tracker.ks_scheme == GKS ) return 12*npts; + } + return 0ul; + } + + inline size_t grid_v2gammalapl_size(size_t npts){ + if(grid_v2gammalapl) { + if( ref_tracker.ks_scheme == RKS ) return npts; + if( ref_tracker.ks_scheme == UKS or ref_tracker.ks_scheme == GKS ) return 12*npts; + } + return 0ul; + } + + inline size_t grid_v2gammatau_size(size_t npts){ + if(grid_v2gammatau) { + if( ref_tracker.ks_scheme == RKS ) return npts; + if( ref_tracker.ks_scheme == UKS or ref_tracker.ks_scheme == GKS ) return 12*npts; + } + return 0ul; + } + + inline size_t grid_v2lapl2_size(size_t npts){ + if(grid_v2lapl2) { + if( ref_tracker.ks_scheme == RKS ) return npts; + if( ref_tracker.ks_scheme == UKS or ref_tracker.ks_scheme == GKS ) return 6*npts; + } + return 0ul; + } + + inline size_t grid_v2lapltau_size(size_t npts){ + if(grid_v2lapltau) { + if( ref_tracker.ks_scheme == RKS ) return npts; + if( ref_tracker.ks_scheme == UKS or ref_tracker.ks_scheme == GKS ) return 8*npts; + } + return 0ul; + } + + inline size_t grid_v2tau2_size(size_t npts){ + if(grid_v2tau2) { + if( ref_tracker.ks_scheme == RKS ) return npts; + if( ref_tracker.ks_scheme == UKS or ref_tracker.ks_scheme == GKS ) return 6*npts; + } + return 0ul; + } + + inline size_t grid_FXC_A_size(size_t npts){ + if( grid_FXC_A ) { + if( ref_tracker.ks_scheme == RKS ) return npts; + if( ref_tracker.ks_scheme == UKS or ref_tracker.ks_scheme == GKS ) return 2*npts; + } + } + inline size_t grid_FXC_B_size(size_t npts){ + if( grid_FXC_B ) { + if( ref_tracker.ks_scheme == RKS ) return 3*npts; + if( ref_tracker.ks_scheme == UKS or ref_tracker.ks_scheme == GKS ) return 6*npts; + } + } + inline size_t grid_FXC_C_size(size_t npts){ + if( grid_FXC_C ) { + if( ref_tracker.ks_scheme == RKS ) return npts; + if( ref_tracker.ks_scheme == UKS or ref_tracker.ks_scheme == GKS ) return 2*npts; + } } @@ -160,9 +367,11 @@ struct required_term_storage { bool task_bfn_grad = false; bool task_bfn_hess = false; bool task_bfn_lapl = false; + bool task_bfn_lapgrad = false; bool task_zmat = false; bool task_xmat = false; bool task_xmat_grad = false; + bool task_xmat_persist = false; bool task_fmat = false; bool task_gmat = false; bool task_nbe_scr = false; @@ -181,12 +390,19 @@ struct required_term_storage { inline size_t task_bfn_lapl_size(size_t nbe, size_t npts) { return PRDVL(task_bfn_lapl, nbe * npts); } + inline size_t task_bfn_lapgrad_size(size_t nbe, size_t npts) { + return PRDVL(task_bfn_lapgrad, 3 * nbe * npts); + } inline 
size_t task_zmat_size(size_t nbe, size_t npts) { return PRDVL(task_zmat, nbe * npts); } inline size_t task_xmat_grad_size(size_t nbe, size_t npts) { return PRDVL(task_xmat_grad, 3 * nbe * npts); } + inline size_t task_xmat_persist_size(size_t nbe, size_t npts) { + // TODO Make this more robust + return PRDVL(task_xmat_persist, 2 * (task_xmat_grad ? 4 : 1) * nbe * npts); + } inline size_t task_fmat_size(size_t nbe, size_t npts) { return PRDVL(task_fmat, nbe * npts); } @@ -305,7 +521,8 @@ struct required_term_storage { } // Allocated terms for XC calculations - const bool is_xc = tracker.exc_vxc or tracker.exc_grad; + const bool is_xc = tracker.exc_vxc or tracker.exc_grad or tracker.fxc_contraction; + const bool is_2nd_deriv = tracker.fxc_contraction; ref_tracker = tracker; @@ -320,10 +537,11 @@ struct required_term_storage { const bool need_lapl = tracker.xc_approx == MGGA_LAPL; const bool is_mgga = is_xc and (need_tau or need_lapl); const bool is_grad = tracker.exc_grad; + const bool is_rks = tracker.ks_scheme == RKS; grid_den = true; grid_den_grad = is_gga or is_mgga or is_grad; - grid_den_lapl = need_lapl; + grid_lapl = need_lapl; grid_gamma = is_gga or is_mgga; grid_tau = is_mgga; grid_eps = true; @@ -334,11 +552,13 @@ struct required_term_storage { task_bfn = true; task_bfn_grad = is_gga or is_mgga or is_grad; - task_bfn_hess = is_gga and is_grad; + task_bfn_hess = (is_gga or is_mgga) and is_grad; task_bfn_lapl = need_lapl; + task_bfn_lapgrad = need_lapl and is_grad; task_zmat = true; task_xmat = true; task_xmat_grad = is_mgga or (is_gga and is_grad); + task_xmat_persist = is_grad and not is_rks; task_nbe_scr = true; task_submat_cut_bfn = true; @@ -350,6 +570,31 @@ struct required_term_storage { shell_to_task_bfn = true; } + if(is_2nd_deriv) { + grid_eps = false; + + grid_tden = true; + grid_tden_grad = true; + grid_tlapl = true; + grid_ttau = true; + grid_v2rho2 = true; + grid_v2rhogamma= true; + grid_v2rholapl = true; + grid_v2rhotau = true; + grid_v2gamma2 = true; + grid_v2gammalapl= true; + grid_v2gammatau= true; + grid_v2lapl2 = true; + grid_v2lapltau = true; + grid_v2tau2 = true; + grid_FXC_A = true; + grid_FXC_B = true; + grid_FXC_C = true; + + // task_bfn_hess = is_gga or is_mgga or is_grad; // TODO: Check this + // task_bfn_lapgrad = need_lapl and is_grad; // TODO: Check this + } + // Density integration if(tracker.den) { grid_den = true; @@ -409,6 +654,7 @@ std::ostream& operator<<( std::ostream& out, const integrator_term_tracker& t ) out << " WEIGHTS " << t.weights << std::endl; out << " DEN " << t.den << std::endl; out << " EXC_VXC " << t.exc_vxc << std::endl; + out << " FXC_CONTRACTION " << t.fxc_contraction << std::endl; out << " EXC_GRAD " << t.exc_grad << std::endl; out << " EXX " << t.exx << std::endl; return out; @@ -432,13 +678,19 @@ struct XCDeviceData { virtual void allocate_static_data_weights( int32_t natoms ) = 0; virtual void allocate_static_data_exc_vxc( int32_t nbf, int32_t nshells, integrator_term_tracker enabled_terms, bool do_vxc ) = 0; virtual void allocate_static_data_den( int32_t nbf, int32_t nshells ) = 0; - virtual void allocate_static_data_exc_grad( int32_t nbf, int32_t nshells, int32_t natoms ) = 0; + virtual void allocate_static_data_exc_grad( int32_t nbf, int32_t nshells, int32_t natoms, integrator_term_tracker enabled_terms ) = 0; virtual void allocate_static_data_exx( int32_t nbf, int32_t nshells, size_t nshell_pairs, size_t nprim_pair_total, int32_t max_l ) = 0; virtual void allocate_static_data_exx_ek_screening( size_t ntasks, int32_t nbf, 
int32_t nshells, int nshell_pairs, int32_t max_l ) = 0; + virtual void allocate_static_data_fxc_contraction( int32_t nbf, int32_t nshells, integrator_term_tracker enabled_terms) = 0; // Send persistent data from host to device virtual void send_static_data_weights( const Molecule& mol, const MolMeta& meta ) = 0; - virtual void send_static_data_density_basis( const double* Ps, int32_t ldps, const double* Pz, int32_t ldpz, const double* Py, int32_t ldpy, const double* Px, int32_t ldpx, const BasisSet& basis ) = 0; + virtual void send_static_data_density_basis( const double* Ps, int32_t ldps, + const double* Pz, int32_t ldpz, const double* Py, int32_t ldpy, + const double* Px, int32_t ldpx, const BasisSet& basis ) = 0; + virtual void send_static_data_trial_density( + const double* tPs, int32_t ldtps, const double* tPz, int32_t ldtpz, + const double* tPy, int32_t ldtpy, const double* tPx, int32_t ldtpx ) = 0; virtual void send_static_data_shell_pairs( const BasisSet&, const ShellPairCollection& ) = 0; virtual void send_static_data_exx_ek_screening( const double* V_max, int32_t ldv, const BasisSetMap&, const ShellPairCollection& ) = 0; @@ -457,6 +709,9 @@ struct XCDeviceData { /// Zero out intermediates for EXX EK screening virtual void zero_exx_ek_screening_intermediates() = 0; + /// Zero out the FXC contraction integrands in device memory + virtual void zero_fxc_contraction_integrands() = 0; + /** Generate task batch to execute on device * * Generate a batch of XC tasks to execute on the device and @@ -487,6 +742,10 @@ struct XCDeviceData { double* VXCs, int32_t ldvxcs, double* VXCz, int32_t ldvxcz, double* VXCy, int32_t ldvxcy, double* VXCx, int32_t ldvxcx ) = 0; + virtual void retrieve_fxc_contraction_integrands( double* N_EL, + double* FXCs, int32_t ldfxcs, double* FXCz, int32_t ldfxcz, + double* FXCy, int32_t ldfxcy, double* FXCx, int32_t ldfxcx ) = 0; + /** Retreive EXC Gradient integrands from device memory * * @param[out] EXC_GRAD Integrated XC Gradient (host) for XC task @@ -516,6 +775,10 @@ struct XCDeviceData { virtual double* exc_device_data() = 0; virtual double* nel_device_data() = 0; virtual double* exx_k_device_data() = 0; + virtual double* fxc_z_device_data() = 0; + virtual double* fxc_s_device_data() = 0; + virtual double* fxc_y_device_data() = 0; + virtual double* fxc_x_device_data() = 0; virtual device_queue queue() = 0; diff --git a/src/xc_integrator/xc_data/device/xc_device_shell_pair_soa.hpp b/src/xc_integrator/xc_data/device/xc_device_shell_pair_soa.hpp index 76119933..3b979c8f 100644 --- a/src/xc_integrator/xc_data/device/xc_device_shell_pair_soa.hpp +++ b/src/xc_integrator/xc_data/device/xc_device_shell_pair_soa.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
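The new XCDeviceData virtuals sketch out the FXC-contraction data flow: allocate static buffers, ship the ground-state and trial densities, zero the accumulators, run the task batches, and retrieve the contracted matrices. A plausible UKS driver sequence inferred from those signatures is shown below; the real orchestration lives in the device integrator implementations and may differ:

// Plausible FXC-contraction driver sequence inferred from the new XCDeviceData
// virtuals (sketch only). UKS case: scalar (s) and z components, Py/Px omitted.
#include <cstdint>

template <typename DeviceData, typename TermTracker, typename Basis>
void drive_fxc_contraction( DeviceData& dev, TermTracker terms, const Basis& basis,
                            int32_t nbf, int32_t nshells,
                            const double* Ps, const double* Pz,
                            const double* tPs, const double* tPz,
                            double* FXCs, double* FXCz, double* N_EL ) {
  // 1. Static allocations for the FXC term (densities, trial densities, outputs)
  dev.allocate_static_data_fxc_contraction( nbf, nshells, terms );

  // 2. Ship ground-state and trial densities (UKS: Py/Px passed as nullptr)
  dev.send_static_data_density_basis( Ps, nbf, Pz, nbf, nullptr, 0, nullptr, 0, basis );
  dev.send_static_data_trial_density( tPs, nbf, tPz, nbf, nullptr, 0, nullptr, 0 );

  // 3. Zero the FXC accumulators before the task batches run
  dev.zero_fxc_contraction_integrands();

  // ... generate and execute device task batches here ...

  // 4. Pull the contracted FXC matrices and the electron count back to host
  dev.retrieve_fxc_contraction_integrands( N_EL, FXCs, nbf, FXCz, nbf,
                                           nullptr, 0, nullptr, 0 );
}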
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/xc_data/device/xc_device_stack_data.cxx b/src/xc_integrator/xc_data/device/xc_device_stack_data.cxx index 1aadd1ab..96ffb888 100644 --- a/src/xc_integrator/xc_data/device/xc_device_stack_data.cxx +++ b/src/xc_integrator/xc_data/device/xc_device_stack_data.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -41,6 +45,10 @@ double* XCDeviceStackData::vxc_x_device_data() { return static_stack.vxc_x_devic double* XCDeviceStackData::exc_device_data() { return static_stack.exc_device; } double* XCDeviceStackData::nel_device_data() { return static_stack.nel_device; } double* XCDeviceStackData::exx_k_device_data() { return static_stack.exx_k_device; } +double* XCDeviceStackData::fxc_s_device_data() { return static_stack.fxc_s_device; } +double* XCDeviceStackData::fxc_z_device_data() { return static_stack.fxc_z_device; } +double* XCDeviceStackData::fxc_y_device_data() { return static_stack.fxc_y_device; } +double* XCDeviceStackData::fxc_x_device_data() { return static_stack.fxc_x_device; } device_queue XCDeviceStackData::queue() { if( not device_backend_ ) GAUXC_GENERIC_EXCEPTION("Invalid Device Backend"); @@ -130,6 +138,51 @@ void XCDeviceStackData::allocate_static_data_exc_vxc( int32_t nbf, int32_t nshel allocated_terms.exc_vxc = true; } + +void XCDeviceStackData::allocate_static_data_fxc_contraction( int32_t nbf, int32_t nshells, integrator_term_tracker enabled_terms ) { + + if( allocated_terms.fxc_contraction ) + GAUXC_GENERIC_EXCEPTION("Attempting to reallocate Stack FXC Contraction"); + if( enabled_terms.ks_scheme == _UNDEF_SCHEME ) + GAUXC_GENERIC_EXCEPTION("Must have a KS Scheme set to allocate Stack EXC VXC"); + + // Save state + global_dims.nshells = nshells; + global_dims.nbf = nbf; + + // Allocate static memory with proper alignment + buffer_adaptor mem( dynmem_ptr, dynmem_sz ); + + static_stack.shells_device = mem.aligned_alloc>( nshells , csl); + static_stack.nel_device = mem.aligned_alloc( 1 , csl); + static_stack.acc_scr_device = mem.aligned_alloc( 1 , csl); + static_stack.dmat_s_device = mem.aligned_alloc( nbf * nbf , csl ); + static_stack.tdmat_s_device = mem.aligned_alloc( nbf * nbf , csl ); + static_stack.fxc_s_device = mem.aligned_alloc( nbf * nbf , csl ); + + allocated_terms.ks_scheme = enabled_terms.ks_scheme; + if( not (allocated_terms.ks_scheme == RKS) ) { + static_stack.dmat_z_device = mem.aligned_alloc( nbf * nbf , csl ); + static_stack.tdmat_z_device = mem.aligned_alloc( nbf * nbf , csl ); + static_stack.fxc_z_device = mem.aligned_alloc( nbf * nbf , csl ); + if( allocated_terms.ks_scheme == GKS ) { + static_stack.dmat_y_device = mem.aligned_alloc( nbf * nbf , csl ); + static_stack.dmat_x_device = mem.aligned_alloc( nbf * nbf , csl ); + static_stack.tdmat_y_device = mem.aligned_alloc( nbf * nbf , csl ); + static_stack.tdmat_x_device = mem.aligned_alloc( nbf * nbf , csl ); + static_stack.fxc_y_device = mem.aligned_alloc( nbf * nbf , csl ); + static_stack.fxc_x_device = mem.aligned_alloc( nbf * nbf , csl ); + } + } + + // Get current stack location + dynmem_ptr = mem.stack(); + dynmem_sz = mem.nleft(); + + + allocated_terms.fxc_contraction = 
true; +} + void XCDeviceStackData::allocate_static_data_den( int32_t nbf, int32_t nshells ) { if( allocated_terms.den ) @@ -155,7 +208,7 @@ void XCDeviceStackData::allocate_static_data_den( int32_t nbf, int32_t nshells ) allocated_terms.den = true; } -void XCDeviceStackData::allocate_static_data_exc_grad( int32_t nbf, int32_t nshells, int32_t natoms ) { +void XCDeviceStackData::allocate_static_data_exc_grad( int32_t nbf, int32_t nshells, int32_t natoms, integrator_term_tracker enabled_terms ) { if( allocated_terms.exc_grad ) GAUXC_GENERIC_EXCEPTION("Attempting to reallocate Stack EXC GRAD"); @@ -173,7 +226,15 @@ void XCDeviceStackData::allocate_static_data_exc_grad( int32_t nbf, int32_t nshe static_stack.nel_device = mem.aligned_alloc( 1 , csl); static_stack.acc_scr_device = mem.aligned_alloc( 1 , csl); - static_stack.dmat_s_device = mem.aligned_alloc( nbf * nbf , csl); + allocated_terms.ks_scheme = enabled_terms.ks_scheme; + static_stack.dmat_s_device = mem.aligned_alloc( nbf * nbf , csl ); + if( not (allocated_terms.ks_scheme == RKS) ) { + static_stack.dmat_z_device = mem.aligned_alloc( nbf * nbf , csl ); + if( allocated_terms.ks_scheme == GKS ) { + static_stack.dmat_y_device = mem.aligned_alloc( nbf * nbf , csl ); + static_stack.dmat_x_device = mem.aligned_alloc( nbf * nbf , csl ); + } + } // Get current stack location dynmem_ptr = mem.stack(); @@ -294,7 +355,7 @@ void XCDeviceStackData::send_static_data_density_basis( const double* Ps, int32_ if( not is_rks and not is_uks and not is_gks ) GAUXC_GENERIC_EXCEPTION("Densities do not match RKS, UKS, or GKS schemes"); - if( not (allocated_terms.exx or allocated_terms.exc_vxc or allocated_terms.exc_grad or allocated_terms.den or allocated_terms.exx_ek_screening) ) + if( not (allocated_terms.exx or allocated_terms.exc_vxc or allocated_terms.exc_grad or allocated_terms.den or allocated_terms.exx_ek_screening or allocated_terms.fxc_contraction ) ) GAUXC_GENERIC_EXCEPTION("Density/Basis Not Stack Allocated"); if( not device_backend_ ) GAUXC_GENERIC_EXCEPTION("Invalid Device Backend"); @@ -323,6 +384,40 @@ void XCDeviceStackData::send_static_data_density_basis( const double* Ps, int32_ } +void XCDeviceStackData::send_static_data_trial_density( + const double* tPs, int32_t ldtps, const double* tPz, int32_t ldtpz, + const double* tPy, int32_t ldtpy, const double* tPx, int32_t ldtpx ) { + + const bool is_gks = (tPz != nullptr) && (tPy != nullptr) && (tPx != nullptr); + const bool is_uks = (tPz != nullptr) && (tPy == nullptr) && (tPx == nullptr); + const bool is_rks = (tPs != nullptr) && (not is_uks and not is_gks); + if( not is_rks and not is_uks and not is_gks ) + GAUXC_GENERIC_EXCEPTION("Trial densities do not match RKS, UKS, or GKS schemes"); + + if( not allocated_terms.fxc_contraction ) + GAUXC_GENERIC_EXCEPTION("Trial Density Not Stack Allocated"); + + if( not device_backend_ ) GAUXC_GENERIC_EXCEPTION("Invalid Device Backend"); + + const auto nbf = global_dims.nbf; + // Check dimensions and copy density + if( ldtps != (int)nbf ) GAUXC_GENERIC_EXCEPTION("LDTps must bf NBF"); + device_backend_->copy_async( nbf*nbf, tPs, static_stack.tdmat_s_device, "tP_scalar H2D" ); + if( not is_rks ) { + if( ldtpz != (int)nbf ) GAUXC_GENERIC_EXCEPTION("LDTpz must bf NBF"); + device_backend_->copy_async( nbf*nbf, tPz, static_stack.tdmat_z_device, "tP_z H2D" ); + if( is_gks ) { + if( ldtpy != (int)nbf ) GAUXC_GENERIC_EXCEPTION("LDTpy must bf NBF"); + if( ldtpx != (int)nbf ) GAUXC_GENERIC_EXCEPTION("LDTpx must bf NBF"); + device_backend_->copy_async( nbf*nbf, tPy, 
static_stack.tdmat_y_device, "tP_y H2D" ); + device_backend_->copy_async( nbf*nbf, tPx, static_stack.tdmat_x_device, "tP_x H2D" ); + } + } + + device_backend_->master_queue_synchronize(); +} + + void XCDeviceStackData::send_static_data_shell_pairs( const BasisSet& basis, const ShellPairCollection& shell_pairs ) { @@ -475,6 +570,19 @@ void XCDeviceStackData::zero_exc_vxc_integrands(integrator_term_tracker enabled_ } +void XCDeviceStackData::zero_fxc_contraction_integrands() { + + if( not device_backend_ ) GAUXC_GENERIC_EXCEPTION("Invalid Device Backend"); + + const auto nbf = global_dims.nbf; + if(static_stack.fxc_s_device) device_backend_->set_zero( nbf*nbf, static_stack.fxc_s_device, "FXCs Zero" ); + if(static_stack.fxc_z_device) device_backend_->set_zero( nbf*nbf, static_stack.fxc_z_device, "FXCz Zero" ); + if(static_stack.fxc_y_device) device_backend_->set_zero( nbf*nbf, static_stack.fxc_y_device, "FXCy Zero" ); + if(static_stack.fxc_x_device) device_backend_->set_zero( nbf*nbf, static_stack.fxc_x_device, "FXCx Zero" ); + device_backend_->set_zero( 1, static_stack.nel_device, "NEL Zero" ); + +} + void XCDeviceStackData::zero_exc_grad_integrands() { if( not device_backend_ ) GAUXC_GENERIC_EXCEPTION("Invalid Device Backend"); @@ -533,6 +641,31 @@ void XCDeviceStackData::retrieve_exc_vxc_integrands( double* EXC, double* N_EL, } +void XCDeviceStackData::retrieve_fxc_contraction_integrands( double* N_EL, + double* FXCs, int32_t ldfxcs, double* FXCz, int32_t ldfxcz, + double* FXCy, int32_t ldfxcy, double* FXCx, int32_t ldfxcx ) { + + const auto nbf = global_dims.nbf; + device_backend_->copy_async( 1, static_stack.nel_device, N_EL, "NEL D2H" ); + + if( ldfxcs and (ldfxcs != (int)nbf) ) GAUXC_GENERIC_EXCEPTION("LDFXCs must be NBF"); + if( FXCs ) + device_backend_->copy_async( nbf*nbf, static_stack.fxc_s_device, FXCs, "FXCs D2H" ); + + if( ldfxcz and (ldfxcz != (int)nbf) ) GAUXC_GENERIC_EXCEPTION("LDFXCz must be NBF"); + if( FXCz ) + device_backend_->copy_async( nbf*nbf, static_stack.fxc_z_device, FXCz, "FXCz D2H" ); + + if( ldfxcy and (ldfxcy != (int)nbf) ) GAUXC_GENERIC_EXCEPTION("LDFXCy must be NBF"); + if( FXCy ) + device_backend_->copy_async( nbf*nbf, static_stack.fxc_y_device, FXCy, "FXCy D2H" ); + + if( ldfxcx and (ldfxcx != (int)nbf) ) GAUXC_GENERIC_EXCEPTION("LDFXCx must be NBF"); + if( FXCx ) + device_backend_->copy_async( nbf*nbf, static_stack.fxc_x_device, FXCx, "FXCx D2H" ); + +} + void XCDeviceStackData::retrieve_den_integrands( double* N_EL ) { if( not device_backend_ ) GAUXC_GENERIC_EXCEPTION("Invalid Device Backend"); @@ -639,7 +772,7 @@ size_t XCDeviceStackData::get_mem_req( // U Variables reqt.grid_den_size(npts) * sizeof(double) + reqt.grid_den_grad_size(npts) * sizeof(double) + - reqt.grid_den_lapl_size(npts) * sizeof(double) + + reqt.grid_lapl_size(npts) * sizeof(double) + // H/K Matrices (GKS) reqt.grid_HK_size(npts) * sizeof(double) + @@ -655,6 +788,29 @@ size_t XCDeviceStackData::get_mem_req( reqt.grid_vtau_size(npts) * sizeof(double) + reqt.grid_vlapl_size(npts) * sizeof(double) ; + // second derivatives + mem_req += + // U variables + reqt.grid_tden_size(npts) * sizeof(double) + + reqt.grid_tden_grad_size(npts) * sizeof(double) + + reqt.grid_tlapl_size(npts) * sizeof(double) + + reqt.grid_ttau_size(npts) * sizeof(double) + + // XC output + reqt.grid_v2rho2_size(npts) * sizeof(double) + + reqt.grid_v2rhogamma_size(npts) * sizeof(double) + + reqt.grid_v2rholapl_size(npts) * sizeof(double) + + reqt.grid_v2rhotau_size(npts) * sizeof(double) + + 
reqt.grid_v2gamma2_size(npts) * sizeof(double) + + reqt.grid_v2gammalapl_size(npts) * sizeof(double) + + reqt.grid_v2gammatau_size(npts) * sizeof(double) + + reqt.grid_v2lapl2_size(npts) * sizeof(double) + + reqt.grid_v2lapltau_size(npts) * sizeof(double) + + reqt.grid_v2tau2_size(npts) * sizeof(double) + + // intermediate output + reqt.grid_FXC_A_size(npts) * sizeof(double) + + reqt.grid_FXC_B_size(npts) * sizeof(double) + + reqt.grid_FXC_C_size(npts) * sizeof(double); + return mem_req; } @@ -708,60 +864,85 @@ XCDeviceStackData::device_buffer_t XCDeviceStackData::allocate_dynamic_stack( // Grid function evaluations if( reqt.grid_den ) { // Density - base_stack.den_s_eval_device = mem.aligned_alloc(msz, aln, csl); - if( is_pol ) { base_stack.den_eval_device = mem.aligned_alloc(2*msz, aln, csl); - base_stack.den_z_eval_device = mem.aligned_alloc(msz, aln, csl); - if( is_gks ){ base_stack.den_y_eval_device = mem.aligned_alloc(msz, aln, csl); - base_stack.den_x_eval_device = mem.aligned_alloc(msz, aln, csl); } + base_stack.den_s_eval_device = mem.aligned_alloc(msz, aln, csl); + + if(is_pol) { + base_stack.den_interleaved_device = mem.aligned_alloc(2*msz, aln, csl); + base_stack.den_z_eval_device = mem.aligned_alloc(msz, aln, csl); + } + + if(is_gks){ + base_stack.den_y_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.den_x_eval_device = mem.aligned_alloc(msz, aln, csl); } } if( reqt.grid_den_grad ) { // Density gradient - base_stack.dden_sx_eval_device = mem.aligned_alloc(msz, aln, csl); - base_stack.dden_sy_eval_device = mem.aligned_alloc(msz, aln, csl); - base_stack.dden_sz_eval_device = mem.aligned_alloc(msz, aln, csl); - - if( is_pol ) { base_stack.dden_zx_eval_device = mem.aligned_alloc(msz, aln, csl); - base_stack.dden_zy_eval_device = mem.aligned_alloc(msz, aln, csl); - base_stack.dden_zz_eval_device = mem.aligned_alloc(msz, aln, csl); - if( is_gks ) { base_stack.dden_yx_eval_device = mem.aligned_alloc(msz, aln, csl); - base_stack.dden_yy_eval_device = mem.aligned_alloc(msz, aln, csl); - base_stack.dden_yz_eval_device = mem.aligned_alloc(msz, aln, csl); - base_stack.dden_xx_eval_device = mem.aligned_alloc(msz, aln, csl); - base_stack.dden_xy_eval_device = mem.aligned_alloc(msz, aln, csl); - base_stack.dden_xz_eval_device = mem.aligned_alloc(msz, aln, csl); } + base_stack.dden_sx_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.dden_sy_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.dden_sz_eval_device = mem.aligned_alloc(msz, aln, csl); + + if(is_pol) { + base_stack.dden_zx_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.dden_zy_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.dden_zz_eval_device = mem.aligned_alloc(msz, aln, csl); + } + if( is_gks ) { + base_stack.dden_yx_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.dden_yy_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.dden_yz_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.dden_xx_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.dden_xy_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.dden_xz_eval_device = mem.aligned_alloc(msz, aln, csl); } } - if( reqt.grid_den_lapl ) { // Density Laplacian - base_stack.den_lapl_eval_device = mem.aligned_alloc(msz, aln, csl); + if( reqt.grid_tau ) { // Tau + base_stack.tau_s_eval_device = mem.aligned_alloc(msz, aln, csl); + if(is_pol) { + base_stack.tau_interleaved_device = mem.aligned_alloc(2*msz, aln, csl); + base_stack.tau_z_eval_device = mem.aligned_alloc(msz, aln, 
csl); + } + } + + if( reqt.grid_lapl ) { // Density Laplacian + base_stack.lapl_s_eval_device = mem.aligned_alloc(msz, aln, csl); + if(is_pol) { + base_stack.lapl_interleaved_device = mem.aligned_alloc(2*msz, aln, csl); + base_stack.lapl_z_eval_device = mem.aligned_alloc(msz, aln, csl); + } } if( reqt.grid_gamma ) { // Gamma - if( is_pol ) { base_stack.gamma_eval_device = mem.aligned_alloc(3 * msz, aln, csl); - base_stack.gamma_pp_eval_device = mem.aligned_alloc(msz, aln, csl); - base_stack.gamma_pm_eval_device = mem.aligned_alloc(msz, aln, csl); - base_stack.gamma_mm_eval_device = mem.aligned_alloc(msz, aln, csl); } - else base_stack.gamma_eval_device = mem.aligned_alloc(msz, aln, csl); + if( is_pol ) { + base_stack.gamma_eval_device = mem.aligned_alloc(3 * msz, aln, csl); + base_stack.gamma_pp_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.gamma_pm_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.gamma_mm_eval_device = mem.aligned_alloc(msz, aln, csl); + } else { + base_stack.gamma_eval_device = mem.aligned_alloc(msz, aln, csl); + } } if( reqt.grid_vrho ) { // Vrho - if( is_pol ) { base_stack.vrho_eval_device = mem.aligned_alloc(2 * msz, aln, csl); - base_stack.vrho_pos_eval_device = mem.aligned_alloc(msz, aln, csl); - base_stack.vrho_neg_eval_device = mem.aligned_alloc(msz, aln, csl); } - else base_stack.vrho_eval_device = mem.aligned_alloc(msz, aln, csl); + if( is_pol ) { + base_stack.vrho_eval_device = mem.aligned_alloc(2 * msz, aln, csl); + base_stack.vrho_pos_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.vrho_neg_eval_device = mem.aligned_alloc(msz, aln, csl); + } else { + base_stack.vrho_eval_device = mem.aligned_alloc(msz, aln, csl); + } } if( reqt.grid_vgamma ) { // Vgamma - if( is_pol ) { base_stack.vgamma_eval_device = mem.aligned_alloc(3*msz, aln, csl); - base_stack.vgamma_pp_eval_device = mem.aligned_alloc(msz, aln, csl); - base_stack.vgamma_pm_eval_device = mem.aligned_alloc(msz, aln, csl); - base_stack.vgamma_mm_eval_device = mem.aligned_alloc(msz, aln, csl); } - else base_stack.vgamma_eval_device = mem.aligned_alloc(msz, aln, csl); - } - - if( reqt.grid_tau ) { // Tau - base_stack.tau_eval_device = mem.aligned_alloc(msz, aln, csl); + if( is_pol ) { + base_stack.vgamma_eval_device = mem.aligned_alloc(3*msz, aln, csl); + base_stack.vgamma_pp_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.vgamma_pm_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.vgamma_mm_eval_device = mem.aligned_alloc(msz, aln, csl); + } else { + base_stack.vgamma_eval_device = mem.aligned_alloc(msz, aln, csl); + } } if( is_gks ) { // H, K matrices @@ -780,11 +961,224 @@ XCDeviceStackData::device_buffer_t XCDeviceStackData::allocate_dynamic_stack( } if( reqt.grid_vtau ) { // Vtau - base_stack.vtau_eval_device = mem.aligned_alloc(msz, aln, csl); + if( is_pol ) { + base_stack.vtau_eval_device = mem.aligned_alloc(2 * msz, aln, csl); + base_stack.vtau_pos_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.vtau_neg_eval_device = mem.aligned_alloc(msz, aln, csl); + } else { + base_stack.vtau_eval_device = mem.aligned_alloc(msz, aln, csl); + } } if( reqt.grid_vlapl ) { // Vlapl - base_stack.vlapl_eval_device = mem.aligned_alloc(msz, aln, csl); + if( is_pol ) { + base_stack.vlapl_eval_device = mem.aligned_alloc(2 * msz, aln, csl); + base_stack.vlapl_pos_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.vlapl_neg_eval_device = mem.aligned_alloc(msz, aln, csl); + } else { + base_stack.vlapl_eval_device = mem.aligned_alloc(msz, 
aln, csl); + } + } + + if( terms.fxc_contraction ) { + // Trial density evaluation + if( reqt.grid_tden ) { + base_stack.tden_s_eval_device = mem.aligned_alloc(msz, aln, csl); + if(is_pol) { + base_stack.tden_z_eval_device = mem.aligned_alloc(msz, aln, csl); + } + if(is_gks){ + base_stack.tden_y_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.tden_x_eval_device = mem.aligned_alloc(msz, aln, csl); + } + } + + // Trial density gradient + if( reqt.grid_tden_grad ) { + base_stack.tdden_sx_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.tdden_sy_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.tdden_sz_eval_device = mem.aligned_alloc(msz, aln, csl); + + if(is_pol) { + base_stack.tdden_zx_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.tdden_zy_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.tdden_zz_eval_device = mem.aligned_alloc(msz, aln, csl); + } + if( is_gks ) { + base_stack.tdden_yx_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.tdden_yy_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.tdden_yz_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.tdden_xx_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.tdden_xy_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.tdden_xz_eval_device = mem.aligned_alloc(msz, aln, csl); + } + } + + // Trial tau + if( reqt.grid_ttau ) { + base_stack.ttau_s_eval_device = mem.aligned_alloc(msz, aln, csl); + if(is_pol) { + base_stack.ttau_z_eval_device = mem.aligned_alloc(msz, aln, csl); + } + } + + // Trial laplacian + if( reqt.grid_tlapl ) { + base_stack.tlapl_s_eval_device = mem.aligned_alloc(msz, aln, csl); + if(is_pol) { + base_stack.tlapl_z_eval_device = mem.aligned_alloc(msz, aln, csl); + } + } + + // Second derivatives of XC functional + if( reqt.grid_v2rho2 ) { + if( is_pol ) { + base_stack.v2rho2_eval_device = mem.aligned_alloc(3 * msz, aln, csl); + base_stack.v2rho2_a_a_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2rho2_a_b_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2rho2_b_b_eval_device = mem.aligned_alloc(msz, aln, csl); + } else { + base_stack.v2rho2_eval_device = mem.aligned_alloc(msz, aln, csl); + } + } + + if( reqt.grid_v2rhogamma ) { + if( is_pol ) { + base_stack.v2rhogamma_eval_device = mem.aligned_alloc(6 * msz, aln, csl); + base_stack.v2rhogamma_a_aa_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2rhogamma_a_ab_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2rhogamma_a_bb_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2rhogamma_b_aa_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2rhogamma_b_ab_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2rhogamma_b_bb_eval_device = mem.aligned_alloc(msz, aln, csl); + } else { + base_stack.v2rhogamma_eval_device = mem.aligned_alloc(msz, aln, csl); + } + } + + if( reqt.grid_v2rholapl ) { + if( is_pol ) { + base_stack.v2rholapl_eval_device = mem.aligned_alloc(4 * msz, aln, csl); + base_stack.v2rholapl_a_a_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2rholapl_a_b_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2rholapl_b_a_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2rholapl_b_b_eval_device = mem.aligned_alloc(msz, aln, csl); + } else { + base_stack.v2rholapl_eval_device = mem.aligned_alloc(msz, aln, csl); + } + } + + if( reqt.grid_v2rhotau ) { + if( is_pol ) { + base_stack.v2rhotau_eval_device = mem.aligned_alloc(4 * msz, aln, csl); 
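// [Editorial aside -- illustrative sketch, not part of the patch] The polarized
// branches in this section size each interleaved buffer by the number of
// second-derivative components per grid point (3 for v2rho2, 6 for v2rhogamma and
// v2gamma2, 4 for the rho-lapl/rho-tau/lapl-tau cross terms, 6 for the
// gamma-lapl/gamma-tau cross terms, 3 for v2lapl2 and v2tau2), matching the
// libxc/ExchCXX component ordering. Assuming every term were requested, the
// per-point scratch this implies is roughly:
//
//   constexpr std::size_t n_pol_v2_components =
//     3 /*v2rho2*/ + 6 /*v2rhogamma*/ + 4 /*v2rholapl*/ + 4 /*v2rhotau*/ +
//     6 /*v2gamma2*/ + 6 /*v2gammalapl*/ + 6 /*v2gammatau*/ +
//     3 /*v2lapl2*/ + 4 /*v2lapltau*/ + 3 /*v2tau2*/;              // = 45
//   // one interleaved buffer plus one scratch array per component,
//   // each sized like the per-task point buffers (msz above)
//   inline std::size_t pol_v2_bytes( std::size_t npts ) {
//     return 2 * n_pol_v2_components * npts * sizeof(double);
//   }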
+ base_stack.v2rhotau_a_a_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2rhotau_a_b_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2rhotau_b_a_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2rhotau_b_b_eval_device = mem.aligned_alloc(msz, aln, csl); + } else { + base_stack.v2rhotau_eval_device = mem.aligned_alloc(msz, aln, csl); + } + } + + if( reqt.grid_v2gamma2 ) { + if( is_pol ) { + base_stack.v2gamma2_eval_device = mem.aligned_alloc(6 * msz, aln, csl); + base_stack.v2gamma2_aa_aa_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2gamma2_aa_ab_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2gamma2_aa_bb_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2gamma2_ab_ab_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2gamma2_ab_bb_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2gamma2_bb_bb_eval_device = mem.aligned_alloc(msz, aln, csl); + } else { + base_stack.v2gamma2_eval_device = mem.aligned_alloc(msz, aln, csl); + } + } + + if( reqt.grid_v2gammalapl ) { + if( is_pol ) { + base_stack.v2gammalapl_eval_device = mem.aligned_alloc(6 * msz, aln, csl); + base_stack.v2gammalapl_aa_a_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2gammalapl_aa_b_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2gammalapl_ab_a_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2gammalapl_ab_b_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2gammalapl_bb_a_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2gammalapl_bb_b_eval_device = mem.aligned_alloc(msz, aln, csl); + } else { + base_stack.v2gammalapl_eval_device = mem.aligned_alloc(msz, aln, csl); + } + } + + if( reqt.grid_v2gammatau ) { + if( is_pol ) { + base_stack.v2gammatau_eval_device = mem.aligned_alloc(6 * msz, aln, csl); + base_stack.v2gammatau_aa_a_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2gammatau_aa_b_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2gammatau_ab_a_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2gammatau_ab_b_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2gammatau_bb_a_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2gammatau_bb_b_eval_device = mem.aligned_alloc(msz, aln, csl); + } else { + base_stack.v2gammatau_eval_device = mem.aligned_alloc(msz, aln, csl); + } + } + + if( reqt.grid_v2lapl2 ) { + if( is_pol ) { + base_stack.v2lapl2_eval_device = mem.aligned_alloc(3 * msz, aln, csl); + base_stack.v2lapl2_a_a_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2lapl2_a_b_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2lapl2_b_b_eval_device = mem.aligned_alloc(msz, aln, csl); + } else { + base_stack.v2lapl2_eval_device = mem.aligned_alloc(msz, aln, csl); + } + } + + if( reqt.grid_v2lapltau ) { + if( is_pol ) { + base_stack.v2lapltau_eval_device = mem.aligned_alloc(4 * msz, aln, csl); + base_stack.v2lapltau_a_a_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2lapltau_a_b_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2lapltau_b_a_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2lapltau_b_b_eval_device = mem.aligned_alloc(msz, aln, csl); + } else { + base_stack.v2lapltau_eval_device = mem.aligned_alloc(msz, aln, csl); + } + } + + if( reqt.grid_v2tau2 ) { + if( is_pol ) { + base_stack.v2tau2_eval_device = mem.aligned_alloc(3 * msz, aln, csl); + base_stack.v2tau2_a_a_eval_device = mem.aligned_alloc(msz, aln, csl); + 
base_stack.v2tau2_a_b_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2tau2_b_b_eval_device = mem.aligned_alloc(msz, aln, csl); + } else { + base_stack.v2tau2_eval_device = mem.aligned_alloc(msz, aln, csl); + } + } + + // Intermediate matrices for contraction + if( reqt.grid_FXC_A ) { + base_stack.FXC_A_s_eval_device = mem.aligned_alloc(msz, aln, csl); + if( is_pol ) + base_stack.FXC_A_z_eval_device = mem.aligned_alloc(msz, aln, csl); + } + + if( reqt.grid_FXC_B ) { + base_stack.FXC_Bx_s_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.FXC_By_s_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.FXC_Bz_s_eval_device = mem.aligned_alloc(msz, aln, csl); + if( is_pol ) { + base_stack.FXC_Bx_z_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.FXC_By_z_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.FXC_Bz_z_eval_device = mem.aligned_alloc(msz, aln, csl); + } + } + + if( reqt.grid_FXC_C ) { + base_stack.FXC_C_s_eval_device = mem.aligned_alloc(msz, aln, csl); + if( is_pol ) + base_stack.FXC_C_z_eval_device = mem.aligned_alloc(msz, aln, csl); + } } diff --git a/src/xc_integrator/xc_data/device/xc_device_stack_data.hpp b/src/xc_integrator/xc_data/device/xc_device_stack_data.hpp index e1a72ec4..cf5399a8 100644 --- a/src/xc_integrator/xc_data/device/xc_device_stack_data.hpp +++ b/src/xc_integrator/xc_data/device/xc_device_stack_data.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -55,6 +59,7 @@ struct XCDeviceStackData : public XCDeviceData { double* exx_k_device = nullptr; ///< EXX K storage (nbf,nbf) double* acc_scr_device = nullptr; ///< Accumulaion scratch (1) double* exc_grad_device = nullptr; ///< EXC Gradient storage (3*natoms) + double* fxc_device = nullptr; ///< FXC contraction storage (nbf,nbf) double* vshell_max_sparse_device = nullptr; size_t* shpair_row_ind_device = nullptr; @@ -72,8 +77,63 @@ struct XCDeviceStackData : public XCDeviceData { double* vxc_z_device = nullptr; /// Ditto for Z,Y,X densities double* vxc_y_device = nullptr; double* vxc_x_device = nullptr; + + // Second derivatives + double* tdmat_s_device = nullptr; ///< Static trial density matrix storage (nbf,nbf) + double* tdmat_z_device = nullptr; /// Ditto for Z,Y,X trial densities + double* tdmat_y_device = nullptr; + double* tdmat_x_device = nullptr; + double* fxc_s_device = nullptr; ///< FXC storage (nbf, nbf) + double* fxc_z_device = nullptr; /// Ditto for Z,Y,X densities + double* fxc_y_device = nullptr; + double* fxc_x_device = nullptr; inline void reset() { std::memset( this, 0, sizeof(static_data) ); } + + inline double* den_selector(density_id den) { + switch(den) { + case DEN_S: return dmat_s_device; + case DEN_Z: return dmat_z_device; + case DEN_Y: return dmat_y_device; + case DEN_X: return dmat_x_device; + default: GAUXC_GENERIC_EXCEPTION("den_selector: density_id not recognized"); + } + return nullptr; + } + + inline double* vxc_selector(density_id den) { + switch(den) { + case DEN_S: return vxc_s_device; + case DEN_Z: return vxc_z_device; + case DEN_Y: return vxc_y_device; + case DEN_X: return vxc_x_device; + default: GAUXC_GENERIC_EXCEPTION("vxc_selector: density_id not recognized"); + } + return nullptr; + } + + inline double* tden_selector(density_id den) { + switch(den) { + case DEN_S: return tdmat_s_device; + case DEN_Z: return tdmat_z_device; + case DEN_Y: return tdmat_y_device; + case DEN_X: return tdmat_x_device; + default: GAUXC_GENERIC_EXCEPTION("tden_selector: density_id not recognized"); + } + return nullptr; + } + + inline double* fxc_selector(density_id den) { + switch(den) { + case DEN_S: return fxc_s_device; + case DEN_Z: return fxc_z_device; + case DEN_Y: return fxc_y_device; + case DEN_X: return fxc_x_device; + default: GAUXC_GENERIC_EXCEPTION("fxc_selector: density_id not recognized"); + } + return nullptr; + } + }; XCDeviceShellPairSoA shell_pair_soa; @@ -92,15 +152,19 @@ struct XCDeviceStackData : public XCDeviceData { double* weights_device = nullptr; ///< Grid weights for task batch // U variables - double* den_s_eval_device = nullptr; ///< scalar density for task batch - double* dden_sx_eval_device = nullptr; ///< d/dx scalar density for task batch - double* dden_sy_eval_device = nullptr; ///< d/dy scalar density for task batch - double* dden_sz_eval_device = nullptr; ///< d/dz scalar density for task batch + double* den_s_eval_device = nullptr; ///< scalar density for task batch + double* dden_sx_eval_device = nullptr; ///< d/dx scalar density for task batch + double* dden_sy_eval_device = nullptr; ///< d/dy scalar density for task batch + double* dden_sz_eval_device = nullptr; ///< d/dz scalar density for task batch + double* tau_s_eval_device = nullptr; ///< scalar tau for task batch + double* lapl_s_eval_device = nullptr; ///< scalar density laplacian for task batch - double* den_z_eval_device = nullptr; ///< z density for task batch - double* dden_zx_eval_device = nullptr; ///< d/dx z density for task batch 
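// [Editorial aside -- illustrative sketch, not part of the patch] The
// den/vxc/tden/fxc *_selector helpers defined above map a density_id onto the
// matching device matrix so call sites do not have to repeat the switch. A hedged
// usage sketch (static_stack is the static_data instance used elsewhere in this
// class; tP_host is a hypothetical host pointer):
//
//   double* tP_dev  = static_stack.tden_selector(DEN_S); // trial density, scalar part
//   double* fxc_dev = static_stack.fxc_selector(DEN_Z);  // FXC accumulator, z part
//   device_backend_->copy_async( nbf*nbf, tP_host, tP_dev, "tP_s H2D" );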
- double* dden_zy_eval_device = nullptr; ///< d/dy z density for task batch - double* dden_zz_eval_device = nullptr; ///< d/dz z density for task batch + double* den_z_eval_device = nullptr; ///< z density for task batch + double* dden_zx_eval_device = nullptr; ///< d/dx z density for task batch + double* dden_zy_eval_device = nullptr; ///< d/dy z density for task batch + double* dden_zz_eval_device = nullptr; ///< d/dz z density for task batch + double* tau_z_eval_device = nullptr; ///< z tau for task batch + double* lapl_z_eval_device = nullptr; ///< z density laplacian for task batch double* den_y_eval_device = nullptr; ///< y density for task batch double* dden_yx_eval_device = nullptr; ///< d/dx y density for task batch @@ -112,21 +176,25 @@ struct XCDeviceStackData : public XCDeviceData { double* dden_xy_eval_device = nullptr; ///< d/dy x density for task batch double* dden_xz_eval_device = nullptr; ///< d/dz x density for task batch - double* den_eval_device = nullptr; /// Storage for interleaved density (non-RKS only) - - double* den_lapl_eval_device = nullptr; ///< density Laplacian for task batch + double* den_interleaved_device = nullptr; /// Storage for interleaved density (non-RKS only) + double* tau_interleaved_device = nullptr; /// Storage for interleaved tau (non-RKS only) + double* lapl_interleaved_device = nullptr; /// Storage for interleaved lapl (non-RKS only) // V variables / XC output double* gamma_eval_device = nullptr; ///< gamma for task batch - double* tau_eval_device = nullptr; ///< tau for task batch double* eps_eval_device = nullptr; ///< XC energy density for task batch double* vrho_eval_device = nullptr; ///< Rho XC derivative for task batch double* vgamma_eval_device = nullptr; ///< Gamma XC derivative for task batch double* vtau_eval_device = nullptr; ///< Tau XC derivative for task batch double* vlapl_eval_device = nullptr; ///< Lapl XC derivative for task batch - double* vrho_pos_eval_device = nullptr; ///< Polarized Rho+ XC derivative for task batch - double* vrho_neg_eval_device = nullptr; ///< Polarized Rho+ XC derivative for task batch + double* vrho_pos_eval_device = nullptr; ///< Polarized Rho+ XC derivative for task batch + double* vrho_neg_eval_device = nullptr; ///< Polarized Rho+ XC derivative for task batch + double* vtau_pos_eval_device = nullptr; + double* vtau_neg_eval_device = nullptr; + double* vlapl_pos_eval_device = nullptr; + double* vlapl_neg_eval_device = nullptr; + double* gamma_pp_eval_device = nullptr; ///< Polarized Gamma++ for task batch double* gamma_pm_eval_device = nullptr; ///< Polarized Gamma+- for task batch @@ -142,6 +210,101 @@ struct XCDeviceStackData : public XCDeviceData { double* K_y_eval_device = nullptr; ///< norm(m) dependent LDA Y transformation factor for task batch double* K_z_eval_device = nullptr; ///< norm(m) dependent LDA Z transformation factor for task batch + // Second derivative intermediates - Trial variables (T) + double* tden_s_eval_device = nullptr; ///< scalar trial density for task batch + double* tdden_sx_eval_device = nullptr; ///< d/dx scalar trial density for task batch + double* tdden_sy_eval_device = nullptr; ///< d/dy scalar trial density for task batch + double* tdden_sz_eval_device = nullptr; ///< d/dz scalar trial density for task batch + double* ttau_s_eval_device = nullptr; ///< scalar trial tau for task batch + double* tlapl_s_eval_device = nullptr; ///< scalar trial density laplacian for task batch + + double* tden_z_eval_device = nullptr; ///< z trial density for task batch + double* 
tdden_zx_eval_device = nullptr; ///< d/dx z trial density for task batch + double* tdden_zy_eval_device = nullptr; ///< d/dy z trial density for task batch + double* tdden_zz_eval_device = nullptr; ///< d/dz z trial density for task batch + double* ttau_z_eval_device = nullptr; ///< z trial tau for task batch + double* tlapl_z_eval_device = nullptr; ///< z trial density laplacian for task batch + + double* tden_y_eval_device = nullptr; ///< y trial density for task batch + double* tdden_yx_eval_device = nullptr; ///< d/dx y trial density for task batch + double* tdden_yy_eval_device = nullptr; ///< d/dy y trial density for task batch + double* tdden_yz_eval_device = nullptr; ///< d/dz y trial density for task batch + + double* tden_x_eval_device = nullptr; ///< x trial density for task batch + double* tdden_xx_eval_device = nullptr; ///< d/dx x trial density for task batch + double* tdden_xy_eval_device = nullptr; ///< d/dy x trial density for task batch + double* tdden_xz_eval_device = nullptr; ///< d/dz x trial density for task batch + + // Second derivative kernel outputs (V2 variables) + double* v2rho2_eval_device = nullptr; ///< 2nd derivative of XC wrt rho^2 + double* v2rhogamma_eval_device = nullptr; ///< 2nd derivative of XC wrt rho-gamma + double* v2rholapl_eval_device = nullptr; ///< 2nd derivative of XC wrt rho-lapl + double* v2rhotau_eval_device = nullptr; ///< 2nd derivative of XC wrt rho-tau + double* v2gamma2_eval_device = nullptr; ///< 2nd derivative of XC wrt gamma^2 + double* v2gammalapl_eval_device = nullptr; ///< 2nd derivative of XC wrt gamma-lapl + double* v2gammatau_eval_device = nullptr; ///< 2nd derivative of XC wrt gamma-tau + double* v2lapl2_eval_device = nullptr; ///< 2nd derivative of XC wrt lapl^2 + double* v2lapltau_eval_device = nullptr; ///< 2nd derivative of XC wrt lapl-tau + double* v2tau2_eval_device = nullptr; ///< 2nd derivative of XC wrt tau^2 + // in unrestricted case, these are 2nd derivatives of XC with alpha (+) and beta (-) densities + double* v2rho2_a_a_eval_device = nullptr; + double* v2rho2_a_b_eval_device = nullptr; + double* v2rho2_b_b_eval_device = nullptr; + double* v2rhogamma_a_aa_eval_device = nullptr; + double* v2rhogamma_a_ab_eval_device = nullptr; + double* v2rhogamma_a_bb_eval_device = nullptr; + double* v2rhogamma_b_aa_eval_device = nullptr; + double* v2rhogamma_b_ab_eval_device = nullptr; + double* v2rhogamma_b_bb_eval_device = nullptr; + double* v2rholapl_a_a_eval_device = nullptr; + double* v2rholapl_a_b_eval_device = nullptr; + double* v2rholapl_b_a_eval_device = nullptr; + double* v2rholapl_b_b_eval_device = nullptr; + double* v2rhotau_a_a_eval_device = nullptr; + double* v2rhotau_a_b_eval_device = nullptr; + double* v2rhotau_b_a_eval_device = nullptr; + double* v2rhotau_b_b_eval_device = nullptr; + double* v2gamma2_aa_aa_eval_device = nullptr; + double* v2gamma2_aa_ab_eval_device = nullptr; + double* v2gamma2_aa_bb_eval_device = nullptr; + double* v2gamma2_ab_ab_eval_device = nullptr; + double* v2gamma2_ab_bb_eval_device = nullptr; + double* v2gamma2_bb_bb_eval_device = nullptr; + double* v2gammalapl_aa_a_eval_device = nullptr; + double* v2gammalapl_aa_b_eval_device = nullptr; + double* v2gammalapl_ab_a_eval_device = nullptr; + double* v2gammalapl_ab_b_eval_device = nullptr; + double* v2gammalapl_bb_a_eval_device = nullptr; + double* v2gammalapl_bb_b_eval_device = nullptr; + double* v2gammatau_aa_a_eval_device = nullptr; + double* v2gammatau_aa_b_eval_device = nullptr; + double* v2gammatau_ab_a_eval_device = nullptr; + double* 
v2gammatau_ab_b_eval_device = nullptr; + double* v2gammatau_bb_a_eval_device = nullptr; + double* v2gammatau_bb_b_eval_device = nullptr; + double* v2lapl2_a_a_eval_device = nullptr; + double* v2lapl2_a_b_eval_device = nullptr; + double* v2lapl2_b_b_eval_device = nullptr; + double* v2lapltau_a_a_eval_device = nullptr; + double* v2lapltau_a_b_eval_device = nullptr; + double* v2lapltau_b_a_eval_device = nullptr; + double* v2lapltau_b_b_eval_device = nullptr; + double* v2tau2_a_a_eval_device = nullptr; + double* v2tau2_a_b_eval_device = nullptr; + double* v2tau2_b_b_eval_device = nullptr; + + // Second derivative kernel outputs (A,B,C variables) + double* FXC_A_s_eval_device = nullptr; + double* FXC_Bx_s_eval_device = nullptr; + double* FXC_By_s_eval_device = nullptr; + double* FXC_Bz_s_eval_device = nullptr; + double* FXC_C_s_eval_device = nullptr; + double* FXC_A_z_eval_device = nullptr; + double* FXC_Bx_z_eval_device = nullptr; + double* FXC_By_z_eval_device = nullptr; + double* FXC_Bz_z_eval_device = nullptr; + double* FXC_C_z_eval_device = nullptr; + inline void reset() { std::memset( this, 0, sizeof(base_stack_data) ); } }; @@ -161,25 +324,33 @@ struct XCDeviceStackData : public XCDeviceData { host_task_iterator, host_task_iterator) override final; void allocate_static_data_weights( int32_t natoms ) override final; void allocate_static_data_exc_vxc( int32_t nbf, int32_t nshells, integrator_term_tracker enabled_terms, bool do_vxc ) override final; + void allocate_static_data_fxc_contraction( int32_t nbf, int32_t nshells, integrator_term_tracker enabled_terms ) override final; void allocate_static_data_den( int32_t nbf, int32_t nshells ) override final; - void allocate_static_data_exc_grad( int32_t nbf, int32_t nshells, int32_t natoms ) override final; + void allocate_static_data_exc_grad( int32_t nbf, int32_t nshells, int32_t natoms, integrator_term_tracker enabled_terms ) override final; void allocate_static_data_exx( int32_t nbf, int32_t nshells, size_t nshell_pairs, size_t nprim_pair_total, int32_t max_l ) override final; void allocate_static_data_exx_ek_screening( size_t ntasks, int32_t nbf, int32_t nshells, int nshell_pairs, int32_t max_l ) override final; void send_static_data_weights( const Molecule& mol, const MolMeta& meta ) override final; void send_static_data_density_basis( const double* Ps, int32_t ldps, const double* Pz, int32_t ldpz, const double* Py, int32_t ldpy, const double* Px, int32_t ldpx, const BasisSet& basis ) override final; + void send_static_data_trial_density( + const double* tPs, int32_t ldtps, const double* tPz, int32_t ldtpz, + const double* tPy, int32_t ldtpy, const double* tPx, int32_t ldtpx ) override final; void send_static_data_shell_pairs( const BasisSet&, const ShellPairCollection& ) override final; void send_static_data_exx_ek_screening( const double* V_max, int32_t ldv, const BasisSetMap&, const ShellPairCollection& ) override final; void zero_den_integrands() override final; void zero_exc_vxc_integrands(integrator_term_tracker t) override final; + void zero_fxc_contraction_integrands() override final; void zero_exc_grad_integrands() override final; void zero_exx_integrands() override final; void zero_exx_ek_screening_intermediates() override final; void retrieve_exc_vxc_integrands( double* EXC, double* N_EL, double* VXCscalar, int32_t ldvxcscalar, double* VXCz, int32_t ldvxcz, double* VXCy , int32_t ldvxcy , double* VXCx, int32_t ldvxcx ) override final; + void retrieve_fxc_contraction_integrands( double* N_EL, + double* FXCs, int32_t ldfxcs, 
double* FXCz, int32_t ldfxcz, + double* FXCy, int32_t ldfxcy, double* FXCx, int32_t ldfxcx ) override final; void retrieve_exc_grad_integrands( double* EXC_GRAD, double* N_EL ) override final; void retrieve_den_integrands( double* N_EL ) override final; void retrieve_exx_integrands( double* K, int32_t ldk ) override final; @@ -193,6 +364,10 @@ struct XCDeviceStackData : public XCDeviceData { double* exc_device_data() override; double* nel_device_data() override; double* exx_k_device_data() override; + double* fxc_s_device_data() override; + double* fxc_z_device_data() override; + double* fxc_y_device_data() override; + double* fxc_x_device_data() override; device_queue queue() override; diff --git a/src/xc_integrator/xc_data/device/xc_device_task.hpp b/src/xc_integrator/xc_data/device/xc_device_task.hpp index 696ef185..58ab323c 100644 --- a/src/xc_integrator/xc_data/device/xc_device_task.hpp +++ b/src/xc_integrator/xc_data/device/xc_device_task.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -52,16 +56,29 @@ struct XCDeviceTask { double* d2bfzz = nullptr; double* eps = nullptr; - double* den = nullptr; - double* gamma = nullptr; - double* vrho = nullptr; - double* vgamma = nullptr; + double* den = nullptr; + double* gamma = nullptr; + double* tau = nullptr; + double* lapl = nullptr; + double* vrho = nullptr; + double* vgamma = nullptr; + double* vtau = nullptr; + double* vlapl = nullptr; // (S,Z,Y,X) densities double* den_s = nullptr; double* den_z = nullptr; double* den_y = nullptr; double* den_x = nullptr; + double* tau_s = nullptr; + double* tau_z = nullptr; + double* tau_y = nullptr; + double* tau_x = nullptr; + double* lapl_s = nullptr; + double* lapl_z = nullptr; + double* lapl_y = nullptr; + double* lapl_x = nullptr; + // Del(S,Z,Y,X) Gradients double* dden_sx = nullptr; double* dden_sy = nullptr; @@ -85,6 +102,10 @@ struct XCDeviceTask { double* vgamma_pp = nullptr; double* vgamma_pm = nullptr; double* vgamma_mm = nullptr; + double* vtau_pos = nullptr; + double* vtau_neg = nullptr; + double* vlapl_pos = nullptr; + double* vlapl_neg = nullptr; // GKS K,H matrices double* K_z = nullptr; @@ -96,10 +117,121 @@ struct XCDeviceTask { // MGGA double* d2bflapl = nullptr; - double* denlapl = nullptr; - double* tau = nullptr; - double* vtau = nullptr; - double* vlapl = nullptr; + double* d3bflapl_x = nullptr; + double* d3bflapl_y = nullptr; + double* d3bflapl_z = nullptr; + + // Persistent X matrices for EXC gradients + double* xmatS = nullptr; + double* xmatS_x = nullptr; + double* xmatS_y = nullptr; + double* xmatS_z = nullptr; + double* xmatZ = nullptr; + double* xmatZ_x = nullptr; + double* xmatZ_y = nullptr; + double* xmatZ_z = nullptr; + + // Second derivatives - Trial density and derivatives + double* tden = nullptr; + double* ttau = nullptr; + double* tlapl = nullptr; + double* v2rho2 = nullptr; + double* v2rhogamma = nullptr; + double* v2rholapl = nullptr; + double* v2rhotau = nullptr; + double* v2gamma2 = nullptr; + double* v2gammalapl = nullptr; + double* v2gammatau = nullptr; + double* v2lapl2 = nullptr; + double* v2lapltau = nullptr; + double* v2tau2 = nullptr; + + // (S,Z,Y,X) trial densities + double* 
tden_s = nullptr; + double* tden_z = nullptr; + double* tden_y = nullptr; + double* tden_x = nullptr; + double* ttau_s = nullptr; + double* ttau_z = nullptr; + double* ttau_y = nullptr; + double* ttau_x = nullptr; + double* tlapl_s = nullptr; + double* tlapl_z = nullptr; + double* tlapl_y = nullptr; + double* tlapl_x = nullptr; + + // Del(S,Z,Y,X) trial density gradients + double* tdden_sx = nullptr; + double* tdden_sy = nullptr; + double* tdden_sz = nullptr; + double* tdden_zx = nullptr; + double* tdden_zy = nullptr; + double* tdden_zz = nullptr; + double* tdden_yx = nullptr; + double* tdden_yy = nullptr; + double* tdden_yz = nullptr; + double* tdden_xx = nullptr; + double* tdden_xy = nullptr; + double* tdden_xz = nullptr; + + //2C U variables for second derivatives + double* v2rho2_a_a = nullptr; + double* v2rho2_a_b = nullptr; + double* v2rho2_b_b = nullptr; + double* v2rhogamma_a_aa = nullptr; + double* v2rhogamma_a_ab = nullptr; + double* v2rhogamma_a_bb = nullptr; + double* v2rhogamma_b_aa = nullptr; + double* v2rhogamma_b_ab = nullptr; + double* v2rhogamma_b_bb = nullptr; + double* v2rholapl_a_a = nullptr; + double* v2rholapl_a_b = nullptr; + double* v2rholapl_b_a = nullptr; + double* v2rholapl_b_b = nullptr; + double* v2rhotau_a_a = nullptr; + double* v2rhotau_a_b = nullptr; + double* v2rhotau_b_a = nullptr; + double* v2rhotau_b_b = nullptr; + double* v2gamma2_aa_aa = nullptr; + double* v2gamma2_aa_ab = nullptr; + double* v2gamma2_aa_bb = nullptr; + double* v2gamma2_ab_ab = nullptr; + double* v2gamma2_ab_bb = nullptr; + double* v2gamma2_bb_bb = nullptr; + double* v2gammalapl_aa_a = nullptr; + double* v2gammalapl_aa_b = nullptr; + double* v2gammalapl_ab_a = nullptr; + double* v2gammalapl_ab_b = nullptr; + double* v2gammalapl_bb_a = nullptr; + double* v2gammalapl_bb_b = nullptr; + double* v2gammatau_aa_a = nullptr; + double* v2gammatau_aa_b = nullptr; + double* v2gammatau_ab_a = nullptr; + double* v2gammatau_ab_b = nullptr; + double* v2gammatau_bb_a = nullptr; + double* v2gammatau_bb_b = nullptr; + double* v2lapl2_a_a = nullptr; + double* v2lapl2_a_b = nullptr; + double* v2lapl2_b_b = nullptr; + double* v2lapltau_a_a = nullptr; + double* v2lapltau_a_b = nullptr; + double* v2lapltau_b_a = nullptr; + double* v2lapltau_b_b = nullptr; + double* v2tau2_a_a = nullptr; + double* v2tau2_a_b = nullptr; + double* v2tau2_b_b = nullptr; + + // Second derivatives intermediate output + double* FXC_A_s = nullptr; + double* FXC_Bx_s = nullptr; + double* FXC_By_s = nullptr; + double* FXC_Bz_s = nullptr; + double* FXC_C_s = nullptr; + double* FXC_A_z = nullptr; + double* FXC_Bx_z = nullptr; + double* FXC_By_z = nullptr; + double* FXC_Bz_z = nullptr; + double* FXC_C_z = nullptr; int32_t iParent = -1; double dist_nearest = 0.; diff --git a/tests/2nd_derivative_test.cxx b/tests/2nd_derivative_test.cxx new file mode 100644 index 00000000..eedcb27f --- /dev/null +++ b/tests/2nd_derivative_test.cxx @@ -0,0 +1,243 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
+ * + * See LICENSE.txt for details + */ +#include "ut_common.hpp" +#include +#include +#include +#include + +#include + +#include +#include +#include + +using namespace GauXC; + + +void test_fxc_contraction(ExecutionSpace ex, const RuntimeEnvironment& rt, + std::string reference_file, + functional_type& func, + PruningScheme pruning_scheme, + std::string integrator_kernel = "Default", + std::string reduction_kernel = "Default", + std::string lwd_kernel = "Default") { + + // Read the reference file + using matrix_type = Eigen::MatrixXd; + Molecule mol; + BasisSet<double> basis; + matrix_type P, Pz, tP, tPz, FXC_ref, FXCz_ref; + bool rks = true, uks = false; + + { + read_hdf5_record( mol, reference_file, "/MOLECULE" ); + read_hdf5_record( basis, reference_file, "/BASIS" ); + + HighFive::File file( reference_file, HighFive::File::ReadOnly ); + + std::string den = "/DENSITY"; + std::string tden_str = "/TRIAL_DENSITY"; + std::string fxc_str = "/FXC"; + std::string den2 = "/DENSITY_Z"; + + if (file.exist("/DENSITY_Z")) { + rks = false; + uks = true; + if (file.exist("/DENSITY_Y") && file.exist("/DENSITY_X")) { + std::cout << "FXC contraction for GKS is not supported yet. Skipping test." << std::endl; + return; + } + } + + if (uks) { + tden_str = "/TRIAL_DENSITY_SCALAR"; + den = "/DENSITY_SCALAR"; + fxc_str = "/FXC_SCALAR"; + } + + auto dset = file.getDataSet(den); + auto dims = dset.getDimensions(); + + P = matrix_type(dims[0], dims[1]); + dset.read(P.data()); + + if (not rks) { + Pz = matrix_type(dims[0], dims[1]); + dset = file.getDataSet(den2); + dset.read(Pz.data()); + } + + tP = matrix_type(dims[0], dims[1]); + dset = file.getDataSet(tden_str); + dset.read(tP.data()); + FXC_ref = matrix_type(dims[0], dims[1]); + dset = file.getDataSet(fxc_str); + dset.read(FXC_ref.data()); + + if (not rks) { + FXCz_ref = matrix_type(dims[0], dims[1]); + dset = file.getDataSet("/FXC_Z"); + dset.read(FXCz_ref.data()); + tPz = matrix_type(dims[0], dims[1]); + dset = file.getDataSet("/TRIAL_DENSITY_Z"); + dset.read(tPz.data()); + } + } + + // Set shell tolerance + for (auto& sh : basis) + sh.set_shell_tolerance(std::numeric_limits<double>::epsilon()); + + // Create molecular grid + auto mg = MolGridFactory::create_default_molgrid(mol, pruning_scheme, + BatchSize(512), RadialQuad::MuraKnowles, AtomicGridSizeDefault::UltraFineGrid); + + // Construct Load Balancer + LoadBalancerFactory lb_factory(ExecutionSpace::Host, "Default"); + auto lb = lb_factory.get_instance(rt, mol, mg, basis); + + // Construct Weights Module + MolecularWeightsFactory mw_factory(ex, "Default", MolecularWeightsSettings{}); + auto mw = mw_factory.get_instance(); + + // Apply partition weights + mw.modify_weights(lb); + + // Construct XCIntegrator + XCIntegratorFactory<matrix_type> integrator_factory(ex, "Replicated", + integrator_kernel, lwd_kernel, reduction_kernel); + auto integrator = integrator_factory.get_instance(func, lb); + + // Test FXC contraction + if (rks) { + // Call FXC contraction + auto FXC = integrator.eval_fxc_contraction(P, tP); + auto FXC_diff_nrm = (FXC - FXC_ref).norm(); + CHECK(FXC_diff_nrm / basis.nbf() < 1e-10); + } else if (uks) { + // Call FXC contraction + auto [FXCs, FXCz] = integrator.eval_fxc_contraction(P, Pz, tP, tPz); + + auto FXCs_diff_nrm = (FXCs - FXC_ref).norm(); + auto FXCz_diff_nrm = (FXCz - FXCz_ref).norm(); + CHECK(FXCs_diff_nrm / basis.nbf() < 1e-10); + CHECK(FXCz_diff_nrm / basis.nbf() < 1e-10); + + } +} + +void test_integrator_2nd(std::string reference_file, functional_type& func, PruningScheme pruning_scheme) { + +#ifdef 
GAUXC_HAS_DEVICE + auto rt = DeviceRuntimeEnvironment(GAUXC_MPI_CODE(MPI_COMM_WORLD,) 0.9); +#else + auto rt = RuntimeEnvironment(GAUXC_MPI_CODE(MPI_COMM_WORLD)); +#endif + +#ifdef GAUXC_HAS_HOST + SECTION( "Host" ) { + SECTION("Reference") { + test_fxc_contraction( ExecutionSpace::Host, rt, reference_file, func, + pruning_scheme, "Default", "Default", "Default" ); + } + } +#endif + +#ifdef GAUXC_HAS_DEVICE + SECTION( "Device" ) { + SECTION( "Incore - MPI Reduction" ) { + test_fxc_contraction( ExecutionSpace::Device, rt, + reference_file, func, pruning_scheme, + "Default", "Default", "Default" ); + } + #ifdef GAUXC_HAS_CUTLASS + SECTION( "Incore - MPI Reduction - CUTLASS" ) { + test_fxc_contraction( ExecutionSpace::Device, rt, + reference_file, func, pruning_scheme, + "Default", "Default", "Scheme1-CUTLASS" ); + } + #endif + + } +#endif + +} + +functional_type make_functional_2nd(ExchCXX::Functional func_key, ExchCXX::Spin spin) { + return functional_type(ExchCXX::Backend::builtin, func_key, spin); +} + + +TEST_CASE( "XC Integrator FXC", "[xc-integrator]" ) { + + auto pol = ExchCXX::Spin::Polarized; + auto unpol = ExchCXX::Spin::Unpolarized; + auto svwn5 = ExchCXX::Functional::SVWN5; + auto pbe0 = ExchCXX::Functional::PBE0; + auto blyp = ExchCXX::Functional::BLYP; + auto scan = ExchCXX::Functional::SCAN; + auto r2scanl = ExchCXX::Functional::R2SCANL; + auto m062x = ExchCXX::Functional::M062X; + + // LDA Test + SECTION( "Benzene / SVWN5 / cc-pVDZ" ) { + auto func = make_functional_2nd(svwn5, unpol); + test_integrator_2nd(GAUXC_REF_DATA_PATH "/benzene_svwn5_cc-pvdz_ufg_ssf.hdf5", + func, PruningScheme::Unpruned ); + } + SECTION( "Benzene / SVWN5 / cc-pVDZ (Treutler)" ) { + auto func = make_functional_2nd(svwn5, unpol); + test_integrator_2nd(GAUXC_REF_DATA_PATH "/benzene_svwn5_cc-pvdz_ufg_ssf_treutler_prune.hdf5", + func, PruningScheme::Treutler ); + } + SECTION( "Benzene / SVWN5 / cc-pVDZ (Robust)" ) { + auto func = make_functional_2nd(svwn5, unpol); + test_integrator_2nd(GAUXC_REF_DATA_PATH "/benzene_svwn5_cc-pvdz_ufg_ssf_robust_prune.hdf5", + func, PruningScheme::Robust ); + } + + // GGA Test + SECTION( "Benzene / PBE0 / cc-pVDZ" ) { + auto func = make_functional_2nd(pbe0, unpol); + test_integrator_2nd(GAUXC_REF_DATA_PATH "/benzene_pbe0_cc-pvdz_ufg_ssf.hdf5", + func, PruningScheme::Unpruned ); + } + + // MGGA Test (TAU Only) + SECTION( "Cytosine / SCAN / cc-pVDZ") { + auto func = make_functional_2nd(scan, unpol); + test_integrator_2nd(GAUXC_REF_DATA_PATH "/cytosine_scan_cc-pvdz_ufg_ssf_robust.hdf5", + func, PruningScheme::Robust ); + } + + //UKS LDA Test + SECTION( "Li / SVWN5 / sto-3g" ) { + auto func = make_functional_2nd(svwn5, pol); + test_integrator_2nd(GAUXC_REF_DATA_PATH "/li_svwn5_sto3g_uks.bin", + func, PruningScheme::Unpruned ); + } + + //UKS GGA Test + SECTION( "Cytosine (doublet) / BLYP / cc-pVDZ") { + auto func = make_functional_2nd(blyp, pol); + test_integrator_2nd(GAUXC_REF_DATA_PATH "/cytosine_blyp_cc-pvdz_ufg_ssf_robust_uks.hdf5", + func, PruningScheme::Robust ); + } + + // UKS MGGA Test (TAU Only) + SECTION( "Cytosine (doublet) / SCAN / cc-pVDZ") { + auto func = make_functional_2nd(scan, pol); + test_integrator_2nd(GAUXC_REF_DATA_PATH "/cytosine_scan_cc-pvdz_ufg_ssf_robust_uks.hdf5", + func, PruningScheme::Robust ); + } +} diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index d3881e91..5f00d7db 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through 
Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # @@ -57,9 +61,12 @@ add_executable( gauxc_test environment.cxx collocation.cxx weights.cxx + weight_derivative_test.cxx standards.cxx runtime.cxx basis/parse_basis.cxx + dd_psi_potential_test.cxx + 2nd_derivative_test.cxx ) target_link_libraries( gauxc_test PUBLIC gauxc gauxc_catch2 Eigen3::Eigen cereal ) if(GAUXC_ENABLE_CUTLASS) diff --git a/tests/basis/new/6-31g*.g94 b/tests/basis/new/6-31g-star.g94 similarity index 100% rename from tests/basis/new/6-31g*.g94 rename to tests/basis/new/6-31g-star.g94 diff --git a/tests/basis/old/6-31g*.g94 b/tests/basis/old/6-31g-star.g94 similarity index 100% rename from tests/basis/old/6-31g*.g94 rename to tests/basis/old/6-31g-star.g94 diff --git a/tests/basis/parse_basis.cxx b/tests/basis/parse_basis.cxx index aef5d612..0bf4cd8e 100644 --- a/tests/basis/parse_basis.cxx +++ b/tests/basis/parse_basis.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/basis/parse_basis.hpp b/tests/basis/parse_basis.hpp index 5815584c..1530aebf 100644 --- a/tests/basis/parse_basis.hpp +++ b/tests/basis/parse_basis.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/basisset_test.cxx b/tests/basisset_test.cxx index 5e556d10..29565336 100644 --- a/tests/basisset_test.cxx +++ b/tests/basisset_test.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/cmake/discovery/CMakeLists.txt b/tests/cmake/discovery/CMakeLists.txt index 3a03749f..e97fd4de 100644 --- a/tests/cmake/discovery/CMakeLists.txt +++ b/tests/cmake/discovery/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. 
# # See LICENSE.txt for details # diff --git a/tests/cmake/discovery/gauxc_link_tester.cxx b/tests/cmake/discovery/gauxc_link_tester.cxx index 2ba40e22..70313c7f 100644 --- a/tests/cmake/discovery/gauxc_link_tester.cxx +++ b/tests/cmake/discovery/gauxc_link_tester.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/cmake/subproject/CMakeLists.txt b/tests/cmake/subproject/CMakeLists.txt index 0bbf72b9..7bf08709 100644 --- a/tests/cmake/subproject/CMakeLists.txt +++ b/tests/cmake/subproject/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/tests/cmake/subproject/gauxc_link_tester.cxx b/tests/cmake/subproject/gauxc_link_tester.cxx index 2ba40e22..70313c7f 100644 --- a/tests/cmake/subproject/gauxc_link_tester.cxx +++ b/tests/cmake/subproject/gauxc_link_tester.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/collocation.cxx b/tests/collocation.cxx index fb8a0393..af85da77 100644 --- a/tests/collocation.cxx +++ b/tests/collocation.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -74,6 +78,14 @@ TEST_CASE( "Water / cc-pVDZ", "[collocation]" ) { SECTION( "CUDA Shell to Task Eval Hessian" ) { test_cuda_collocation_shell_to_task_hessian( basis, basis_map, ref_data ); } + + SECTION( "CUDA Shell to Task Eval Laplacian" ) { + test_cuda_collocation_shell_to_task_laplacian( basis, basis_map, ref_data ); + } + + SECTION( "CUDA Shell to Task Eval Laplacian Gradient" ) { + test_cuda_collocation_shell_to_task_lapgrad( basis, basis_map, ref_data ); + } #endif // GAUXC_HAS_CUDA #ifdef GAUXC_HAS_HIP diff --git a/tests/collocation_common.hpp b/tests/collocation_common.hpp index 91baa780..567f8f40 100644 --- a/tests/collocation_common.hpp +++ b/tests/collocation_common.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). 
+ * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -32,10 +36,15 @@ struct ref_collocation_data { std::vector d2eval_yy; std::vector d2eval_yz; std::vector d2eval_zz; + std::vector d2eval_lapl; + std::vector d3eval_lapl_x; + std::vector d3eval_lapl_y; + std::vector d3eval_lapl_z; template void serialize( Archive& ar ) { - ar( mask, pts, eval, deval_x, deval_y, deval_z, d2eval_xx, d2eval_xy, d2eval_xz, d2eval_yy, d2eval_yz, d2eval_zz ); + ar( mask, pts, eval, deval_x, deval_y, deval_z, d2eval_xx, d2eval_xy, d2eval_xz, + d2eval_yy, d2eval_yz, d2eval_zz, d2eval_lapl, d3eval_lapl_x, d3eval_lapl_y, d3eval_lapl_z); } }; diff --git a/tests/collocation_cuda.hpp b/tests/collocation_cuda.hpp index 42ae0eb4..b74d8476 100644 --- a/tests/collocation_cuda.hpp +++ b/tests/collocation_cuda.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -14,7 +18,7 @@ auto populate_device_cuda( const BasisSet& basis, const std::vector& ref_data, - bool pop_grad, bool pop_hess ) { + bool pop_grad, bool pop_hess, bool pop_lapl, bool pop_lapl_grad ) { std::vector< XCDeviceTask > tasks; @@ -58,6 +62,16 @@ auto populate_device_cuda( const BasisSet& basis, task.d2bfzz = util::cuda_malloc( nbf * npts ); } + if(pop_lapl) { + task.d2bflapl = util::cuda_malloc( nbf * npts ); + } + + if(pop_lapl_grad) { + task.d3bflapl_x = util::cuda_malloc( nbf * npts ); + task.d3bflapl_y = util::cuda_malloc( nbf * npts ); + task.d3bflapl_z = util::cuda_malloc( nbf * npts ); + } + //auto* pts_device = task.points; auto* pts_x_device = task.points_x; auto* pts_y_device = task.points_y; @@ -95,7 +109,7 @@ auto populate_device_cuda( const BasisSet& basis, void cuda_check_collocation( const std::vector& tasks, const std::vector& ref_data, - bool check_grad, bool check_hess) { + bool check_grad, bool check_hess, bool check_lapl, bool check_lapl_grad) { for( int i = 0; i < tasks.size(); i++ ) { @@ -158,6 +172,34 @@ void cuda_check_collocation( const std::vector& tasks, check_collocation_transpose( npts, nbe, ref_d2eval_zz, d2eval_zz.data(), "IT = " + std::to_string(i) + " BFZZ EVAL" ); } + if( check_lapl ) { + auto npts = tasks[i].npts; + auto nbe = tasks[i].bfn_screening.nbe; + auto* ref_d2eval_lapl = ref_data[i].d2eval_lapl.data(); + std::vector d2eval_lapl(npts * nbe); + util::cuda_copy(eval.size(), d2eval_lapl.data(), tasks[i].d2bflapl); + check_collocation_transpose(npts, nbe, ref_d2eval_lapl, d2eval_lapl.data(), "IT = " + std::to_string(i) + "BFLAPL EVAL" ); + } + +#if 1 + if( check_lapl_grad ) { + auto npts = tasks[i].npts; + auto nbe = tasks[i].bfn_screening.nbe; + auto* ref_d3eval_lapl_x = ref_data[i].d3eval_lapl_x.data(); + auto* ref_d3eval_lapl_y = ref_data[i].d3eval_lapl_y.data(); + auto* ref_d3eval_lapl_z = ref_data[i].d3eval_lapl_z.data(); + std::vector d3eval_lapl_x(npts * nbe); + std::vector d3eval_lapl_y(npts * nbe); + std::vector d3eval_lapl_z(npts * nbe); + util::cuda_copy(eval.size(), d3eval_lapl_x.data(), tasks[i].d3bflapl_x); + util::cuda_copy(eval.size(), d3eval_lapl_y.data(), tasks[i].d3bflapl_y); + util::cuda_copy(eval.size(), d3eval_lapl_z.data(), tasks[i].d3bflapl_z); + 
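// [Editorial aside -- illustrative note, not part of the patch] The reference
// arrays compared below are the Cartesian traces of the analytic third
// derivatives, generated host-side later in this patch (collocation_host.hpp),
// e.g. for the x component:
//
//   d3eval_lapl_x[i] = d3eval_xxx[i] + d3eval_xyy[i] + d3eval_xzz[i];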
check_collocation_transpose(npts, nbe, ref_d3eval_lapl_x, d3eval_lapl_x.data(), "IT = " + std::to_string(i) + "BFLAPL_X EVAL" ); + check_collocation_transpose(npts, nbe, ref_d3eval_lapl_y, d3eval_lapl_y.data(), "IT = " + std::to_string(i) + "BFLAPL_Y EVAL" ); + check_collocation_transpose(npts, nbe, ref_d3eval_lapl_z, d3eval_lapl_z.data(), "IT = " + std::to_string(i) + "BFLAPL_Z EVAL" ); + } +#endif + } } @@ -186,7 +228,7 @@ void test_cuda_collocation_masked_combined( const BasisSet& basis, std:: device_queue stream( std::make_shared() ); - auto [shells_device,tasks] = populate_device_cuda( basis, ref_data, grad, false ); + auto [shells_device,tasks] = populate_device_cuda( basis, ref_data, grad, false, false, false ); const auto nshells_max = std::max_element( tasks.begin(), tasks.end(), @@ -211,7 +253,7 @@ void test_cuda_collocation_masked_combined( const BasisSet& basis, std:: util::cuda_device_sync(); - cuda_check_collocation( tasks, ref_data, grad, false ); + cuda_check_collocation( tasks, ref_data, grad, false, false, false ); for( auto& t : tasks ) { @@ -249,7 +291,7 @@ void test_cuda_collocation_deriv1( const BasisSet& basis, void test_cuda_collocation_shell_to_task( const BasisSet& basis, const BasisSetMap& basis_map, - std::ifstream& in_file, bool grad, bool hess) { + std::ifstream& in_file, bool grad, bool hess, bool lapl, bool lapl_grad) { // Load reference data std::vector ref_data; @@ -260,7 +302,7 @@ void test_cuda_collocation_shell_to_task( const BasisSet& basis, const // Populate base task information device_queue stream( std::make_shared() ); - auto [shells_device,tasks] = populate_device_cuda( basis, ref_data, grad, hess ); + auto [shells_device,tasks] = populate_device_cuda( basis, ref_data, grad, hess, lapl, lapl_grad ); // Send tasks to device auto* tasks_device = util::cuda_malloc( tasks.size() ); @@ -355,9 +397,15 @@ void test_cuda_collocation_shell_to_task( const BasisSet& basis, const } - if( hess ) + if( lapl_grad ) + eval_collocation_shell_to_task_lapgrad( max_l, l_batched_shell_to_task.data(), + tasks_device, stream ); + else if( hess ) eval_collocation_shell_to_task_hessian( max_l, l_batched_shell_to_task.data(), tasks_device, stream ); + else if( lapl ) + eval_collocation_shell_to_task_laplacian( max_l, l_batched_shell_to_task.data(), + tasks_device, stream ); else if( grad ) eval_collocation_shell_to_task_gradient( max_l, l_batched_shell_to_task.data(), tasks_device, stream ); @@ -368,13 +416,15 @@ void test_cuda_collocation_shell_to_task( const BasisSet& basis, const util::cuda_device_sync(); - cuda_check_collocation( tasks, ref_data, grad, hess ); + cuda_check_collocation( tasks, ref_data, grad, hess, lapl, lapl_grad ); for( auto& t : tasks ) { util::cuda_free( t.points_x, t.points_y, t.points_z, t.bfn_screening.shell_offs, t.bfn_screening.shell_list, t.bf ); if(grad) util::cuda_free( t.dbfx, t.dbfy, t.dbfz ); if(hess) util::cuda_free( t.d2bfxx, t.d2bfxy, t.d2bfxz, t.d2bfyy, t.d2bfyz, t.d2bfzz ); + if(lapl) util::cuda_free( t.d2bflapl ); + if(lapl_grad) util::cuda_free( t.d3bflapl_x, t.d3bflapl_y, t.d3bflapl_z ); } util::cuda_free( tasks_device, shells_device, shell_to_task_device ); for( auto& s : shell_to_task ) { @@ -387,19 +437,33 @@ void test_cuda_collocation_shell_to_task( const BasisSet& basis, const void test_cuda_collocation_shell_to_task( const BasisSet& basis, const BasisSetMap& basis_map, std::ifstream& in_file) { - test_cuda_collocation_shell_to_task(basis,basis_map,in_file,false, false); + 
test_cuda_collocation_shell_to_task(basis,basis_map,in_file,false, false, false, false); } void test_cuda_collocation_shell_to_task_gradient( const BasisSet& basis, const BasisSetMap& basis_map, std::ifstream& in_file) { - test_cuda_collocation_shell_to_task(basis,basis_map,in_file,true, false); + test_cuda_collocation_shell_to_task(basis,basis_map,in_file,true, false, false, false); } void test_cuda_collocation_shell_to_task_hessian( const BasisSet& basis, const BasisSetMap& basis_map, std::ifstream& in_file) { - test_cuda_collocation_shell_to_task(basis,basis_map,in_file,true, true); + test_cuda_collocation_shell_to_task(basis,basis_map,in_file,true, true, false, false); + +} + +void test_cuda_collocation_shell_to_task_laplacian( const BasisSet& basis, + const BasisSetMap& basis_map, std::ifstream& in_file) { + + test_cuda_collocation_shell_to_task(basis,basis_map,in_file,true, false, true, false); + +} + +void test_cuda_collocation_shell_to_task_lapgrad( const BasisSet& basis, + const BasisSetMap& basis_map, std::ifstream& in_file) { + + test_cuda_collocation_shell_to_task(basis,basis_map,in_file,true, true, true, true); } diff --git a/tests/collocation_hip.hpp b/tests/collocation_hip.hpp index c6314aac..b6be897c 100644 --- a/tests/collocation_hip.hpp +++ b/tests/collocation_hip.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/collocation_host.hpp b/tests/collocation_host.hpp index a64ce8ee..52dcaec0 100644 --- a/tests/collocation_host.hpp +++ b/tests/collocation_host.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -46,13 +50,40 @@ void generate_collocation_data( const Molecule& mol, const BasisSet& bas d2eval_xz( nbf * npts ), d2eval_yy( nbf * npts ), d2eval_yz( nbf * npts ), - d2eval_zz( nbf * npts ); - - gau2grid_collocation_hessian( npts, mask.size(), nbf, + d2eval_zz( nbf * npts ), + d3eval_xxx( nbf * npts ), + d3eval_xxy( nbf * npts ), + d3eval_xxz( nbf * npts ), + d3eval_xyy( nbf * npts ), + d3eval_xyz( nbf * npts ), + d3eval_xzz( nbf * npts ), + d3eval_yyy( nbf * npts ), + d3eval_yyz( nbf * npts ), + d3eval_yzz( nbf * npts ), + d3eval_zzz( nbf * npts ); + + gau2grid_collocation_der3( npts, mask.size(), nbf, pts.data()->data(), basis, mask.data(), eval.data(), deval_x.data(), deval_y.data(), deval_z.data(), d2eval_xx.data(), d2eval_xy.data(), d2eval_xz.data(), - d2eval_yy.data(), d2eval_yz.data(), d2eval_zz.data() ); + d2eval_yy.data(), d2eval_yz.data(), d2eval_zz.data(), + d3eval_xxx.data(), d3eval_xxy.data(), d3eval_xxz.data(), + d3eval_xyy.data(), d3eval_xyz.data(), d3eval_xzz.data(), + d3eval_yyy.data(), d3eval_yyz.data(), d3eval_yzz.data(), + d3eval_zzz.data()); + + std::vector d2eval_lapl(nbf * npts); + std::vector d3eval_lapl_x(nbf * npts); + std::vector d3eval_lapl_y(nbf * npts); + std::vector d3eval_lapl_z(nbf * npts); + for(auto i = 0; i < nbf*npts; ++i) { + d2eval_lapl[i] = d2eval_xx[i] + d2eval_yy[i] + d2eval_zz[i]; + d3eval_lapl_x[i] = d3eval_xxx[i] + d3eval_xyy[i] + d3eval_xzz[i]; + d3eval_lapl_y[i] = d3eval_xxy[i] + d3eval_yyy[i] + d3eval_yzz[i]; + d3eval_lapl_z[i] = d3eval_xxz[i] + d3eval_yyz[i] + d3eval_zzz[i]; + } + + auto max_abs = *std::max_element( eval.begin(), eval.end(), [](auto a, auto b){ return std::abs(a) < std::abs(b); } ); @@ -61,7 +92,9 @@ void generate_collocation_data( const Molecule& mol, const BasisSet& bas ref_collocation_data d{ std::move(mask), std::move(pts), std::move(eval), std::move(deval_x), std::move(deval_y), std::move(deval_z), std::move(d2eval_xx), std::move(d2eval_xy), std::move(d2eval_xz), - std::move(d2eval_yy), std::move(d2eval_yz), std::move(d2eval_zz) + std::move(d2eval_yy), std::move(d2eval_yz), std::move(d2eval_zz), + std::move(d2eval_lapl), std::move(d3eval_lapl_x), std::move(d3eval_lapl_y), + std::move(d3eval_lapl_z) }; ref_data.emplace_back( std::move(d) ); diff --git a/tests/conv_cereal_to_hdf5.cxx b/tests/conv_cereal_to_hdf5.cxx index d717cf63..682a6964 100644 --- a/tests/conv_cereal_to_hdf5.cxx +++ b/tests/conv_cereal_to_hdf5.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/dd_psi_potential_test.cxx b/tests/dd_psi_potential_test.cxx new file mode 100644 index 00000000..9af2844e --- /dev/null +++ b/tests/dd_psi_potential_test.cxx @@ -0,0 +1,102 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
+ * + * See LICENSE.txt for details + */ +#include "ut_common.hpp" +#include +#include +#include +#include + +#include + +#include +#include +#include + +using namespace GauXC; + +void test_dd_psi ( + std::string reference_file, + int lmax = 8 +) { + using matrix_type = Eigen::MatrixXd; + Molecule mol; + BasisSet basis; + matrix_type P, ddX, ddPsi_ref, ddPsi_potential_ref; + + read_hdf5_record( mol, reference_file, "/MOLECULE" ); + read_hdf5_record( basis, reference_file, "/BASIS" ); + + HighFive::File file( reference_file, HighFive::File::ReadOnly ); + std::string den_str = "/DENSITY"; + auto dset = file.getDataSet(den_str); + auto dims = dset.getDimensions(); + P = matrix_type( dims[0], dims[1] ); + dset.read( P.data() ); + + int nharmonics = (lmax + 1) * (lmax + 1); + + ddX = matrix_type( nharmonics, mol.size() ); + dset = file.getDataSet("/DD_X"); + dset.read(ddX.data()); + + ddPsi_ref = matrix_type( mol.size(), nharmonics ); + dset = file.getDataSet("/DD_PSI"); + dset.read( ddPsi_ref.data()); + + ddPsi_potential_ref = matrix_type( basis.nbf(), basis.nbf() ); + dset = file.getDataSet("/DD_PSI_POTENTIAL"); + dset.read( ddPsi_potential_ref.data() ); + + + #ifdef GAUXC_HAS_DEVICE + auto rt = DeviceRuntimeEnvironment(GAUXC_MPI_CODE(MPI_COMM_WORLD,) 0.9); + #else + auto rt = RuntimeEnvironment(GAUXC_MPI_CODE(MPI_COMM_WORLD)); + #endif + + auto mg = MolGridFactory::create_default_molgrid(mol, PruningScheme::Unpruned, + BatchSize(512), RadialQuad::MuraKnowles, AtomicGridSizeDefault::UltraFineGrid); + + auto ex = ExecutionSpace::Host; + LoadBalancerFactory lb_factory(ex, "Default"); + auto lb = lb_factory.get_instance(rt, mol, mg, basis); + + // Construct Weights Module + MolecularWeightsFactory mw_factory( ex, "Default", MolecularWeightsSettings{} ); + auto mw = mw_factory.get_instance(); + + // Apply partition weights + mw.modify_weights(lb); + + functional_type func = functional_type( ExchCXX::Backend::builtin, ExchCXX::Functional::PBE0, ExchCXX::Spin::Unpolarized ); + // Construct XCIntegrator + XCIntegratorFactory integrator_factory( ex, "Replicated", + "Default", "Default", "Default" ); + auto integrator = integrator_factory.get_instance( func, lb ); + + auto dd_psi = integrator.eval_dd_psi(P, lmax); + auto ddPsi = Eigen::Map(dd_psi.data(), mol.size(), nharmonics); + auto ddPsi_nrm = (ddPsi - ddPsi_ref).norm(); + CHECK( ddPsi_nrm / mol.size() < 1e-10 ); + + auto ddPsiPotential = integrator.eval_dd_psi_potential(ddX, lmax); + auto ddPsiPotential_nrm = (ddPsiPotential - ddPsi_potential_ref).norm(); + CHECK( ddPsiPotential_nrm / basis.nbf() < 1e-10 ); + +} + +TEST_CASE( "DD PSI & PSI POTENTIAL", "[dd]" ) { + SECTION( " C2H4 / def2-svp / LMAX = 8" ) { + test_dd_psi( GAUXC_REF_DATA_PATH "/c2h4_l8_dd_psi_potential.hdf5" ); + } +} + \ No newline at end of file diff --git a/tests/eigen3_matrix_serialization.hpp b/tests/eigen3_matrix_serialization.hpp index 38537170..a810e8d1 100644 --- a/tests/eigen3_matrix_serialization.hpp +++ b/tests/eigen3_matrix_serialization.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
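A note on dimensions in dd_psi_potential_test.cxx above: nharmonics = (lmax + 1) * (lmax + 1) counts every real spherical harmonic Y_{lm} with 0 <= l <= lmax, since

\sum_{l=0}^{l_{\max}} (2l + 1) = (l_{\max} + 1)^2 ,

i.e. 81 harmonics per atom for the lmax = 8 used in the test; ddPsi is therefore mol.size() x 81, and the two CHECKs scale the error norms by mol.size() and basis.nbf() respectively.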
* * See LICENSE.txt for details */ diff --git a/tests/environment.cxx b/tests/environment.cxx index ae9ec82b..2dea8138 100644 --- a/tests/environment.cxx +++ b/tests/environment.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/grid_opt.cxx b/tests/grid_opt.cxx index ced74d20..1fa2237f 100644 --- a/tests/grid_opt.cxx +++ b/tests/grid_opt.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/grid_test.cxx b/tests/grid_test.cxx index 4e8782e6..c308adf8 100644 --- a/tests/grid_test.cxx +++ b/tests/grid_test.cxx @@ -1,16 +1,20 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ #include "catch2/catch.hpp" #include -#include -#include -#include +#include +#include +#include #include #include diff --git a/tests/ini_input.cxx b/tests/ini_input.cxx index 972eeaaa..a5f6ed56 100644 --- a/tests/ini_input.cxx +++ b/tests/ini_input.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/ini_input.hpp b/tests/ini_input.hpp index 1577f2ee..6be84086 100644 --- a/tests/ini_input.hpp +++ b/tests/ini_input.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/load_balancer_test.cxx b/tests/load_balancer_test.cxx index 1c55f3e8..889bcb30 100644 --- a/tests/load_balancer_test.cxx +++ b/tests/load_balancer_test.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/tests/molgrid_test.cxx b/tests/molgrid_test.cxx index 191b8e7b..1de1be98 100644 --- a/tests/molgrid_test.cxx +++ b/tests/molgrid_test.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/moltypes_test.cxx b/tests/moltypes_test.cxx index 5ed00c55..d87685f7 100644 --- a/tests/moltypes_test.cxx +++ b/tests/moltypes_test.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/ref_data/benzene_m062x_def2-svp_ufg_ssf.hdf5 b/tests/ref_data/benzene_m062x_def2-svp_ufg_ssf.hdf5 new file mode 100644 index 00000000..e4eebc23 Binary files /dev/null and b/tests/ref_data/benzene_m062x_def2-svp_ufg_ssf.hdf5 differ diff --git a/tests/ref_data/benzene_pbe0_cc-pvdz_ufg_ssf.hdf5 b/tests/ref_data/benzene_pbe0_cc-pvdz_ufg_ssf.hdf5 index c1d3ebc3..51bfa6ac 100644 Binary files a/tests/ref_data/benzene_pbe0_cc-pvdz_ufg_ssf.hdf5 and b/tests/ref_data/benzene_pbe0_cc-pvdz_ufg_ssf.hdf5 differ diff --git a/tests/ref_data/benzene_svwn5_cc-pvdz_ufg_ssf.hdf5 b/tests/ref_data/benzene_svwn5_cc-pvdz_ufg_ssf.hdf5 index 9acf2378..3bf4d5fa 100644 Binary files a/tests/ref_data/benzene_svwn5_cc-pvdz_ufg_ssf.hdf5 and b/tests/ref_data/benzene_svwn5_cc-pvdz_ufg_ssf.hdf5 differ diff --git a/tests/ref_data/benzene_svwn5_cc-pvdz_ufg_ssf_robust_prune.hdf5 b/tests/ref_data/benzene_svwn5_cc-pvdz_ufg_ssf_robust_prune.hdf5 index f7242869..61a765bd 100644 Binary files a/tests/ref_data/benzene_svwn5_cc-pvdz_ufg_ssf_robust_prune.hdf5 and b/tests/ref_data/benzene_svwn5_cc-pvdz_ufg_ssf_robust_prune.hdf5 differ diff --git a/tests/ref_data/benzene_svwn5_cc-pvdz_ufg_ssf_treutler_prune.hdf5 b/tests/ref_data/benzene_svwn5_cc-pvdz_ufg_ssf_treutler_prune.hdf5 index 478d988e..c021e3c8 100644 Binary files a/tests/ref_data/benzene_svwn5_cc-pvdz_ufg_ssf_treutler_prune.hdf5 and b/tests/ref_data/benzene_svwn5_cc-pvdz_ufg_ssf_treutler_prune.hdf5 differ diff --git a/tests/ref_data/c2h4_l8_dd_psi_potential.hdf5 b/tests/ref_data/c2h4_l8_dd_psi_potential.hdf5 new file mode 100644 index 00000000..2187f79d Binary files /dev/null and b/tests/ref_data/c2h4_l8_dd_psi_potential.hdf5 differ diff --git a/tests/ref_data/cytosine_blyp_cc-pvdz_ufg_ssf_robust_uks.hdf5 b/tests/ref_data/cytosine_blyp_cc-pvdz_ufg_ssf_robust_uks.hdf5 new file mode 100644 index 00000000..3fc56cda Binary files /dev/null and b/tests/ref_data/cytosine_blyp_cc-pvdz_ufg_ssf_robust_uks.hdf5 differ diff --git a/tests/ref_data/cytosine_scan_cc-pvdz_ufg_ssf_robust.hdf5 b/tests/ref_data/cytosine_scan_cc-pvdz_ufg_ssf_robust.hdf5 index d44d7c0f..06cf00ff 100644 Binary files a/tests/ref_data/cytosine_scan_cc-pvdz_ufg_ssf_robust.hdf5 and b/tests/ref_data/cytosine_scan_cc-pvdz_ufg_ssf_robust.hdf5 differ diff --git a/tests/ref_data/cytosine_scan_cc-pvdz_ufg_ssf_robust_uks.hdf5 
b/tests/ref_data/cytosine_scan_cc-pvdz_ufg_ssf_robust_uks.hdf5 index 829f8e96..53ca387a 100644 Binary files a/tests/ref_data/cytosine_scan_cc-pvdz_ufg_ssf_robust_uks.hdf5 and b/tests/ref_data/cytosine_scan_cc-pvdz_ufg_ssf_robust_uks.hdf5 differ diff --git a/tests/ref_data/cytosine_svwn5_cc-pvdz_ufg_ssf_robust_uks.hdf5 b/tests/ref_data/cytosine_svwn5_cc-pvdz_ufg_ssf_robust_uks.hdf5 new file mode 100644 index 00000000..3496bd81 Binary files /dev/null and b/tests/ref_data/cytosine_svwn5_cc-pvdz_ufg_ssf_robust_uks.hdf5 differ diff --git a/tests/ref_data/li_svwn5_sto3g_uks.bin b/tests/ref_data/li_svwn5_sto3g_uks.bin index b654d996..d96aeba2 100644 Binary files a/tests/ref_data/li_svwn5_sto3g_uks.bin and b/tests/ref_data/li_svwn5_sto3g_uks.bin differ diff --git a/tests/ref_data/ut_input.inp b/tests/ref_data/ut_input.inp index 46901ab1..4f0455fc 100644 --- a/tests/ref_data/ut_input.inp +++ b/tests/ref_data/ut_input.inp @@ -8,4 +8,5 @@ func = svwn5 integrate_vxc = TRUE integrate_exc_grad = TRUE integrate_exx = FALSE +integrate_fxc_contraction = FALSE OUTFILE = benzene_svwn5_cc-pvdz_ufg_ssf_robust_prune.hdf5 diff --git a/tests/ref_data/water_cc-pVDZ_collocation.bin b/tests/ref_data/water_cc-pVDZ_collocation.bin index c6d22ab4..e2d7ea60 100644 Binary files a/tests/ref_data/water_cc-pVDZ_collocation.bin and b/tests/ref_data/water_cc-pVDZ_collocation.bin differ diff --git a/tests/runtime.cxx b/tests/runtime.cxx index 1f8b6933..5b459940 100644 --- a/tests/runtime.cxx +++ b/tests/runtime.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/standalone_driver.cxx b/tests/standalone_driver.cxx index efc9ce59..68a9c13a 100644 --- a/tests/standalone_driver.cxx +++ b/tests/standalone_driver.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -51,6 +55,7 @@ int main(int argc, char** argv) { // Optional Args std::string grid_spec = "ULTRAFINE"; + std::string rad_quad_spec = "MURAKNOWLES"; std::string prune_spec = "UNPRUNED"; std::string lb_exec_space_str = "Host"; std::string int_exec_space_str = "Host"; @@ -66,6 +71,10 @@ int main(int argc, char** argv) { bool integrate_vxc = true; bool integrate_exx = false; bool integrate_exc_grad = false; + bool integrate_dd_psi = false; + bool integrate_dd_psi_potential = false; + bool integrate_fxc_contraction = false; + int lmax = 2; auto string_to_upper = []( auto& str ) { std::transform( str.begin(), str.end(), str.begin(), ::toupper ); @@ -79,6 +88,7 @@ int main(int argc, char** argv) { OPTIONAL_KEYWORD( "GAUXC.GRID", grid_spec, std::string ); OPTIONAL_KEYWORD( "GAUXC.FUNC", func_spec, std::string ); OPTIONAL_KEYWORD( "GAUXC.PRUNING_SCHEME", prune_spec, std::string ); + OPTIONAL_KEYWORD( "GAUXC.RAD_QUAD", rad_quad_spec, std::string ); OPTIONAL_KEYWORD( "GAUXC.LB_EXEC_SPACE", lb_exec_space_str, std::string ); OPTIONAL_KEYWORD( "GAUXC.INT_EXEC_SPACE", int_exec_space_str, std::string ); OPTIONAL_KEYWORD( "GAUXC.INTEGRATOR_KERNEL", integrator_kernel, std::string ); @@ -86,6 +96,7 @@ int main(int argc, char** argv) { OPTIONAL_KEYWORD( "GAUXC.REDUCTION_KERNEL", reduction_kernel, std::string ); string_to_upper( grid_spec ); string_to_upper( func_spec ); + string_to_upper( rad_quad_spec ); string_to_upper( prune_spec ); string_to_upper( lb_exec_space_str ); string_to_upper( int_exec_space_str ); @@ -100,6 +111,10 @@ int main(int argc, char** argv) { OPTIONAL_KEYWORD( "GAUXC.INTEGRATE_VXC", integrate_vxc, bool ); OPTIONAL_KEYWORD( "GAUXC.INTEGRATE_EXX", integrate_exx, bool ); OPTIONAL_KEYWORD( "GAUXC.INTEGRATE_EXC_GRAD", integrate_exc_grad, bool ); + OPTIONAL_KEYWORD( "GAUXC.INTEGRATE_DD_PSI", integrate_dd_psi, bool ); + OPTIONAL_KEYWORD( "GAUXC.INTEGRATE_DD_PSI_POTENTIAL", integrate_dd_psi_potential, bool ); + OPTIONAL_KEYWORD( "GAUXC.INTEGRATE_FXC_CONTRACTION", integrate_fxc_contraction, bool ); + OPTIONAL_KEYWORD( "GAUXC.MAX_YLM", lmax, int ); IntegratorSettingsSNLinK sn_link_settings; OPTIONAL_KEYWORD( "EXX.TOL_E", sn_link_settings.energy_tol, double ); @@ -124,6 +139,7 @@ int main(int argc, char** argv) { std::cout << "DRIVER SETTINGS: " << std::endl << " REF_FILE = " << ref_file << std::endl << " GRID = " << grid_spec << std::endl + << " RAD_QUAD = " << rad_quad_spec << std::endl << " PRUNING_SCHEME = " << prune_spec << std::endl << " BATCH_SIZE = " << batch_size << std::endl << " BASIS_TOL = " << basis_tol << std::endl @@ -136,13 +152,19 @@ int main(int argc, char** argv) { << " DEN (?) = " << integrate_den << std::endl << " VXC (?) = " << integrate_vxc << std::endl << " EXX (?) = " << integrate_exx << std::endl - << " EXC_GRAD (?) = " << integrate_exc_grad << std::endl; + << " EXC_GRAD (?) = " << integrate_exc_grad << std::endl + << " DD_PSI (?) = " << integrate_dd_psi << std::endl + << " DD_PSI_POTENTIAL (?) = " << integrate_dd_psi_potential << std::endl + << " FXC_CONTRACTION (?) 
= " << integrate_fxc_contraction << std::endl; if(integrate_exx) { std::cout << " EXX.TOL_E = " << sn_link_settings.energy_tol << std::endl << " EXX.TOL_K = " << sn_link_settings.k_tol << std::endl; } + if (integrate_dd_psi || integrate_dd_psi_potential) { + std::cout << " DD_MAX_YLM = " << lmax << std::endl; + } std::cout << std::endl; } @@ -170,9 +192,19 @@ int main(int argc, char** argv) { {"TREUTLER", PruningScheme::Treutler} }; + std::map< std::string, RadialQuad > rad_quad_map = { + {"BECKE", RadialQuad::Becke}, + {"MURAKNOWLES", RadialQuad::MuraKnowles}, + {"TREUTLERAHLRICHS", RadialQuad::TreutlerAhlrichs}, + {"MURRAYHANDYLAMING", RadialQuad::MurrayHandyLaming}, + {"MK", RadialQuad::MuraKnowles}, + {"TA", RadialQuad::TreutlerAhlrichs}, + {"MHL", RadialQuad::MurrayHandyLaming} + }; + auto mg = MolGridFactory::create_default_molgrid(mol, prune_map.at(prune_spec), BatchSize(batch_size), - RadialQuad::MuraKnowles, mg_map.at(grid_spec)); + rad_quad_map.at(rad_quad_spec), mg_map.at(grid_spec)); // Read BasisSet BasisSet basis; @@ -195,6 +227,8 @@ int main(int argc, char** argv) { using matrix_type = Eigen::MatrixXd; // Read in reference data matrix_type P, Pz, Py, Px, VXC_ref, VXCz_ref, VXCy_ref, VXCx_ref, K_ref; + matrix_type ddX, ddPsi_ref, ddPsi_potential_ref; + matrix_type FXC_ref, FXCz_ref; double EXC_ref; std::vector EXC_GRAD_ref(3*mol.size()); bool rks = true, uks = false, gks = false; @@ -325,7 +359,69 @@ int main(int argc, char** argv) { K_ref.fill(0); } } + if ( integrate_dd_psi ) { + int nharmonics = (lmax + 1) * (lmax + 1); + ddPsi_ref = matrix_type( mol.size(), nharmonics ); + try { + dset = file.getDataSet("/DD_PSI"); + dset.read( ddPsi_ref.data()); + auto dd_psi_dims = dset.getDimensions(); + if (dd_psi_dims[0] != mol.size() or dd_psi_dims[1] != nharmonics) + GAUXC_GENERIC_EXCEPTION("Incorrect dims for DD_PSI"); + } catch(...) { + if(world_rank == 0) { + std::cout << "** Warning: Could Not Find Reference DD_PSI" << std::endl; + } + ddPsi_ref.fill(0); + } + } + + if ( integrate_dd_psi_potential ) { + int nharmonics = (lmax + 1) * (lmax + 1); + ddX = matrix_type( nharmonics, mol.size() ); + ddPsi_potential_ref = matrix_type( basis.nbf(), basis.nbf() ); + try { + dset = file.getDataSet("/DD_X"); + auto dd_x_dims = dset.getDimensions(); + if (dd_x_dims[0] != nharmonics or dd_x_dims[1] != mol.size()) + GAUXC_GENERIC_EXCEPTION("Incorrect dims for DD_X"); + dset.read(ddX.data()); + } catch(...) { + throw std::runtime_error("Could Not Find Input DD_X for DD_PSI_POTENTIAL"); + } + try { + dset = file.getDataSet("/DD_PSI_POTENTIAL"); + auto dd_psi_potential_dims = dset.getDimensions(); + if (dd_psi_potential_dims[0] != basis.nbf() or dd_psi_potential_dims[1] != basis.nbf()) + GAUXC_GENERIC_EXCEPTION("Incorrect dims for DD_PSI_POTENTIAL"); + dset.read(ddPsi_potential_ref.data()); + } catch(...) { + if(world_rank == 0) { + std::cout << "** Warning: Could Not Find Reference DD_PSI_POTENTIAL" << std::endl; + } + ddPsi_potential_ref.fill(0); + } + } + if ( integrate_fxc_contraction ) { + try { + dset = file.getDataSet("/FXC"); + auto fxc_dims = dset.getDimensions(); + FXC_ref = matrix_type( fxc_dims[0], fxc_dims[1] ); + dset.read( FXC_ref.data() ); + if( not rks ) { + dset = file.getDataSet("/FXC_Z"); + FXCz_ref = matrix_type( fxc_dims[0], fxc_dims[1] ); + dset.read( FXCz_ref.data() ); + } + } catch(...) 
{ + if(world_rank == 0) { + std::cout << "** Warning: Could Not Find Reference FXC" << std::endl; + } + FXC_ref.fill(0); + if( not rks ) FXCz_ref.fill(0); + } + } } // Setup XC functional auto polar = (uks or gks) ? Spin::Polarized : Spin::Unpolarized; @@ -333,7 +429,9 @@ int main(int argc, char** argv) { if(functional_map.key_exists(func_spec)) { func = functional_type( Backend::builtin, functional_map.value(func_spec), polar ); - } else { + } +#ifdef EXCHCXX_ENABLE_LIBXC + else { std::vector> funcs; std::vector libxc_names; split(libxc_names, func_spec, ","); @@ -342,6 +440,7 @@ int main(int argc, char** argv) { } func = functional_type(funcs); } +#endif // Setup Integrator XCIntegratorFactory integrator_factory( int_exec_space , @@ -353,7 +452,8 @@ int main(int argc, char** argv) { #endif auto xc_int_start = std::chrono::high_resolution_clock::now(); - matrix_type VXC, VXCz, VXCy, VXCx, K; + matrix_type VXC, VXCz, VXCy, VXCx, K, FXC, FXCz; + matrix_type ddPsi, ddPsiPotential; double EXC, N_EL; std::cout << std::scientific << std::setprecision(12); @@ -397,8 +497,7 @@ int main(int argc, char** argv) { EXC_GRAD = integrator.eval_exc_grad( P ); } else if( uks ) { - std::cout << "Warning: eval_exc_grad + UKS NYI!" << std::endl; - //EXC_GRAD = integrator.eval_exc_grad( P, Pz ); + EXC_GRAD = integrator.eval_exc_grad( P, Pz ); } else if( gks ) { std::cout << "Warning: eval_exc_grad + GKS NYI!" << std::endl; @@ -417,12 +516,94 @@ int main(int argc, char** argv) { } } + // Load trial density matrices for FXC contraction + matrix_type tP, tPz; + if( integrate_fxc_contraction ) { + bool create_trial_densities = false; + { + // Try to load trial density matrices from reference file + HighFive::File file( ref_file, HighFive::File::ReadOnly ); + std::string tden_str = "/TRIAL_DENSITY"; + std::string fxc_str = "/FXC"; + + if (!rks) { + tden_str = "/TRIAL_DENSITY_SCALAR"; + fxc_str = "/FXC_SCALAR"; + } + + try { + auto dset = file.getDataSet(tden_str); + auto dims = dset.getDimensions(); + tP = matrix_type(dims[0], dims[1]); + dset.read(tP.data()); + + if (!rks) { + dset = file.getDataSet("/TRIAL_DENSITY_Z"); + tPz = matrix_type(dims[0], dims[1]); + dset.read(tPz.data()); + } + + // Also try to read reference FXC matrices if available + try { + dset = file.getDataSet(fxc_str); + FXC_ref = matrix_type(dims[0], dims[1]); + dset.read(FXC_ref.data()); + + if (!rks) { + dset = file.getDataSet("/FXC_Z"); + FXCz_ref = matrix_type(dims[0], dims[1]); + dset.read(FXCz_ref.data()); + } + } catch(...) { + if(world_rank == 0) { + std::cout << "** Warning: Could Not Find Reference FXC" << std::endl; + } + FXC_ref.fill(0); + if(!rks) FXCz_ref.fill(0); + } + + } catch(...) { + if(world_rank == 0) { + std::cout << "** Trial density matrices not found, generating random symmetric matrices..." << std::endl; + create_trial_densities = true; + } + } + + } + + if(!world_rank) { + std::cout << "Computing FXC contraction..." << std::endl; + } + + // Compute FXC contraction + if( rks ) { + FXC = integrator.eval_fxc_contraction( P, tP, IntegratorSettingsXC{} ); + } else if( uks ) { + std::tie(FXC, FXCz) = integrator.eval_fxc_contraction( P, Pz, tP, tPz, IntegratorSettingsXC{} ); + } else if( gks ) { + std::cout << "Warning: FXC contraction with GKS NYI!" 
<< std::endl; + } + + } + if( integrate_exx ) { K = integrator.eval_exx(P, sn_link_settings); //matrix_type K_tmp = 0.5 * (K + K.transpose()); //K = -K_tmp; } else { K = K_ref; } + + if( integrate_dd_psi ) { + size_t Ylm_sz = (lmax + 1) * ( lmax + 1); + auto dd_psi = integrator.eval_dd_psi(P, lmax); + ddPsi = Eigen::Map(dd_psi.data(), mol.size(), Ylm_sz); + } else { ddPsi = ddPsi_ref; } + + if (integrate_dd_psi_potential) { + ddPsiPotential = integrator.eval_dd_psi_potential(ddX, lmax); + } else { ddPsiPotential = ddPsi_potential_ref; } + + #ifdef GAUXC_HAS_MPI MPI_Barrier( MPI_COMM_WORLD ); #endif @@ -560,6 +741,26 @@ int main(int argc, char** argv) { std::cout << "RMS K Diff = " << (K_ref - K).norm() / basis.nbf() << std::endl; } + if (integrate_dd_psi) { + std::cout << "| DD_PSI (ref) |_F = " << ddPsi_ref.norm() << std::endl; + std::cout << "| DD_PSI (calc) |_F = " << ddPsi.norm() << std::endl; + std::cout << "RMS DD_PSI Diff = " << (ddPsi_ref - ddPsi).norm() / mol.size() << std::endl; + } + if (integrate_dd_psi_potential) { + std::cout << "| DD_PSI_POTENTIAL (ref) |_F = " << ddPsi_potential_ref.norm() << std::endl; + std::cout << "| DD_PSI_POTENTIAL (calc) |_F = " << ddPsiPotential.norm() << std::endl; + std::cout << "RMS DD_PSI_POTENTIAL Diff = " << (ddPsi_potential_ref - ddPsiPotential).norm() / basis.nbf() << std::endl; + } + if (integrate_fxc_contraction) { + std::cout << "| FXC (ref) |_F = " << FXC_ref.norm() << std::endl; + std::cout << "| FXC (calc) |_F = " << FXC.norm() << std::endl; + std::cout << "RMS FXC Diff = " << (FXC_ref - FXC).norm() / basis.nbf() << std::endl; + if (not rks) { + std::cout << "| FXCz (ref) |_F = " << FXCz_ref.norm() << std::endl; + std::cout << "| FXCz (calc) |_F = " << FXCz.norm() << std::endl; + std::cout << "RMS FXCz Diff = " << (FXCz_ref - FXCz).norm() / basis.nbf() << std::endl; + } + } } // Dump out new file @@ -625,6 +826,27 @@ int main(int argc, char** argv) { dset = file.createDataSet( "/EXC_GRAD", grad_space ); dset.write_raw( EXC_GRAD.data() ); } + + if (integrate_dd_psi) { + HighFive::DataSpace dd_psi_space( mol.size(), (lmax + 1) * (lmax + 1) ); + dset = file.createDataSet("/DD_PSI", dd_psi_space); + dset.write_raw(ddPsi.data()); + } + + if (integrate_dd_psi_potential) { + HighFive::DataSpace dd_psi_potential_space(basis.nbf(), basis.nbf()); + dset = file.createDataSet("/DD_PSI_POTENTIAL", dd_psi_potential_space); + dset.write_raw(ddPsiPotential.data()); + } + + if (integrate_fxc_contraction) { + dset = file.createDataSet("/FXC" + ugks_scalar, mat_space); + dset.write_raw(FXC.data()); + if (not rks) { + dset = file.createDataSet("/FXC_Z", mat_space); + dset.write_raw(FXCz.data()); + } + } } } diff --git a/tests/standards.cxx b/tests/standards.cxx index 6ca6473a..170e73ee 100644 --- a/tests/standards.cxx +++ b/tests/standards.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
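The "RMS ... Diff" values printed by the standalone driver above are Frobenius norms of the difference scaled by a problem-size parameter, for example

\mathrm{RMS\;FXC\;Diff} = \| \mathrm{FXC_{ref}} - \mathrm{FXC} \|_F / n_{\mathrm{bf}}, \qquad
\mathrm{RMS\;DD\_PSI\;Diff} = \| \mathrm{DD\_PSI_{ref}} - \mathrm{DD\_PSI} \|_F / n_{\mathrm{atoms}},

so they are size-normalized aggregate errors rather than element-wise maxima.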
* * See LICENSE.txt for details */ @@ -1406,7 +1410,7 @@ Molecule make_ubiquitin() { BasisSet make_631Gd( const Molecule& mol, SphericalType sph ) { - std::string basis_path = GAUXC_REF_DATA_PATH "/../basis/old/6-31g*.g94"; + std::string basis_path = GAUXC_REF_DATA_PATH "/../basis/old/6-31g-star.g94"; return parse_basis( mol, basis_path, sph ); } diff --git a/tests/standards.hpp b/tests/standards.hpp index a9db6759..93a6b298 100644 --- a/tests/standards.hpp +++ b/tests/standards.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/ut_common.hpp.in b/tests/ut_common.hpp.in index 628ef5f4..6c0c00a0 100644 --- a/tests/ut_common.hpp.in +++ b/tests/ut_common.hpp.in @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/ut_main.cxx b/tests/ut_main.cxx index 0ccd3be1..75420515 100644 --- a/tests/ut_main.cxx +++ b/tests/ut_main.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -12,10 +16,18 @@ #ifdef GAUXC_HAS_MPI #include #endif +#ifdef GAUXC_HAS_CUDA +#include +#endif int main( int argc, char* argv[] ) { #ifdef GAUXC_HAS_MPI MPI_Init(&argc, &argv); + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); +#ifdef GAUXC_HAS_CUDA + cudaSetDevice(rank); +#endif int result = Catch::Session().run( argc, argv ); MPI_Finalize(); #else diff --git a/tests/weight_derivative_test.cxx b/tests/weight_derivative_test.cxx new file mode 100644 index 00000000..ec53daf8 --- /dev/null +++ b/tests/weight_derivative_test.cxx @@ -0,0 +1,398 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
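weight_derivative_test.cxx, which starts above and continues below, validates analytical partition-weight derivatives against centered finite differences, dw/dR \approx [w(R + h) - w(R - h)] / (2h), whose O(h^2) truncation error motivates the h = 1.0e-5 step and 1.0e-6 margin used in its TEST_CASEs. A minimal, self-contained sketch of that checking pattern follows; it uses a stand-in scalar function rather than GauXC's weight kernels:

#include <cmath>
#include <cstdio>

// Centered finite-difference rule used as the reference in the tests:
//   df/dx ~= [f(x+h) - f(x-h)] / (2h),  truncation error O(h^2).
// Illustrative only; not GauXC code.
template <class F>
double fd_central(F f, double x, double h) {
  return (f(x + h) - f(x - h)) / (2.0 * h);
}

int main() {
  const double x = 0.3, h = 1.0e-5;
  const double approx = fd_central([](double t) { return std::sin(t); }, x, h);
  std::printf("fd: %.12f  analytic: %.12f  |err|: %.3e\n",
              approx, std::cos(x), std::abs(approx - std::cos(x)));
  return 0;
}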
+ * + * See LICENSE.txt for details + */ +#include "ut_common.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Include weights implementation +#include "xc_integrator/local_work_driver/host/reference/weights.hpp" + +using namespace GauXC; + +// Helper function to compute weights for a task +void compute_weights_task(XCWeightAlg weight_alg, const Molecule& mol, const MolMeta& meta, XCTask& task) { + // Construct local work driver + auto lwd = LocalWorkDriverFactory::make_local_work_driver( ExecutionSpace::Host, "Default", LocalWorkSettings() ); + auto* lwd_host = dynamic_cast(lwd.get()); + + std::vector tasks = {task}; + lwd_host->partition_weights(weight_alg, mol, meta, tasks.begin(), tasks.end()); + + // Copy the computed weights back to the original task + task.weights = tasks[0].weights; +} + +// Helper function to compute weights for a task +void compute_int(XCWeightAlg weight_alg, const Molecule& mol, const MolMeta& meta, XCTask& task, + double* f_eval, double* result) { + std::vector tasks = {task}; + + auto lwd = LocalWorkDriverFactory::make_local_work_driver( ExecutionSpace::Host, "Default", LocalWorkSettings() ); + auto* lwd_host = dynamic_cast(lwd.get()); + lwd_host->partition_weights(weight_alg, mol, meta, tasks.begin(), tasks.end()); + + for (size_t i = 0; i < task.points.size(); i++) { + result[0] += tasks[0].weights[i] * f_eval[i]; + } +} + + +// Test function that reads molecule and basis from reference file +void test_weight_1st_deri_host_fdiff(const std::string& reference_file, XCWeightAlg weight_alg, + PruningScheme pruning_scheme, double fdiff_step, double fdiff_tolerance) { + + // Create runtime environment + auto rt = RuntimeEnvironment(GAUXC_MPI_CODE(MPI_COMM_WORLD)); + Molecule mol; + BasisSet basis; + + // Read molecule and basis from HDF5 reference file + read_hdf5_record(mol, reference_file, "/MOLECULE"); + read_hdf5_record(basis, reference_file, "/BASIS"); + + // Set shell tolerance for numerical stability + for(auto& sh : basis) { + sh.set_shell_tolerance(std::numeric_limits::epsilon()); + } + auto mg = MolGridFactory::create_default_molgrid(mol, pruning_scheme, + BatchSize(512), RadialQuad::MuraKnowles, AtomicGridSizeDefault::UltraFineGrid); + + // Construct Load Balancer + LoadBalancerFactory lb_factory(ExecutionSpace::Host, "Default"); + auto lb = lb_factory.get_instance(rt, mol, mg, basis); + + + // Get all XC tasks + auto& tasks = lb.get_tasks(); + size_t natoms = mol.size(); + size_t ntask = tasks.size(); + + auto get_xyz_pointer = [](Atom& atom, size_t i_coord) { + switch(i_coord) { + case 0: return &atom.x; // X coordinate + case 1: return &atom.y; // Y coordinate + case 2: return &atom.z; // Z coordinate + default: throw std::out_of_range("Invalid coordinate index"); + } + }; + + // Calculate finite difference derivatives as ref + std::vector> weight_derivatives_ref(ntask); + for(size_t i_task = 0; i_task < ntask; i_task++) { + weight_derivatives_ref[i_task].resize(3 * natoms * tasks[i_task].npts); + } + for( size_t i_atom = 0; i_atom < mol.size(); i_atom++ ) { + for( size_t i_coord = 0; i_coord < 3; i_coord++ ) { + // Create perturbed molecules + Molecule mol_plus = mol; + Molecule mol_minus = mol; + + // Perturb atom coordinates + double* coord_ptr_plus = get_xyz_pointer(mol_plus[i_atom], i_coord); + double* coord_ptr_minus = get_xyz_pointer(mol_minus[i_atom], i_coord); + double delta = fdiff_step; // Use provided finite difference step + *coord_ptr_plus += delta; // Perturb in positive direction + 
*coord_ptr_minus -= delta; // Perturb in negative direction + + // Create metadata for perturbed molecules + MolMeta meta_plus(mol_plus); + MolMeta meta_minus(mol_minus); + + // Compute weights for perturbed geometries + for(size_t itask = 0; itask < ntask; itask++) { + XCTask task_plus = tasks[itask]; + XCTask task_minus = tasks[itask]; + if (i_atom == (size_t)task_plus.iParent) { + for(size_t ipt = 0; ipt < task_plus.npts; ipt++) { + task_plus.points[ipt][i_coord] += delta; + task_minus.points[ipt][i_coord] -= delta; + } + } + task_plus.dist_nearest = meta_plus.dist_nearest()[task_plus.iParent]; + task_minus.dist_nearest = meta_minus.dist_nearest()[task_minus.iParent]; + + // Compute weights for perturbed geometries + compute_weights_task(weight_alg, mol_plus, meta_plus, task_plus); + compute_weights_task(weight_alg, mol_minus, meta_minus, task_minus); + + // Compute centered finite difference + for(size_t ipt = 0; ipt < task_plus.npts; ipt++) { + weight_derivatives_ref[itask][3 * natoms * ipt + 3 * i_atom + i_coord] = + (task_plus.weights[ipt] - task_minus.weights[ipt]) / (2.0 * delta); + } + } + } + } + + + // Test derivatives for all tasks + for(size_t task_idx = 0; task_idx < ntask; task_idx++) { + auto& task = tasks[task_idx]; + + INFO("Testing task " << task_idx << " with " << task.npts << " points"); + + // Create MolMeta + MolMeta meta(mol); // Compute analytical derivatives + std::vector analytical_derivatives(3 * natoms * task.npts); + compute_weights_task(weight_alg, mol, meta, task); + + switch( weight_alg ) { + case XCWeightAlg::Becke: + reference_becke_weights_1st_derivative_host(mol, meta, task, analytical_derivatives.data()); + break; + case XCWeightAlg::SSF: + reference_ssf_weights_1st_derivative_host(mol, meta, task, analytical_derivatives.data()); + break; + default: + GAUXC_GENERIC_EXCEPTION("Weight Alg Not Supported"); + } + + // Compare with numerical derivatives + double max_error = 0.0; + for(size_t ipt = 0; ipt < task.npts; ipt++) { + for(size_t iatom = 0; iatom < natoms; iatom++) { + for(size_t icoord = 0; icoord < 3; icoord++) { + size_t idx = 3 * natoms * ipt + 3 * iatom + icoord; + double error = std::abs(analytical_derivatives[idx] - weight_derivatives_ref[task_idx][idx]); + max_error = std::max(max_error, error); + + INFO("Task " << task_idx << ", Point " << ipt << ", Atom " << iatom << ", Coord " << icoord + << " iParent: " << task.iParent); + INFO("Analytical: " << analytical_derivatives[idx]); + INFO("Numerical: " << weight_derivatives_ref[task_idx][idx]); + INFO("Error: " << error); + + REQUIRE(analytical_derivatives[idx] == Approx(weight_derivatives_ref[task_idx][idx]).margin(fdiff_tolerance)); + + } + } + } + + // Report statistics for this task + INFO("Task " << task_idx << " - Total derivatives tested: " << (task.npts * natoms * 3)); + INFO("Task " << task_idx << " - Maximum error: " << max_error); + } + + +} + + + +// Test function that reads molecule and basis from reference file +void test_weight_1st_deri_host_fdiff_contracted(const std::string& reference_file, XCWeightAlg weight_alg, + PruningScheme pruning_scheme, double fdiff_step, double fdiff_tolerance) { + + // Create runtime environment + auto rt = RuntimeEnvironment(GAUXC_MPI_CODE(MPI_COMM_WORLD)); + Molecule mol; + BasisSet basis; + + // Read molecule and basis from HDF5 reference file + read_hdf5_record(mol, reference_file, "/MOLECULE"); + read_hdf5_record(basis, reference_file, "/BASIS"); + + // Set shell tolerance for numerical stability + for(auto& sh : basis) { + 
sh.set_shell_tolerance(std::numeric_limits::epsilon()); + } + auto mg = MolGridFactory::create_default_molgrid(mol, pruning_scheme, + BatchSize(512), RadialQuad::MuraKnowles, AtomicGridSizeDefault::UltraFineGrid); + + // Construct Load Balancer + LoadBalancerFactory lb_factory(ExecutionSpace::Host, "Default"); + auto lb = lb_factory.get_instance(rt, mol, mg, basis); + + // Get all XC tasks + auto& tasks = lb.get_tasks(); + size_t natoms = mol.size(); + size_t ntask = tasks.size(); + + // Sort tasks on size (XXX: maybe doesnt matter?) + auto task_comparator = []( const XCTask& a, const XCTask& b ) { + return (a.points.size() * a.bfn_screening.nbe) > (b.points.size() * b.bfn_screening.nbe); + }; + std::stable_sort( tasks.begin(), tasks.end(), task_comparator ); + + // generate a random f_eval vector + std::vector> f_evals(ntask); + for(size_t i_task = 0; i_task < ntask; i_task++) { + f_evals[i_task].resize(tasks[i_task].npts); + for(size_t i_pt = 0; i_pt < tasks[i_task].npts; i_pt++) { + f_evals[i_task][i_pt] = static_cast(rand()) / RAND_MAX; // Random value between 0 and 1 + } + } + + + auto get_xyz_pointer = [](Atom& atom, size_t i_coord) { + switch(i_coord) { + case 0: return &atom.x; // X coordinate + case 1: return &atom.y; // Y coordinate + case 2: return &atom.z; // Z coordinate + default: throw std::out_of_range("Invalid coordinate index"); + } + }; + + // Calculate finite difference derivatives as ref + std::vector> exc_grad_w_ref(ntask); + for(size_t i_task = 0; i_task < ntask; i_task++) { + exc_grad_w_ref[i_task].resize(3 * natoms); + } + for( size_t i_atom = 0; i_atom < mol.size(); i_atom++ ) { + for( size_t i_coord = 0; i_coord < 3; i_coord++ ) { + // Create perturbed molecules + Molecule mol_plus = mol; + Molecule mol_minus = mol; + + // Perturb atom coordinates + double* coord_ptr_plus = get_xyz_pointer(mol_plus[i_atom], i_coord); + double* coord_ptr_minus = get_xyz_pointer(mol_minus[i_atom], i_coord); + double delta = fdiff_step; // Use provided finite difference step + *coord_ptr_plus += delta; // Perturb in positive direction + *coord_ptr_minus -= delta; // Perturb in negative direction + + // Create metadata for perturbed molecules + MolMeta meta_plus(mol_plus); + MolMeta meta_minus(mol_minus); + + // Compute weights for perturbed geometries + for(size_t itask = 0; itask < ntask; itask++) { + XCTask task_plus = tasks[itask]; + XCTask task_minus = tasks[itask]; + if (i_atom == (size_t)task_plus.iParent) { + for(size_t ipt = 0; ipt < task_plus.npts; ipt++) { + task_plus.points[ipt][i_coord] += delta; + task_minus.points[ipt][i_coord] -= delta; + } + } + task_plus.dist_nearest = meta_plus.dist_nearest()[task_plus.iParent]; + task_minus.dist_nearest = meta_minus.dist_nearest()[task_minus.iParent]; + + // Compute weights for perturbed geometries + double result_plus = 0.0, result_minus = 0.0; + compute_int(weight_alg, mol_plus, meta_plus, task_plus, f_evals[itask].data(), &result_plus); + compute_int(weight_alg, mol_minus, meta_minus, task_minus, f_evals[itask].data(), &result_minus); + + // Compute centered finite difference + exc_grad_w_ref[itask][3 * i_atom + i_coord] = + (result_plus - result_minus) / (2.0 * delta); + } + } + } + + // Construct Weights Module + MolecularWeightsFactory mw_factory(ExecutionSpace::Host, "Default", MolecularWeightsSettings{weight_alg, false}); + auto mw = mw_factory.get_instance(); + // Apply partition weights + mw.modify_weights(lb); + + // check lb.state().xc_weight_alg() == weight_alg; + REQUIRE(lb.state().weight_alg == weight_alg); + + auto 
lwd = LocalWorkDriverFactory::make_local_work_driver( ExecutionSpace::Host, "Default", LocalWorkSettings() ); + auto* lwd_host = dynamic_cast(lwd.get()); + + // Create MolMeta + MolMeta meta(mol); + + // Test derivatives for all tasks + std::vector> w_times_fs(ntask); + for(size_t task_idx = 0; task_idx < ntask; task_idx++) { + auto& task = tasks[task_idx]; + + INFO("Testing task " << task_idx << " with " << task.npts << " points"); + + auto w_times_f = w_times_fs[task_idx]; + w_times_f.resize(task.npts); + for(size_t i = 0; i < task.npts; i++) { + w_times_f[i] = task.weights[i] * f_evals[task_idx][i]; + } + + // Compute analytical derivatives + std::vector analytical_derivatives(3 * natoms); + lwd_host->eval_weight_1st_deriv_contracted(weight_alg, mol, meta, task, w_times_f.data(), analytical_derivatives.data()); + + // Compare with numerical derivatives + double max_error = 0.0; + for(size_t iatom = 0; iatom < natoms; iatom++) { + for(size_t icoord = 0; icoord < 3; icoord++) { + size_t idx = 3 * iatom + icoord; + double error = std::abs(analytical_derivatives[idx] - exc_grad_w_ref[task_idx][idx]); + max_error = std::max(max_error, error); + + INFO("Task " << task_idx << ", Atom " << iatom << ", Coord " << icoord + << " iParent: " << task.iParent); + INFO("Analytical: " << analytical_derivatives[idx]); + INFO("Numerical: " << exc_grad_w_ref[task_idx][idx]); + INFO("Error: " << error); + + REQUIRE(analytical_derivatives[idx] == Approx(exc_grad_w_ref[task_idx][idx]).margin(fdiff_tolerance)); + + } + } + + // Report statistics for this task + INFO("Task " << task_idx << " - Total derivatives tested: " << (task.npts * natoms * 3)); + INFO("Task " << task_idx << " - Maximum error: " << max_error); + } + + +} + +TEST_CASE("Weights First Derivative uncontracted HOST fidiff", "[weights_fdiff]") { + + + SECTION( "H3 Becke" ) { + test_weight_1st_deri_host_fdiff(GAUXC_REF_DATA_PATH "/h3_blyp_cc-pvdz_ssf_gks.bin", XCWeightAlg::Becke, + PruningScheme::Unpruned, 1.0e-5, 1.0e-6);} + SECTION( "H3 SSF" ) { + test_weight_1st_deri_host_fdiff(GAUXC_REF_DATA_PATH "/h3_blyp_cc-pvdz_ssf_gks.bin", XCWeightAlg::SSF, + PruningScheme::Unpruned, 1.0e-5, 1.0e-6);} + +} + + +TEST_CASE("Weights First Derivative contracted HOST fidiff", "[weights_fdiff]") { + + + SECTION( "H3 Becke" ) { + test_weight_1st_deri_host_fdiff_contracted(GAUXC_REF_DATA_PATH "/h3_blyp_cc-pvdz_ssf_gks.bin", XCWeightAlg::Becke, + PruningScheme::Unpruned, 1.0e-5, 1.0e-6);} + + // SECTION( "Benzene Becke" ) { + // test_weight_1st_deri_host_fdiff_contracted(GAUXC_REF_DATA_PATH "/benzene_svwn5_cc-pvdz_ufg_ssf.hdf5", XCWeightAlg::Becke, + // PruningScheme::Unpruned, 1.0e-5, 1.0e-6);} + + // SECTION( "Cytosine Becke" ) { + // test_weight_1st_deri_host_fdiff_contracted(GAUXC_REF_DATA_PATH "/cytosine_scan_cc-pvdz_ufg_ssf_robust.hdf5", XCWeightAlg::Becke, + // PruningScheme::Unpruned, 1.0e-5, 1.0e-6);} + + + SECTION( "H3 SSF" ) { + test_weight_1st_deri_host_fdiff_contracted(GAUXC_REF_DATA_PATH "/h3_blyp_cc-pvdz_ssf_gks.bin", XCWeightAlg::SSF, + PruningScheme::Unpruned, 1.0e-5, 1.0e-6);} + // SECTION( "Benzene SSF" ) { + // test_weight_1st_deri_host_fdiff_contracted(GAUXC_REF_DATA_PATH "/benzene_svwn5_cc-pvdz_ufg_ssf.hdf5", XCWeightAlg::SSF, + // PruningScheme::Unpruned, 1.0e-5, 1.0e-6);} + + // SECTION( "Cytosine SSF" ) { + // test_weight_1st_deri_host_fdiff_contracted(GAUXC_REF_DATA_PATH "/cytosine_scan_cc-pvdz_ufg_ssf_robust.hdf5", XCWeightAlg::SSF, + // PruningScheme::Unpruned, 1.0e-5, 1.0e-6);} + + +} \ No newline at end of file diff --git 
a/tests/weights.cxx b/tests/weights.cxx index a56df0fc..e9069a52 100644 --- a/tests/weights.cxx +++ b/tests/weights.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/weights_cuda.hpp b/tests/weights_cuda.hpp index bd1561a5..3951cda4 100644 --- a/tests/weights_cuda.hpp +++ b/tests/weights_cuda.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/weights_generate.hpp b/tests/weights_generate.hpp index 7d586a8b..465c0bf8 100644 --- a/tests/weights_generate.hpp +++ b/tests/weights_generate.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/weights_hip.hpp b/tests/weights_hip.hpp index d5ad45f7..478a7556 100644 --- a/tests/weights_hip.hpp +++ b/tests/weights_hip.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/weights_host.hpp b/tests/weights_host.hpp index e78694d8..f9c51417 100644 --- a/tests/weights_host.hpp +++ b/tests/weights_host.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/xc_integrator.cxx b/tests/xc_integrator.cxx index 88527fa8..947a9914 100644 --- a/tests/xc_integrator.cxx +++ b/tests/xc_integrator.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -37,8 +41,8 @@ void test_xc_integrator( ExecutionSpace ex, const RuntimeEnvironment& rt, BasisSet basis; matrix_type P, Pz, Py, Px, VXC_ref, VXCz_ref, VXCy_ref, VXCx_ref, K_ref; double EXC_ref; - std::vector EXC_GRAD_ref; - bool has_k = false, has_exc_grad = false, rks = true, uks = false, gks = false; + std::vector EXC_GRAD_ref_HellFey, EXC_GRAD_ref_Full; + bool has_k = false, has_exc_grad_HellFey = false, has_exc_grad_full = false, rks = true, uks = false, gks = false; { read_hdf5_record( mol, reference_file, "/MOLECULE" ); read_hdf5_record( basis, reference_file, "/BASIS" ); @@ -110,11 +114,40 @@ void test_xc_integrator( ExecutionSpace ex, const RuntimeEnvironment& rt, dset = file.getDataSet("/EXC"); dset.read( &EXC_ref ); - has_exc_grad = file.exist("/EXC_GRAD"); - if( has_exc_grad ) { - EXC_GRAD_ref.resize( 3*mol.size() ); + // Check for new unified /EXC_GRAD dataset with attribute + if( file.exist("/EXC_GRAD") ) { dset = file.getDataSet("/EXC_GRAD"); - dset.read( EXC_GRAD_ref.data() ); + EXC_GRAD_ref_Full.resize( 3*mol.size() ); + + // Check for attribute indicating whether weight derivatives are included + bool exc_grad_includes_weight_derivatives = false; // Default to Hellmann-Feynman + try { + auto attr = dset.getAttribute("includes_weight_derivatives"); + int attr_value; + attr.read( attr_value ); + exc_grad_includes_weight_derivatives = (attr_value != 0); + } catch(... ) { } + + if( exc_grad_includes_weight_derivatives ) { + dset.read( EXC_GRAD_ref_Full.data() ); + has_exc_grad_full = true; + } else { + dset.read( EXC_GRAD_ref_HellFey.data() ); + has_exc_grad_HellFey = true; + } + } + // Check for other type of EXC_GRAD + if( file.exist("/EXC_GRAD_HELLFEY") and not has_exc_grad_HellFey ) { + EXC_GRAD_ref_HellFey.resize( 3*mol.size() ); + dset = file.getDataSet("/EXC_GRAD_HELLFEY"); + dset.read( EXC_GRAD_ref_HellFey.data() ); + has_exc_grad_HellFey = true; + } + if( file.exist("/EXC_GRAD_FULL") and not has_exc_grad_full ) { + EXC_GRAD_ref_Full.resize( 3*mol.size() ); + dset = file.getDataSet("/EXC_GRAD_FULL"); + dset.read( EXC_GRAD_ref_Full.data() ); + has_exc_grad_full = true; } has_k = file.exist("/K"); @@ -125,7 +158,7 @@ void test_xc_integrator( ExecutionSpace ex, const RuntimeEnvironment& rt, } } - if( (uks or gks) and ex == ExecutionSpace::Device and func.is_mgga() ) return; + if( gks and ex == ExecutionSpace::Device and func.is_mgga() ) return; for( auto& sh : basis ) sh.set_shell_tolerance( std::numeric_limits::epsilon() ); @@ -240,14 +273,29 @@ void test_xc_integrator( ExecutionSpace ex, const RuntimeEnvironment& rt, // Check EXC Grad - if( check_grad and has_exc_grad and rks) { - auto EXC_GRAD = integrator.eval_exc_grad( P ); + if( check_grad and has_exc_grad_full ) { + IntegratorSettingsEXC_GRAD exc_grad_settings; + exc_grad_settings.include_weight_derivatives = true; // Use full gradient (default) + auto EXC_GRAD = rks ? 
integrator.eval_exc_grad( P, exc_grad_settings ) : integrator.eval_exc_grad( P, Pz, exc_grad_settings ); using map_type = Eigen::Map; - map_type EXC_GRAD_ref_map( EXC_GRAD_ref.data(), mol.size(), 3 ); + map_type EXC_GRAD_ref_map( EXC_GRAD_ref_Full.data(), mol.size(), 3 ); map_type EXC_GRAD_map( EXC_GRAD.data(), mol.size(), 3 ); auto EXC_GRAD_diff_nrm = (EXC_GRAD_ref_map - EXC_GRAD_map).norm(); - CHECK( EXC_GRAD_diff_nrm / std::sqrt(3.0*mol.size()) < 1e-10 ); + INFO("comparing full gradient"); + CHECK( EXC_GRAD_diff_nrm / std::sqrt(3.0*mol.size()) < 1e-8 ); } + if( check_grad and has_exc_grad_HellFey ) { + IntegratorSettingsEXC_GRAD exc_grad_settings; + exc_grad_settings.include_weight_derivatives = false; // Use Hellmann-Feynman gradient + auto EXC_GRAD = rks ? integrator.eval_exc_grad( P, exc_grad_settings ) : integrator.eval_exc_grad( P, Pz, exc_grad_settings ); + using map_type = Eigen::Map; + map_type EXC_GRAD_ref_map( EXC_GRAD_ref_HellFey.data(), mol.size(), 3 ); + map_type EXC_GRAD_map( EXC_GRAD.data(), mol.size(), 3 ); + auto EXC_GRAD_diff_nrm = (EXC_GRAD_ref_map - EXC_GRAD_map).norm(); + INFO("comparing Hellmann-Feynman gradient"); + CHECK( EXC_GRAD_diff_nrm / std::sqrt(3.0*mol.size()) < 1e-8 ); + } + // Check K if( has_k and check_k and rks ) { @@ -311,12 +359,10 @@ void test_integrator(std::string reference_file, functional_type& func, PruningS #ifdef GAUXC_HAS_CUTLASS SECTION( "Incore - MPI Reduction - CUTLASS" ) { - if(not func.is_mgga() and not func.is_polarized()) { - test_xc_integrator( ExecutionSpace::Device, rt, - reference_file, func, pruning_scheme, - false, true, false, "Default", "Default", - "Scheme1-CUTLASS" ); - } + test_xc_integrator( ExecutionSpace::Device, rt, + reference_file, func, pruning_scheme, + true, true, false, "Default", "Default", + "Scheme1-CUTLASS" ); } #endif @@ -329,11 +375,11 @@ void test_integrator(std::string reference_file, functional_type& func, PruningS } #endif - SECTION( "ShellBatched" ) { - test_xc_integrator( ExecutionSpace::Device, rt, - reference_file, func, pruning_scheme, - false, false, false, "ShellBatched" ); - } + // SECTION( "ShellBatched" ) { + // test_xc_integrator( ExecutionSpace::Device, rt, + // reference_file, func, pruning_scheme, + // false, false, false, "ShellBatched" ); + // } } #endif @@ -353,6 +399,7 @@ TEST_CASE( "XC Integrator", "[xc-integrator]" ) { auto blyp = ExchCXX::Functional::BLYP; auto scan = ExchCXX::Functional::SCAN; auto r2scanl = ExchCXX::Functional::R2SCANL; + auto m062x = ExchCXX::Functional::M062X; // LDA Test SECTION( "Benzene / SVWN5 / cc-pVDZ" ) { @@ -384,6 +431,12 @@ TEST_CASE( "XC Integrator", "[xc-integrator]" ) { test_integrator(GAUXC_REF_DATA_PATH "/cytosine_scan_cc-pvdz_ufg_ssf_robust.hdf5", func, PruningScheme::Robust ); } + // This tests gradients + SECTION( "Benzene / M06-2X / def2-svp") { + auto func = make_functional(m062x, unpol); + test_integrator(GAUXC_REF_DATA_PATH "/benzene_m062x_def2-svp_ufg_ssf.hdf5", + func, PruningScheme::Unpruned ); + } // MGGA Test (TAU + LAPL) SECTION( "Cytosine / R2SCANL / cc-pVDZ") { @@ -398,6 +451,12 @@ TEST_CASE( "XC Integrator", "[xc-integrator]" ) { test_integrator(GAUXC_REF_DATA_PATH "/li_svwn5_sto3g_uks.bin", func, PruningScheme::Unpruned ); } + // + grad + SECTION( "Cytosine (doublet) / SVWN5 / cc-pVDZ") { + auto func = make_functional(svwn5, pol); + test_integrator(GAUXC_REF_DATA_PATH "/cytosine_svwn5_cc-pvdz_ufg_ssf_robust_uks.hdf5", + func, PruningScheme::Robust ); + } //UKS GGA Test SECTION( "Li / BLYP / sto-3g" ) { @@ -405,6 +464,12 @@ 
TEST_CASE( "XC Integrator", "[xc-integrator]" ) { test_integrator(GAUXC_REF_DATA_PATH "/li_blyp_sto3g_uks.bin", func, PruningScheme::Unpruned ); } + // + grad + SECTION( "Cytosine (doublet) / BLYP / cc-pVDZ") { + auto func = make_functional(blyp, pol); + test_integrator(GAUXC_REF_DATA_PATH "/cytosine_blyp_cc-pvdz_ufg_ssf_robust_uks.hdf5", + func, PruningScheme::Robust ); + } // UKS MGGA Test (TAU Only) SECTION( "Cytosine (doublet) / SCAN / cc-pVDZ") {