diff --git a/.github/workflows/build_and_test_compiler_zoo.yml b/.github/workflows/build_and_test_compiler_zoo.yml index 430e3f37..38ea93ae 100644 --- a/.github/workflows/build_and_test_compiler_zoo.yml +++ b/.github/workflows/build_and_test_compiler_zoo.yml @@ -14,7 +14,7 @@ jobs: image: dbwy/chemistry strategy: matrix: - compiler: [ {suite: gnu, version: 12}, {suite: llvm, version: 14} ] + compiler: [ {suite: gnu, version: 12} ] mpi_flag: [ON, OFF] openmp_flag: [ON, OFF] exclude: diff --git a/.gitignore b/.gitignore index d92d4623..65b35b0d 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ src/xc_integrator/local_work_driver/host/obara_saika/test/*.x src/xc_integrator/local_work_driver/host/obara_saika/generator/integral* src/xc_integrator/local_work_driver/host/obara_saika/generator/obara* src/xc_integrator/local_work_driver/host/obara_saika/generator/*.x +*.swp diff --git a/CMakeLists.txt b/CMakeLists.txt index 57cd29f8..94efc973 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required( VERSION 3.20 FATAL_ERROR ) include(FetchContent) set( FETCHCONTENT_UPDATES_DISCONNECTED ON CACHE BOOL "Disable FC Updates" ) -project( GauXC VERSION 0.0.1 LANGUAGES C CXX ) +project( GauXC VERSION 1.0.0 LANGUAGES C CXX ) # Place local modules in the path list( PREPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake ) diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 00000000..686e5e7a --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,10 @@ +# Microsoft Open Source Code of Conduct + +This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). + +Resources: + +- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) +- [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) +- Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns +- Employees can reach out at [aka.ms/opensource/moderation-support](https://aka.ms/opensource/moderation-support) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..ebf23aca --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,14 @@ +# Contributing + +This project welcomes contributions and suggestions. Most contributions require you to +agree to a Contributor License Agreement (CLA) declaring that you have the right to, +and actually do, grant us the rights to use your contribution. For details, visit +https://cla.microsoft.com. + +When you submit a pull request, a CLA-bot will automatically determine whether you need +to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the +instructions provided by the bot. You will only need to do this once across all repositories using our CLA. + +This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). +For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) +or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md new file mode 100644 index 00000000..689d4e67 --- /dev/null +++ b/CONTRIBUTORS.md @@ -0,0 +1,17 @@ +# This is the list of GauXC's significant contributors. +# +# This does not necessarily list everyone who has contributed code. +# To see the full list of contributors, see the revision history in +# source control. 
+ +Primary Developer and Maintainer: David Williams--Young - Microsoft (davidwillia at microsoft dot com) + +* Thom Popovici (LBNL) +* Teri Lambros (UW) +* Mikael Kovtun (UW) +* Daniel Mejia-Rodriguez (PNNL) + +* Yingrong Chen (Microsoft) +* Jiashu Liang (Microsoft) +* David Clark (NVIDIA) +* Damon McDougall (AMD) diff --git a/LICENSE.txt b/LICENSE.txt index c4c69413..f2904dad 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,6 +1,10 @@ -GauXC Copyright (c) 2020, The Regents of the University of California, +GauXC Copyright (c) 2020-2024, The Regents of the University of California, through Lawrence Berkeley National Laboratory (subject to receipt of -any required approvals from the U.S. Dept. of Energy). All rights reserved. +any required approvals from the U.S. Dept. of Energy). + +(c) 2024-2025, Microsoft Corporation + +All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/NOTICE.md b/NOTICE.md new file mode 100644 index 00000000..4fcbf5de --- /dev/null +++ b/NOTICE.md @@ -0,0 +1,38 @@ +# NOTICES + +This repository incorporates material as listed below or described in the code. + +------------------------------------------------------------------------------- +gau2grid. + +BSD 3-Clause License + +Copyright (c) 2017, Daniel Smith +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +------------------------------------------------------------------------------- + diff --git a/README.md b/README.md index 35fa5f97..082ac6cd 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,15 @@ # About -GauXC Copyright (c) 2020-2024, The Regents of the University of California, +GauXC + +Copyright (c) 2020-2024, The Regents of the University of California, through Lawrence Berkeley National Laboratory (subject to receipt of -any required approvals from the U.S. Dept. of Energy). All rights reserved. +any required approvals from the U.S. Dept. of Energy). + +(c) 2024-2025, Microsoft Corporation + +All rights reserved. 
-If you have questions about your rights to use or distribute this software, -please contact Berkeley Lab's Intellectual Property Office at -IPO@lbl.gov. NOTICE. This Software was developed under funding from the U.S. Department of Energy and the U.S. Government consequently retains certain rights. As @@ -29,12 +32,7 @@ frameworks to target NVIDIA and AMD GPUs, respectively. Evaluation of the XC functional CPU/accelerator architectures is provided by the [ExchCXX](https://github.com/wavefunction91/ExchCXX) library. Quadratures are generated -by the [IntegratorXX](https://github.com/wavefunction91/IntegratorXX). - -GauXC is a work in progress. Its development has been funded by the U.S. -Department of Energy Exascale Computing Project -([NWChemEx](https://github.com/NWChemEx-Project)). - +by the [IntegratorXX](https://github.com/wavefunction91/IntegratorXX) library. # Design Goals @@ -62,17 +60,7 @@ for flexible and agile development in the field of KS-DFT. # Major Contributors -Primary Developer and Maintainer: David Williams--Young - LBNL (dbwy at lbl dot gov) - -GauXC has received major contributions from the following developers (in no particular order): -* Thom Popovici (LBNL) - Optimized sn-K kernels for CPU and GPU architectures -* Teri Lambros (UW) - Unrestricted (UKS) and Generalized (GKS) DFT -* Daniel Mejia-Rodriguez (PNNL) - Meta-GGA DFT - -We have also receieved significant support from industry collaborators: -* David Clark (NVIDIA) - Optimization of critical kernels for NVIDIA architectures -* Damon McDougall (AMD) - Optimization of critical kernels for AMDGPU architectures - +See CONTRIBUTORS.md for a list of major contributors to GauXC. # Publications @@ -229,7 +217,7 @@ target_link_libraries( my_target PUBLIC gauxc::gauxc ) # Example Usage -Coming Soon.... See `test/standalone_driver.cxx` for an example end-to-end invocation of GauXC for various integrands. +See `test/standalone_driver.cxx` for an example end-to-end invocation of GauXC for various integrands. # License @@ -239,6 +227,15 @@ LICENSE.txt for details. # Acknowledgments -The development of GauXC is supported by the Exascale Computing Project +The development of GauXC was previously supported by the Exascale Computing Project (17-SC-20-SC), a collaborative effort of the U.S. Department of Energy Office of Science and the National Nuclear Security Administration. + +## Trademarks + +This project may contain trademarks or logos for projects, products, or +services. Authorized use of Microsoft trademarks or logos is subject to and +must follow Microsoft’s Trademark & Brand Guidelines. Use of Microsoft +trademarks or logos in modified versions of this project must not cause +confusion or imply Microsoft sponsorship. Any use of third-party trademarks or +logos is subject to those third parties’ policies. diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 00000000..656f7918 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,14 @@ + + +## Security + +Microsoft takes the security of our software products and services seriously, which +includes all source code repositories in our GitHub organizations. + +**Please do not report security vulnerabilities through public GitHub issues.** + +For security reporting information, locations, contact information, and policies, +please review the latest guidance for Microsoft repositories at +[https://aka.ms/SECURITY.md](https://aka.ms/SECURITY.md). 
+ + diff --git a/cmake/gauxc-dep-versions.cmake b/cmake/gauxc-dep-versions.cmake index cd3969d8..8ab0aa11 100644 --- a/cmake/gauxc-dep-versions.cmake +++ b/cmake/gauxc-dep-versions.cmake @@ -11,13 +11,13 @@ set( GAUXC_CUTLASS_REPOSITORY https://github.com/NVIDIA/cutlass.git ) set( GAUXC_CUTLASS_REVISION v2.10.0 ) set( GAUXC_EXCHCXX_REPOSITORY https://github.com/wavefunction91/ExchCXX.git ) -set( GAUXC_EXCHCXX_REVISION 21a4700a826ec0beae1311a1d59677393bcb168f ) +set( GAUXC_EXCHCXX_REVISION v1.0.0 ) set( GAUXC_GAU2GRID_REPOSITORY https://github.com/dgasmith/gau2grid.git ) set( GAUXC_GAU2GRID_REVISION v2.0.6 ) set( GAUXC_INTEGRATORXX_REPOSITORY https://github.com/wavefunction91/IntegratorXX.git ) -set( GAUXC_INTEGRATORXX_REVISION ea07dedd37e7bd49ea06394eb811599002b34b49 ) +set( GAUXC_INTEGRATORXX_REVISION cf2917c64916583cef1081011beab3085b66e352 ) set( GAUXC_HIGHFIVE_REPOSITORY https://github.com/BlueBrain/HighFive.git ) set( GAUXC_HIGHFIVE_REVISION 805f0e13d09b47c4b01d40682621904aa3b31bb8 ) diff --git a/include/gauxc/atom.hpp b/include/gauxc/atom.hpp index 3f9771c0..72b0673b 100644 --- a/include/gauxc/atom.hpp +++ b/include/gauxc/atom.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/basisset.hpp b/include/gauxc/basisset.hpp index 9cef7ee7..c0c0f839 100644 --- a/include/gauxc/basisset.hpp +++ b/include/gauxc/basisset.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/basisset_map.hpp b/include/gauxc/basisset_map.hpp index ad9acb94..53f6d9d8 100644 --- a/include/gauxc/basisset_map.hpp +++ b/include/gauxc/basisset_map.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/enums.hpp b/include/gauxc/enums.hpp index ce7c19e1..76d4500c 100644 --- a/include/gauxc/enums.hpp +++ b/include/gauxc/enums.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -15,9 +19,10 @@ namespace GauXC { * Generally mapped to equivalent enums in IntegratorXX */ enum class RadialQuad { + Becke, ///< Becke radial quadrature MuraKnowles, ///< Mura-Knowles radial quadrature MurrayHandyLaming, ///< Murray-Handy-Laming radial quadrature - TreutlerAldrichs ///< Treutler-Aldrichs radial quadrature + TreutlerAhlrichs ///< Treutler-Ahlrichs radial quadrature }; /** * @@ -29,8 +34,8 @@ enum class AtomicGridSizeDefault { FineGrid, ///< Fine grid (least accurate) UltraFineGrid, ///< Ultrafine grid (appropriate accuracy) SuperFineGrid, ///< Superfine grid (most accurate) - GM3, ///< Treutler-Aldrichs GM3 - GM5 ///< Treutlet-Aldrichs GM5 + GM3, ///< Treutler-Ahlrichs GM3 + GM5 ///< Treutler-Ahlrichs GM5 }; /** * @@ -38,6 +43,7 @@ enum class XCWeightAlg { + NOTPARTITIONED, ///< Not partitioned Becke, ///< The original Becke weighting scheme SSF, ///< The Stratmann-Scuseria-Frisch weighting scheme LKO ///< The Lauqua-Kuessman-Ochsenfeld weighting scheme diff --git a/include/gauxc/exceptions.hpp b/include/gauxc/exceptions.hpp index ac16bfbd..84b9b489 100644 --- a/include/gauxc/exceptions.hpp +++ b/include/gauxc/exceptions.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/external/cereal.hpp b/include/gauxc/external/cereal.hpp index c4cd1a90..ba0b6ef9 100644 --- a/include/gauxc/external/cereal.hpp +++ b/include/gauxc/external/cereal.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/external/hdf5.hpp b/include/gauxc/external/hdf5.hpp index 8d7ad01e..434d0893 100644 --- a/include/gauxc/external/hdf5.hpp +++ b/include/gauxc/external/hdf5.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/gauxc_config.hpp.in b/include/gauxc/gauxc_config.hpp.in index a7f0ce69..86fe7485 100644 --- a/include/gauxc/gauxc_config.hpp.in +++ b/include/gauxc/gauxc_config.hpp.in @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/include/gauxc/grid.hpp b/include/gauxc/grid.hpp index 8a45e2f3..af7f8f2a 100644 --- a/include/gauxc/grid.hpp +++ b/include/gauxc/grid.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/grid_factory.hpp b/include/gauxc/grid_factory.hpp index 70ecbb88..ecf65526 100644 --- a/include/gauxc/grid_factory.hpp +++ b/include/gauxc/grid_factory.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -52,7 +56,7 @@ PrunedAtomicGridSpecification robust_psi4_pruning_scheme( UnprunedAtomicGridSpecification ); -/// Generate a Pruning specification according to the Treutler-Aldrichs scheme from an unpruned specification +/// Generate a Pruning specification according to the Treutler-Ahlrichs scheme from an unpruned specification PrunedAtomicGridSpecification treutler_pruning_scheme( UnprunedAtomicGridSpecification ); @@ -61,7 +65,7 @@ PrunedAtomicGridSpecification treutler_pruning_scheme( enum class PruningScheme { Unpruned, /// Unpruned atomic quadrature Robust, /// The "Robust" scheme of Psi4 - Treutler /// The Treutler-Aldrichs scheme + Treutler /// The Treutler-Ahlrichs scheme }; /// Generate a pruning specification from a specificed pruning scheme and diff --git a/include/gauxc/load_balancer.hpp b/include/gauxc/load_balancer.hpp index 738464f2..d420656a 100644 --- a/include/gauxc/load_balancer.hpp +++ b/include/gauxc/load_balancer.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -15,6 +19,7 @@ #include #include #include +#include namespace GauXC { @@ -27,6 +32,8 @@ namespace detail { struct LoadBalancerState { bool modified_weights_are_stored = false; ///< Whether the load balancer currently stores partitioned weights + XCWeightAlg weight_alg = XCWeightAlg::NOTPARTITIONED; + ///< Weight partitioning scheme used by this LoadBalancer }; @@ -77,6 +84,9 @@ class LoadBalancer { /// Return internal timing tracker const util::Timer& get_timings() const; + /// Return the total number of points for local tasks + size_t total_npts() const; + /// Return the maximum number of points for local tasks size_t max_npts() const; diff --git a/include/gauxc/molecular_weights.hpp b/include/gauxc/molecular_weights.hpp index ed2e7d3f..74f1e922 100644 --- a/include/gauxc/molecular_weights.hpp +++ b/include/gauxc/molecular_weights.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/molecule.hpp b/include/gauxc/molecule.hpp index ce9aaa6a..9f4fe6a7 100644 --- a/include/gauxc/molecule.hpp +++ b/include/gauxc/molecule.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/molgrid.hpp b/include/gauxc/molgrid.hpp index 40bfd48d..d58dc494 100644 --- a/include/gauxc/molgrid.hpp +++ b/include/gauxc/molgrid.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/molgrid/defaults.hpp b/include/gauxc/molgrid/defaults.hpp index f3c3bf5e..0565647d 100644 --- a/include/gauxc/molgrid/defaults.hpp +++ b/include/gauxc/molgrid/defaults.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -14,6 +18,7 @@ namespace GauXC { double slater_radius_64(AtomicNumber); double slater_radius_30(AtomicNumber); double clementi_radius_67(AtomicNumber); + double uff_radius_103(AtomicNumber); double default_atomic_radius(AtomicNumber); RadialScale default_mk_radial_scaling_factor( AtomicNumber ); diff --git a/include/gauxc/molmeta.hpp b/include/gauxc/molmeta.hpp index cd6ee8d6..12918c6e 100644 --- a/include/gauxc/molmeta.hpp +++ b/include/gauxc/molmeta.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/named_type.hpp b/include/gauxc/named_type.hpp index 7034df8e..cf7a776c 100644 --- a/include/gauxc/named_type.hpp +++ b/include/gauxc/named_type.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/reduction_driver.hpp b/include/gauxc/reduction_driver.hpp index 1d9eaa96..f3bef188 100644 --- a/include/gauxc/reduction_driver.hpp +++ b/include/gauxc/reduction_driver.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/runtime_environment.hpp b/include/gauxc/runtime_environment.hpp index 84edd290..4b0b08f5 100644 --- a/include/gauxc/runtime_environment.hpp +++ b/include/gauxc/runtime_environment.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/runtime_environment/decl.hpp b/include/gauxc/runtime_environment/decl.hpp index 5cc63fb0..424f9d98 100644 --- a/include/gauxc/runtime_environment/decl.hpp +++ b/include/gauxc/runtime_environment/decl.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -77,6 +81,8 @@ class DeviceRuntimeEnvironment : public RuntimeEnvironment { bool owns_memory() const; DeviceBackend* device_backend() const; + void release_buffer(); + void set_buffer(void* m, size_t sz); }; #endif diff --git a/include/gauxc/runtime_environment/fwd.hpp b/include/gauxc/runtime_environment/fwd.hpp index 58910933..23f726e9 100644 --- a/include/gauxc/runtime_environment/fwd.hpp +++ b/include/gauxc/runtime_environment/fwd.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/shell.hpp b/include/gauxc/shell.hpp index f75a4949..7f27170c 100644 --- a/include/gauxc/shell.hpp +++ b/include/gauxc/shell.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -225,7 +229,6 @@ class alignas(256) Shell { }; -#if 0 template inline std::ostream& operator<<( std::ostream& os, const Shell& sh ) { os << "GauXC::Shell:( O={" @@ -234,9 +237,6 @@ inline std::ostream& operator<<( std::ostream& os, const Shell& sh ) { os << " "; os << " {l=" << sh.l() << ",sph=" << sh.pure() << "}"; os << std::endl; - os << " {cr=" << sh.cutoff_radius() << ",cv=" << sh.cutoff_val() - <<",mr=" << sh.max_radius() << ",mv=" << sh.max_val() << "}"; - os << std::endl; for(auto i=0ul; i& sh ) { return os; } -#endif } diff --git a/include/gauxc/shell_pair.hpp b/include/gauxc/shell_pair.hpp index e3288198..643ab8e3 100644 --- a/include/gauxc/shell_pair.hpp +++ b/include/gauxc/shell_pair.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/types.hpp b/include/gauxc/types.hpp index 2c1b7c8d..aad5d9ba 100644 --- a/include/gauxc/types.hpp +++ b/include/gauxc/types.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/include/gauxc/util/constexpr_math.hpp b/include/gauxc/util/constexpr_math.hpp index bfe3be4d..3d8e9d87 100644 --- a/include/gauxc/util/constexpr_math.hpp +++ b/include/gauxc/util/constexpr_math.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/util/contiguous_container_data.hpp b/include/gauxc/util/contiguous_container_data.hpp index d29a6131..f5d35fd1 100644 --- a/include/gauxc/util/contiguous_container_data.hpp +++ b/include/gauxc/util/contiguous_container_data.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/util/div_ceil.hpp b/include/gauxc/util/div_ceil.hpp index 772d6c74..8a39aa67 100644 --- a/include/gauxc/util/div_ceil.hpp +++ b/include/gauxc/util/div_ceil.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/util/environment.hpp b/include/gauxc/util/environment.hpp index 953c5334..2a0a98d5 100644 --- a/include/gauxc/util/environment.hpp +++ b/include/gauxc/util/environment.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/util/gau_rad_eval.hpp b/include/gauxc/util/gau_rad_eval.hpp index 47f896bf..b2aa7f91 100644 --- a/include/gauxc/util/gau_rad_eval.hpp +++ b/include/gauxc/util/gau_rad_eval.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/include/gauxc/util/geometry.hpp b/include/gauxc/util/geometry.hpp index 62992dca..97a8da2a 100644 --- a/include/gauxc/util/geometry.hpp +++ b/include/gauxc/util/geometry.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/util/misc.hpp b/include/gauxc/util/misc.hpp index 671f34c6..cf2ef8f0 100644 --- a/include/gauxc/util/misc.hpp +++ b/include/gauxc/util/misc.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/util/real_solid_harmonics.hpp b/include/gauxc/util/real_solid_harmonics.hpp index 9501e557..3394da02 100644 --- a/include/gauxc/util/real_solid_harmonics.hpp +++ b/include/gauxc/util/real_solid_harmonics.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/util/timer.hpp b/include/gauxc/util/timer.hpp index a70d9298..545fa35b 100644 --- a/include/gauxc/util/timer.hpp +++ b/include/gauxc/util/timer.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/util/unused.hpp b/include/gauxc/util/unused.hpp index cd993122..e6dd054a 100644 --- a/include/gauxc/util/unused.hpp +++ b/include/gauxc/util/unused.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/xc_integrator.hpp b/include/gauxc/xc_integrator.hpp index e08da39c..03feaf93 100644 --- a/include/gauxc/xc_integrator.hpp +++ b/include/gauxc/xc_integrator.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. 
Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -36,6 +40,10 @@ class XCIntegrator { using exc_vxc_type_gks = std::tuple< value_type, matrix_type, matrix_type, matrix_type, matrix_type >; using exc_grad_type = std::vector< value_type >; using exx_type = matrix_type; + using fxc_contraction_type_rks = matrix_type; + using fxc_contraction_type_uks = std::tuple< matrix_type, matrix_type >; + using dd_psi_type = std::vector< value_type >; + using dd_psi_potential_type = matrix_type; private: @@ -66,11 +74,19 @@ class XCIntegrator { exc_vxc_type_gks eval_exc_vxc ( const MatrixType&, const MatrixType&, const MatrixType&, const MatrixType&, const IntegratorSettingsXC& = IntegratorSettingsXC{}); - exc_grad_type eval_exc_grad( const MatrixType& ); + exc_grad_type eval_exc_grad( const MatrixType&, const IntegratorSettingsXC& = IntegratorSettingsXC{} ); + exc_grad_type eval_exc_grad( const MatrixType&, const MatrixType&, const IntegratorSettingsXC& = IntegratorSettingsXC{} ); exx_type eval_exx ( const MatrixType&, const IntegratorSettingsEXX& = IntegratorSettingsEXX{} ); + fxc_contraction_type_rks eval_fxc_contraction ( const MatrixType&, const MatrixType&, + const IntegratorSettingsXC& = IntegratorSettingsXC{} ); + fxc_contraction_type_uks eval_fxc_contraction ( const MatrixType&, const MatrixType&, const MatrixType&, const MatrixType&, + const IntegratorSettingsXC& = IntegratorSettingsXC{} ); + + dd_psi_type eval_dd_psi( const MatrixType&, unsigned ); + dd_psi_potential_type eval_dd_psi_potential( const MatrixType&, unsigned ); const util::Timer& get_timings() const; const LoadBalancer& load_balancer() const; diff --git a/include/gauxc/xc_integrator/impl.hpp b/include/gauxc/xc_integrator/impl.hpp index 85a655cc..400afb7c 100644 --- a/include/gauxc/xc_integrator/impl.hpp +++ b/include/gauxc/xc_integrator/impl.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -76,9 +80,16 @@ typename XCIntegrator::exc_vxc_type_gks template typename XCIntegrator::exc_grad_type - XCIntegrator::eval_exc_grad( const MatrixType& P ) { + XCIntegrator::eval_exc_grad( const MatrixType& P, const IntegratorSettingsXC& ks_settings ) { + if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); + return pimpl_->eval_exc_grad(P, ks_settings); +}; + +template +typename XCIntegrator::exc_grad_type + XCIntegrator::eval_exc_grad( const MatrixType& Ps, const MatrixType& Pz, const IntegratorSettingsXC& ks_settings ) { if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); - return pimpl_->eval_exc_grad(P); + return pimpl_->eval_exc_grad(Ps, Pz, ks_settings); }; template @@ -89,6 +100,37 @@ typename XCIntegrator::exx_type return pimpl_->eval_exx(P,settings); }; +template +typename XCIntegrator::fxc_contraction_type_rks + XCIntegrator::eval_fxc_contraction( const MatrixType& P, const MatrixType& tP, + const IntegratorSettingsXC& ks_settings ) { + if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); + return pimpl_->eval_fxc_contraction(P, tP, ks_settings); +}; + +template +typename XCIntegrator::fxc_contraction_type_uks + XCIntegrator::eval_fxc_contraction( const MatrixType& Ps, const MatrixType& Pz, + const MatrixType& tPs, const MatrixType& tPz, const IntegratorSettingsXC& ks_settings ) { + if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); + return pimpl_->eval_fxc_contraction(Ps, Pz, tPs, tPz, ks_settings); +}; + +template +typename XCIntegrator::dd_psi_type + XCIntegrator::eval_dd_psi(const MatrixType& P, unsigned max_Ylm) { + if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); + return pimpl_->eval_dd_psi(P, max_Ylm); +} + +template +typename XCIntegrator::dd_psi_potential_type + XCIntegrator::eval_dd_psi_potential(const MatrixType& X, unsigned max_Ylm) { + if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); + return pimpl_->eval_dd_psi_potential(X, max_Ylm); +} + + template const util::Timer& XCIntegrator::get_timings() const { if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); diff --git a/include/gauxc/xc_integrator/integrator_factory.hpp b/include/gauxc/xc_integrator/integrator_factory.hpp index d63d23be..54a1c4a3 100644 --- a/include/gauxc/xc_integrator/integrator_factory.hpp +++ b/include/gauxc/xc_integrator/integrator_factory.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/xc_integrator/local_work_driver.hpp b/include/gauxc/xc_integrator/local_work_driver.hpp index bb37b319..50eb3d32 100644 --- a/include/gauxc/xc_integrator/local_work_driver.hpp +++ b/include/gauxc/xc_integrator/local_work_driver.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/include/gauxc/xc_integrator/replicated/impl.hpp b/include/gauxc/xc_integrator/replicated/impl.hpp index a892f5e3..bfc95fc8 100644 --- a/include/gauxc/xc_integrator/replicated/impl.hpp +++ b/include/gauxc/xc_integrator/replicated/impl.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -159,13 +163,27 @@ typename ReplicatedXCIntegrator::exc_vxc_type_gks template typename ReplicatedXCIntegrator::exc_grad_type - ReplicatedXCIntegrator::eval_exc_grad_( const MatrixType& P ) { + ReplicatedXCIntegrator::eval_exc_grad_( const MatrixType& P, const IntegratorSettingsXC& ks_settings ) { if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); std::vector EXC_GRAD( 3*pimpl_->load_balancer().molecule().natoms() ); pimpl_->eval_exc_grad( P.rows(), P.cols(), P.data(), P.rows(), - EXC_GRAD.data() ); + EXC_GRAD.data(), ks_settings ); + + return EXC_GRAD; + +} + +template +typename ReplicatedXCIntegrator::exc_grad_type + ReplicatedXCIntegrator::eval_exc_grad_( const MatrixType& Ps, const MatrixType& Pz, const IntegratorSettingsXC& ks_settings ) { + + if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); + + std::vector EXC_GRAD( 3*pimpl_->load_balancer().molecule().natoms() ); + pimpl_->eval_exc_grad( Ps.rows(), Ps.cols(), Ps.data(), Ps.rows(), Pz.data(), Pz.rows(), + EXC_GRAD.data(), ks_settings ); return EXC_GRAD; @@ -184,6 +202,67 @@ typename ReplicatedXCIntegrator::exx_type return K; +} +template +typename ReplicatedXCIntegrator::fxc_contraction_type_rks + ReplicatedXCIntegrator::eval_fxc_contraction_( const MatrixType& P, + const MatrixType& tP, const IntegratorSettingsXC& ks_settings ) { + + if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); + matrix_type FXC( P.rows(), P.cols() ); + + pimpl_->eval_fxc_contraction( P.rows(), P.cols(), P.data(), P.rows(), + tP.data(), tP.rows(), + FXC.data(), FXC.rows(), ks_settings ); + + return FXC; +} + +template +typename ReplicatedXCIntegrator::fxc_contraction_type_uks + ReplicatedXCIntegrator::eval_fxc_contraction_( const MatrixType& Ps, const MatrixType& Pz, + const MatrixType& tPs, const MatrixType& tPz, const IntegratorSettingsXC& ks_settings ) { + + if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); + matrix_type FXCs( Ps.rows(), Ps.cols() ); + matrix_type FXCz( Pz.rows(), Pz.cols() ); + + pimpl_->eval_fxc_contraction( Ps.rows(), Ps.cols(), Ps.data(), Ps.rows(), + Pz.data(), Pz.rows(), + tPs.data(), tPs.rows(), + tPz.data(), tPz.rows(), + FXCs.data(), FXCs.rows(), + FXCz.data(), FXCz.rows(), ks_settings ); + + return std::make_tuple( FXCs, FXCz ); + +} + +template +typename ReplicatedXCIntegrator::dd_psi_type + ReplicatedXCIntegrator::eval_dd_psi_( const MatrixType& P, unsigned max_Ylm ) { + + if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); + + const size_t natoms = pimpl_->load_balancer().molecule().natoms(); + const size_t Ylm_sz = (max_Ylm + 1) * ( max_Ylm + 1); + std::vector ddPsi(natoms * Ylm_sz, 0.0); + pimpl_->eval_dd_psi(P.rows(), P.cols(), P.data(), P.rows(), max_Ylm, ddPsi.data(), Ylm_sz); + return ddPsi; +} + +template +typename ReplicatedXCIntegrator::dd_psi_potential_type + ReplicatedXCIntegrator::eval_dd_psi_potential_( const MatrixType& X, unsigned 
max_Ylm ) { + + if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); + + const size_t nbf = pimpl_->load_balancer().basis().nbf(); + matrix_type Vddx(nbf, nbf); + Vddx.setZero(); + pimpl_->eval_dd_psi_potential(X.rows(), X.cols(), X.data(), max_Ylm, Vddx.data()); + return Vddx; + } } diff --git a/include/gauxc/xc_integrator/replicated/replicated_xc_device_integrator.hpp b/include/gauxc/xc_integrator/replicated/replicated_xc_device_integrator.hpp index 4721243c..9454e60e 100644 --- a/include/gauxc/xc_integrator/replicated/replicated_xc_device_integrator.hpp +++ b/include/gauxc/xc_integrator/replicated/replicated_xc_device_integrator.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/xc_integrator/replicated/replicated_xc_host_integrator.hpp b/include/gauxc/xc_integrator/replicated/replicated_xc_host_integrator.hpp index 57685599..4f3476f1 100644 --- a/include/gauxc/xc_integrator/replicated/replicated_xc_host_integrator.hpp +++ b/include/gauxc/xc_integrator/replicated/replicated_xc_host_integrator.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/xc_integrator/replicated/replicated_xc_integrator_factory.hpp b/include/gauxc/xc_integrator/replicated/replicated_xc_integrator_factory.hpp index d75a92d8..dc881b1f 100644 --- a/include/gauxc/xc_integrator/replicated/replicated_xc_integrator_factory.hpp +++ b/include/gauxc/xc_integrator/replicated/replicated_xc_integrator_factory.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/include/gauxc/xc_integrator/replicated/replicated_xc_integrator_impl.hpp b/include/gauxc/xc_integrator/replicated/replicated_xc_integrator_impl.hpp index 70c33db5..45731512 100644 --- a/include/gauxc/xc_integrator/replicated/replicated_xc_integrator_impl.hpp +++ b/include/gauxc/xc_integrator/replicated/replicated_xc_integrator_impl.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -74,11 +78,30 @@ class ReplicatedXCIntegratorImpl { value_type* VXCx, int64_t ldvxcx, value_type* EXC, const IntegratorSettingsXC& ks_settings ) = 0; - virtual void eval_exc_grad_( int64_t m, int64_t n, const value_type* P, - int64_t ldp, value_type* EXC_GRAD ) = 0; + virtual void eval_exc_grad_( int64_t m, int64_t n, const value_type* P, int64_t ldp, + value_type* EXC_GRAD, const IntegratorSettingsXC& ks_settings ) = 0; + virtual void eval_exc_grad_( int64_t m, int64_t n, const value_type* P, int64_t ldps, + const value_type* Pz, int64_t lpdz, value_type* EXC_GRAD, const IntegratorSettingsXC& ks_settings ) = 0; virtual void eval_exx_( int64_t m, int64_t n, const value_type* P, int64_t ldp, value_type* K, int64_t ldk, const IntegratorSettingsEXX& settings ) = 0; + virtual void eval_fxc_contraction_( int64_t m, int64_t n, + const value_type* P, int64_t ldp, + const value_type* tP, int64_t ldtp, + value_type* FXC, int64_t ldfxc, + const IntegratorSettingsXC& ks_settings )=0; + virtual void eval_fxc_contraction_( int64_t m, int64_t n, + const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, + const value_type* tPs, int64_t ldtps, + const value_type* tPz, int64_t ldtpz, + value_type* FXCs, int64_t ldfxcs, + value_type* FXCz, int64_t ldfxcz, + const IntegratorSettingsXC& ks_settings )=0; + virtual void eval_dd_psi_( int64_t m, int64_t n, const value_type* P, int64_t ldp, unsigned max_Ylm, + value_type* ddPsi, int64_t ldPsi ) = 0; + virtual void eval_dd_psi_potential_( int64_t m, int64_t n, const value_type* X, unsigned max_Ylm, + value_type* Vddx) = 0; public: @@ -130,13 +153,36 @@ class ReplicatedXCIntegratorImpl { value_type* EXC, const IntegratorSettingsXC& ks_settings ); - void eval_exc_grad( int64_t m, int64_t n, const value_type* P, - int64_t ldp, value_type* EXC_GRAD ); + void eval_exc_grad( int64_t m, int64_t n, const value_type* P, int64_t ldp, + value_type* EXC_GRAD, const IntegratorSettingsXC& ks_settings ); + void eval_exc_grad( int64_t m, int64_t n, const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, value_type* EXC_GRAD, const IntegratorSettingsXC& ks_settings ); void eval_exx( int64_t m, int64_t n, const value_type* P, int64_t ldp, value_type* K, int64_t ldk, const IntegratorSettingsEXX& settings ); + void eval_fxc_contraction( int64_t m, int64_t n, const value_type* P, + int64_t ldp, + const value_type* tP, int64_t ldtp, + value_type* FXC, int64_t ldfxc, + const IntegratorSettingsXC& ks_settings ); + + void eval_fxc_contraction( int64_t m, int64_t n, const value_type* Ps, + int64_t ldps, + const value_type* Pz, int64_t ldpz, + const value_type* tPs, int64_t ldtps, + const value_type* tPz, int64_t ldtpz, + value_type* FXCs, int64_t ldfxcs, + value_type* FXCz, int64_t ldfxcz, + const IntegratorSettingsXC& ks_settings ); + + void eval_dd_psi( int64_t m, int64_t n, const value_type* P, + int64_t ldp, unsigned max_Ylm, + value_type* ddPsi, int64_t ldPsi ); + void eval_dd_psi_potential( int64_t m, int64_t n, const value_type* X, unsigned max_Ylm, + value_type* Vddx ); + inline const util::Timer& get_timings() const { return timer_; } inline std::unique_ptr< LocalWorkDriver > release_local_work_driver() { diff --git a/include/gauxc/xc_integrator/replicated_xc_integrator.hpp b/include/gauxc/xc_integrator/replicated_xc_integrator.hpp index ac93a4f0..1ca53f91 100644 --- a/include/gauxc/xc_integrator/replicated_xc_integrator.hpp +++ b/include/gauxc/xc_integrator/replicated_xc_integrator.hpp @@ -1,7 +1,11 @@ /** 
* GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -33,6 +37,10 @@ class ReplicatedXCIntegrator : public XCIntegratorImpl { using exc_vxc_type_gks = typename XCIntegratorImpl::exc_vxc_type_gks; using exc_grad_type = typename XCIntegratorImpl::exc_grad_type; using exx_type = typename XCIntegratorImpl::exx_type; + using fxc_contraction_type_rks = typename XCIntegratorImpl::fxc_contraction_type_rks; + using fxc_contraction_type_uks = typename XCIntegratorImpl::fxc_contraction_type_uks; + using dd_psi_type = typename XCIntegratorImpl::dd_psi_type; + using dd_psi_potential_type = typename XCIntegratorImpl::dd_psi_potential_type; private: @@ -46,8 +54,13 @@ class ReplicatedXCIntegrator : public XCIntegratorImpl { exc_vxc_type_rks eval_exc_vxc_ ( const MatrixType&, const IntegratorSettingsXC& ) override; exc_vxc_type_uks eval_exc_vxc_ ( const MatrixType&, const MatrixType&, const IntegratorSettingsXC&) override; exc_vxc_type_gks eval_exc_vxc_ ( const MatrixType&, const MatrixType&, const MatrixType&, const MatrixType&, const IntegratorSettingsXC& ) override; - exc_grad_type eval_exc_grad_( const MatrixType& ) override; + exc_grad_type eval_exc_grad_( const MatrixType&, const IntegratorSettingsXC& ) override; + exc_grad_type eval_exc_grad_( const MatrixType&, const MatrixType&, const IntegratorSettingsXC& ) override; exx_type eval_exx_ ( const MatrixType&, const IntegratorSettingsEXX& ) override; + fxc_contraction_type_rks eval_fxc_contraction_ ( const MatrixType&, const MatrixType&, const IntegratorSettingsXC& ) override; + fxc_contraction_type_uks eval_fxc_contraction_ ( const MatrixType&, const MatrixType&, const MatrixType&, const MatrixType&, const IntegratorSettingsXC&) override; + dd_psi_type eval_dd_psi_( const MatrixType& , unsigned ) override; + dd_psi_potential_type eval_dd_psi_potential_( const MatrixType& , unsigned ) override; const util::Timer& get_timings_() const override; const LoadBalancer& get_load_balancer_() const override; LoadBalancer& get_load_balancer_() override; diff --git a/include/gauxc/xc_integrator/xc_integrator_impl.hpp b/include/gauxc/xc_integrator/xc_integrator_impl.hpp index 1406bf8e..ba7bebeb 100644 --- a/include/gauxc/xc_integrator/xc_integrator_impl.hpp +++ b/include/gauxc/xc_integrator/xc_integrator_impl.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -25,6 +29,10 @@ class XCIntegratorImpl { using exc_vxc_type_gks = typename XCIntegrator::exc_vxc_type_gks; using exc_grad_type = typename XCIntegrator::exc_grad_type; using exx_type = typename XCIntegrator::exx_type; + using fxc_contraction_type_rks = typename XCIntegrator::fxc_contraction_type_rks; + using fxc_contraction_type_uks = typename XCIntegrator::fxc_contraction_type_uks; + using dd_psi_type = typename XCIntegrator::dd_psi_type; + using dd_psi_potential_type = typename XCIntegrator::dd_psi_potential_type; protected: @@ -38,9 +46,18 @@ class XCIntegratorImpl { virtual exc_vxc_type_uks eval_exc_vxc_ ( const MatrixType& Ps, const MatrixType& Pz, const IntegratorSettingsXC& ks_settings ) = 0; virtual exc_vxc_type_gks eval_exc_vxc_ ( const MatrixType& Ps, const MatrixType& Pz, const MatrixType& Py, const MatrixType& Px, const IntegratorSettingsXC& ks_settings ) = 0; - virtual exc_grad_type eval_exc_grad_( const MatrixType& P ) = 0; + virtual exc_grad_type eval_exc_grad_( const MatrixType& P, const IntegratorSettingsXC& ks_settings ) = 0; + virtual exc_grad_type eval_exc_grad_( const MatrixType& Ps, const MatrixType& Pz, const IntegratorSettingsXC& ks_settings ) = 0; virtual exx_type eval_exx_ ( const MatrixType& P, const IntegratorSettingsEXX& settings ) = 0; + virtual fxc_contraction_type_rks eval_fxc_contraction_ ( const MatrixType& P, + const MatrixType& tP, const IntegratorSettingsXC& ks_settings ) = 0; + virtual fxc_contraction_type_uks eval_fxc_contraction_ ( const MatrixType& Ps, const MatrixType& Pz, + const MatrixType& tPs, const MatrixType& tPz, const IntegratorSettingsXC& ks_settings ) = 0; + + + virtual dd_psi_type eval_dd_psi_( const MatrixType& P, unsigned max_Ylm ) = 0; + virtual dd_psi_potential_type eval_dd_psi_potential_( const MatrixType& X, unsigned max_Ylm ) = 0; virtual const util::Timer& get_timings_() const = 0; virtual const LoadBalancer& get_load_balancer_() const = 0; virtual LoadBalancer& get_load_balancer_() = 0; @@ -108,14 +125,21 @@ class XCIntegratorImpl { } /** Integrate EXC gradient for RKS - * - * TODO: add API for UKS/GKS * * @param[in] P The alpha density matrix * @returns EXC gradient */ - exc_grad_type eval_exc_grad( const MatrixType& P ) { - return eval_exc_grad_(P); + exc_grad_type eval_exc_grad( const MatrixType& P, const IntegratorSettingsXC& ks_settings ) { + return eval_exc_grad_(P, ks_settings); + } + + /** Integrate EXC gradient for UKS + * + * @param[in] Ps,Pz The scalar (Pa + Pb) and Z (Pa - Pb) density matrices + * @returns EXC gradient + */ + exc_grad_type eval_exc_grad( const MatrixType& Ps, const MatrixType& Pz, const IntegratorSettingsXC& ks_settings ) { + return eval_exc_grad_(Ps, Pz, ks_settings); } /** Integrate Exact Exchange for RHF @@ -127,6 +151,50 @@ class XCIntegratorImpl { return eval_exx_(P,settings); } + + /** Integrate FXC contraction for RKS + * + * @param[in] P the alpha density matrix + * @param[in] tP the alpha trial density matrix (constructed from perturbed MO coefficients) + * @returns FXC contraction + */ + fxc_contraction_type_rks eval_fxc_contraction( const MatrixType& P, const MatrixType& tP, const IntegratorSettingsXC& ks_settings ) { + return eval_fxc_contraction_(P, tP, ks_settings); + } + + /** Integrate FXC contraction for UKS + * + * @param[in] Ps the scalar density matrix (Pa + Pb) + * @param[in] Pz the Z density matrix (Pa - Pb) + * @param[in] tPs the trial scalar density matrices (constructed from perturbed MO coefficients) + * @param[in] tPz the trial Z density matrices (constructed from 
perturbed MO coefficients) + * @returns FXC contraction + */ + fxc_contraction_type_uks eval_fxc_contraction( const MatrixType& Ps, const MatrixType& Pz, + const MatrixType& tPs, const MatrixType& tPz, const IntegratorSettingsXC& ks_settings ) { + return eval_fxc_contraction_(Ps, Pz, tPs, tPz, ks_settings); + } + + /** Evaluate Psi vector for ddX + * + * @param[in] P The density matrix + * @param[in] max_Ylm The max "l" degree for Ylm + * @returns The atomic contributions to the spherical-harmonic (SH) projection of the density onto the DD domains + */ + dd_psi_type eval_dd_psi( const MatrixType& P, unsigned max_Ylm ) { + return eval_dd_psi_(P,max_Ylm); + } + + /** Evaluate Psi Potential for ddX + * + * @param[in] X The local ASC coefficients, (nharmonics, atom) array in column-major ordering. + * @param[in] max_Ylm The max "l" degree for Ylm + * @returns Fock contributions + */ + dd_psi_potential_type eval_dd_psi_potential( const MatrixType& X, unsigned max_Ylm ) { + return eval_dd_psi_potential_(X,max_Ylm); + } + /** Get internal timers * * @returns Timer instance for internal timings diff --git a/include/gauxc/xc_integrator_settings.hpp b/include/gauxc/xc_integrator_settings.hpp index dc90cc61..1ec26d0e 100644 --- a/include/gauxc/xc_integrator_settings.hpp +++ b/include/gauxc/xc_integrator_settings.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -21,4 +25,8 @@ struct IntegratorSettingsKS : public IntegratorSettingsXC { double gks_dtol = 1e-12; }; +struct IntegratorSettingsEXC_GRAD : public IntegratorSettingsKS { + bool include_weight_derivatives = true; // whether to include grid-weight derivative contributions and exploit translational invariance, or use only the Hellmann-Feynman gradient +}; + } diff --git a/include/gauxc/xc_task.hpp b/include/gauxc/xc_task.hpp index 1f70418f..630d6dd6 100644 --- a/include/gauxc/xc_task.hpp +++ b/include/gauxc/xc_task.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7e51e9fa..27909d7f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/atomic_radii.cxx b/src/atomic_radii.cxx index 6e3a829f..52275309 100644 --- a/src/atomic_radii.cxx +++ b/src/atomic_radii.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. 
of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -318,4 +322,31 @@ double clementi_radius_67(AtomicNumber _Z) { } +// UFF atomic radii +// Atomic radii derived from the universal force field +// A. K. Rappe et al. J. Am. Chem. Soc., 1992, 114 (25), pp 10024-10035 +// https://doi.org/10.1021/ja00051a040, data given in Ångström, +// will be converted to Bohr. Entries are indexed by atomic number (Z = 1 is the first entry). +const std::vector<double> radius_uff_list = {1.443, 1.81, 1.2255, 1.3725, 2.0415, 1.9255, 1.83, 1.75, + 1.682, 1.6215, 1.4915, 1.5105, + 2.2495, 2.1475, 2.0735, 2.0175, 1.9735, 1.934, 1.906, 1.6995, 1.6475, + 1.5875, 1.572, 1.5115, 1.4805, 1.456, 1.436, 1.417, 1.7475, + 1.3815, 2.1915, 2.14, 2.115, 2.1025, 2.0945, 2.0705, 2.057, + 1.8205, 1.6725, 1.562, 1.5825, 1.526, 1.499, 1.4815, 1.4645, + 1.4495, 1.574, 1.424, 2.2315, 2.196, 2.21, 2.235, 2.25, 2.202, + 2.2585, 1.8515, 1.761, 1.778, 1.803, 1.7875, 1.7735, 1.76, 1.7465, + 1.684, 1.7255, 1.714, 1.7045, 1.6955, 1.687, 1.6775, 1.82, 1.5705, + 1.585, 1.5345, 1.477, 1.56, 1.42, 1.377, 1.6465, 1.3525, 2.1735, 2.1485, + 2.185, 2.3545, 2.375, 2.3825, 2.45, 1.8385, 1.739, 1.698, 1.712, 1.6975, + 1.712, 1.712, 1.6905, 1.663, 1.6695, 1.6565, 1.6495, 1.643, 1.637, 1.624, 1.618}; + +double uff_radius_103(AtomicNumber _Z) { + const double RADIUS_UFF_SCALING = 1.1; + const double DDX_BOHR_TO_ANGSTROM = 0.52917721092; + auto Z = _Z.get(); + if (Z < 1 || Z > radius_uff_list.size()) { + return -1.; + } + return radius_uff_list[Z-1] * RADIUS_UFF_SCALING / DDX_BOHR_TO_ANGSTROM; +} } diff --git a/src/exceptions/cublas_exception.hpp b/src/exceptions/cublas_exception.hpp index 84fc3c31..503fc900 100644 --- a/src/exceptions/cublas_exception.hpp +++ b/src/exceptions/cublas_exception.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/exceptions/cuda_exception.hpp b/src/exceptions/cuda_exception.hpp index 15ae3d3d..6d4767d1 100644 --- a/src/exceptions/cuda_exception.hpp +++ b/src/exceptions/cuda_exception.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/exceptions/cutlass_exception.hpp b/src/exceptions/cutlass_exception.hpp index 49a3192d..4de854be 100644 --- a/src/exceptions/cutlass_exception.hpp +++ b/src/exceptions/cutlass_exception.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
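The uff_radius_103 helper above scales the tabulated UFF radius by 1.1 and converts from Ångström to Bohr. Below is a self-contained arithmetic check of that conversion; the main function and printed value are illustrative only, using the carbon entry from the table.

    // Standalone check of the scaling/conversion applied by uff_radius_103 above.
    #include <cstdio>

    int main() {
      const double r_angstrom         = 1.9255;         // UFF radius of carbon (Z = 6) from the table
      const double RADIUS_UFF_SCALING = 1.1;            // same scaling factor as the patch
      const double BOHR_IN_ANGSTROM   = 0.52917721092;  // 1 Bohr expressed in Angstrom

      // r[Bohr] = r[Angstrom] * 1.1 / 0.52917721092
      const double r_bohr = r_angstrom * RADIUS_UFF_SCALING / BOHR_IN_ANGSTROM;
      std::printf( "C: %.4f Bohr\n", r_bohr );          // prints approximately 4.0025
      return 0;
    }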
* * See LICENSE.txt for details */ diff --git a/src/exceptions/hip_exception.hpp b/src/exceptions/hip_exception.hpp index 1fd22dbe..08a40302 100644 --- a/src/exceptions/hip_exception.hpp +++ b/src/exceptions/hip_exception.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/exceptions/hipblas_exception.hpp b/src/exceptions/hipblas_exception.hpp index 9bb39011..bb89a331 100644 --- a/src/exceptions/hipblas_exception.hpp +++ b/src/exceptions/hipblas_exception.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/exceptions/magma_exception.hpp b/src/exceptions/magma_exception.hpp index bb0e40ec..30056573 100644 --- a/src/exceptions/magma_exception.hpp +++ b/src/exceptions/magma_exception.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/external/CMakeLists.txt b/src/external/CMakeLists.txt index 3df13b30..fa1f7f37 100644 --- a/src/external/CMakeLists.txt +++ b/src/external/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/external/hdf5_read.cxx b/src/external/hdf5_read.cxx index cae9c865..c01424c4 100644 --- a/src/external/hdf5_read.cxx +++ b/src/external/hdf5_read.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/external/hdf5_util.hpp b/src/external/hdf5_util.hpp index 5c5cb696..9569734f 100644 --- a/src/external/hdf5_util.hpp +++ b/src/external/hdf5_util.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). 
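The IntegratorSettingsEXC_GRAD struct introduced in the xc_integrator_settings.hpp hunk earlier exposes a single switch for the gradient driver. A short sketch of toggling it; the free-function wrapper is illustrative only.

    // Request a Hellmann-Feynman-only EXC gradient by disabling weight derivatives.
    #include <gauxc/xc_integrator_settings.hpp>

    GauXC::IntegratorSettingsEXC_GRAD make_hellmann_feynman_settings() {
      GauXC::IntegratorSettingsEXC_GRAD settings;   // inherits the KS defaults (gks_dtol, etc.)
      settings.include_weight_derivatives = false;  // skip grid-weight derivative terms
      return settings;
    }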
+ * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/external/hdf5_write.cxx b/src/external/hdf5_write.cxx index f782ef4a..cbf8bf04 100644 --- a/src/external/hdf5_write.cxx +++ b/src/external/hdf5_write.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/grid.cxx b/src/grid.cxx index 3f167270..fed7972f 100644 --- a/src/grid.cxx +++ b/src/grid.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/grid_factory.cxx b/src/grid_factory.cxx index 92cc5314..1836653e 100644 --- a/src/grid_factory.cxx +++ b/src/grid_factory.cxx @@ -1,16 +1,21 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ #include -#include -#include -#include -#include +#include +#include +#include +#include +#include #include #include @@ -30,6 +35,7 @@ Grid AtomicGridFactory::generate_grid( atomic_grid_variant gs, BatchSize bsz ) { Grid AtomicGridFactory::generate_unpruned_grid( RadialQuad rq, RadialSize nrad, AngularSize nang, RadialScale rscal, BatchSize bsz) { + using bk_type = IntegratorXX::Becke; using mk_type = IntegratorXX::MuraKnowles; using mhl_type = IntegratorXX::MurrayHandyLaming; using ta_type = IntegratorXX::TreutlerAhlrichs; @@ -38,6 +44,9 @@ Grid AtomicGridFactory::generate_unpruned_grid( RadialQuad rq, RadialSize nrad, ll_type ang_quad( nang.get() ); switch( rq ) { + case RadialQuad::Becke: + return generate_unpruned_grid( bk_type(nrad.get(), rscal.get()), + std::move(ang_quad), bsz ); case RadialQuad::MuraKnowles: return generate_unpruned_grid( mk_type(nrad.get(), rscal.get()), @@ -47,7 +56,7 @@ Grid AtomicGridFactory::generate_unpruned_grid( RadialQuad rq, RadialSize nrad, return generate_unpruned_grid( mhl_type(nrad.get(), rscal.get()), std::move(ang_quad), bsz ); - case RadialQuad::TreutlerAldrichs: + case RadialQuad::TreutlerAhlrichs: return generate_unpruned_grid( ta_type(nrad.get(), rscal.get()), std::move(ang_quad), bsz ); @@ -113,12 +122,18 @@ Grid AtomicGridFactory::generate_pruned_grid( RadialQuad rq, return generate_pruned_grid(std::move(rg), std::move(rgp), bsz); } - case RadialQuad::TreutlerAldrichs: + case RadialQuad::TreutlerAhlrichs: { auto [rg, rgp] = make_pruned_grid( nrad, pruning_regions, rscal ); return generate_pruned_grid(std::move(rg), std::move(rgp), bsz); } + case RadialQuad::Becke: + { + auto[rg, rgp] = + make_pruned_grid>( nrad, pruning_regions, rscal ); + return generate_pruned_grid(std::move(rg), 
std::move(rgp), bsz); + } default: GAUXC_GENERIC_EXCEPTION("Unsupported Radial Quadrature"); @@ -145,17 +160,18 @@ PrunedAtomicGridSpecification robust_psi4_pruning_scheme( // Look up order // XXX: THIS ONLY WORKS FOR LEBEDEV - using namespace IntegratorXX::detail::lebedev; + using angular_type = IntegratorXX::LebedevLaikov; + using traits = IntegratorXX::quadrature_traits; const auto asz = unp.angular_size.get(); - const auto base_order = algebraic_order_by_npts(asz); + const auto base_order = traits::algebraic_order_by_npts(asz); if( base_order < 0 ) GAUXC_GENERIC_EXCEPTION("Invalid Base Grid"); const auto med_order = - next_algebraic_order(base_order > 6 ? base_order-6 : base_order); + traits::next_algebraic_order(base_order > 6 ? base_order-6 : base_order); const auto low_order = 7; - AngularSize med_sz(npts_by_algebraic_order(med_order)); - AngularSize low_sz(npts_by_algebraic_order(low_order)); + AngularSize med_sz(traits::npts_by_algebraic_order(med_order)); + AngularSize low_sz(traits::npts_by_algebraic_order(low_order)); // Create Pruning Regions const size_t rsz = unp.radial_size.get(); @@ -183,9 +199,11 @@ PrunedAtomicGridSpecification treutler_pruning_scheme( // Look up order // XXX: THIS ONLY WORKS FOR LEBEDEV - using namespace IntegratorXX::detail::lebedev; - AngularSize med_sz(npts_by_algebraic_order(med_order)); - AngularSize low_sz(npts_by_algebraic_order(low_order)); + using angular_type = IntegratorXX::LebedevLaikov; + using traits = IntegratorXX::quadrature_traits; + + AngularSize med_sz(traits::npts_by_algebraic_order(med_order)); + AngularSize low_sz(traits::npts_by_algebraic_order(low_order)); // Create Pruning Regions const size_t rsz = unp.radial_size.get(); diff --git a/src/grid_impl.cxx b/src/grid_impl.cxx index 07bb4eaa..069dadba 100644 --- a/src/grid_impl.cxx +++ b/src/grid_impl.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/grid_impl.hpp b/src/grid_impl.hpp index 3566d9f7..29b88c9f 100644 --- a/src/grid_impl.hpp +++ b/src/grid_impl.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/load_balancer/CMakeLists.txt b/src/load_balancer/CMakeLists.txt index 457f2c6b..3dca6a6a 100644 --- a/src/load_balancer/CMakeLists.txt +++ b/src/load_balancer/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. 
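The grid_factory.cxx changes above add RadialQuad::Becke to both the unpruned and pruned generation paths and correct the TreutlerAldrichs spelling to TreutlerAhlrichs. A minimal sketch of requesting a Becke-mapped atomic grid through the factory, using the enum/strong-type API shown in the hunk; the header path is an assumption.

    // Sketch: build an unpruned atomic grid with the newly supported Becke radial mapping.
    #include <gauxc/grid_factory.hpp>   // assumed header exposing AtomicGridFactory

    GauXC::Grid make_becke_atom_grid() {
      using namespace GauXC;
      return AtomicGridFactory::generate_unpruned_grid(
        RadialQuad::Becke,   // new case in the radial-quadrature switch above
        RadialSize(75),      // 75 radial shells
        AngularSize(302),    // 302-point Lebedev-Laikov angular grid
        RadialScale(1.0),    // Becke mapping parameter (cf. default_bk_radial_scaling_factor later in this patch)
        BatchSize(512) );    // batching hint for task generation
    }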
# # See LICENSE.txt for details # diff --git a/src/load_balancer/device/CMakeLists.txt b/src/load_balancer/device/CMakeLists.txt index 6bd03e77..00b5dee2 100644 --- a/src/load_balancer/device/CMakeLists.txt +++ b/src/load_balancer/device/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/load_balancer/device/cuda/CMakeLists.txt b/src/load_balancer/device/cuda/CMakeLists.txt index 04632cf4..49e0d17c 100644 --- a/src/load_balancer/device/cuda/CMakeLists.txt +++ b/src/load_balancer/device/cuda/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/load_balancer/device/cuda/cuda_collision_detection.cu b/src/load_balancer/device/cuda/cuda_collision_detection.cu index 6292d67c..7da69c72 100644 --- a/src/load_balancer/device/cuda/cuda_collision_detection.cu +++ b/src/load_balancer/device/cuda/cuda_collision_detection.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/load_balancer/device/cuda/cuda_collision_detection.hpp b/src/load_balancer/device/cuda/cuda_collision_detection.hpp index 9b09712a..d7156496 100644 --- a/src/load_balancer/device/cuda/cuda_collision_detection.hpp +++ b/src/load_balancer/device/cuda/cuda_collision_detection.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/load_balancer/device/cuda/replicated_cuda_load_balancer.cxx b/src/load_balancer/device/cuda/replicated_cuda_load_balancer.cxx index 15d33900..af2199b7 100644 --- a/src/load_balancer/device/cuda/replicated_cuda_load_balancer.cxx +++ b/src/load_balancer/device/cuda/replicated_cuda_load_balancer.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/load_balancer/device/cuda/replicated_cuda_load_balancer.hpp b/src/load_balancer/device/cuda/replicated_cuda_load_balancer.hpp index 7e0a0d65..585edde3 100644 --- a/src/load_balancer/device/cuda/replicated_cuda_load_balancer.hpp +++ b/src/load_balancer/device/cuda/replicated_cuda_load_balancer.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/load_balancer/device/hip/CMakeLists.txt b/src/load_balancer/device/hip/CMakeLists.txt index e13db053..cd1e4aaf 100644 --- a/src/load_balancer/device/hip/CMakeLists.txt +++ b/src/load_balancer/device/hip/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/load_balancer/device/hip/hip_collision_detection.hip b/src/load_balancer/device/hip/hip_collision_detection.hip index 83ee78d6..89d0978f 100644 --- a/src/load_balancer/device/hip/hip_collision_detection.hip +++ b/src/load_balancer/device/hip/hip_collision_detection.hip @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/load_balancer/device/hip/hip_collision_detection.hpp b/src/load_balancer/device/hip/hip_collision_detection.hpp index db73bdd8..a191d18f 100644 --- a/src/load_balancer/device/hip/hip_collision_detection.hpp +++ b/src/load_balancer/device/hip/hip_collision_detection.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/load_balancer/device/hip/replicated_hip_load_balancer.cxx b/src/load_balancer/device/hip/replicated_hip_load_balancer.cxx index c4475157..ac693e1e 100644 --- a/src/load_balancer/device/hip/replicated_hip_load_balancer.cxx +++ b/src/load_balancer/device/hip/replicated_hip_load_balancer.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/load_balancer/device/hip/replicated_hip_load_balancer.hpp b/src/load_balancer/device/hip/replicated_hip_load_balancer.hpp index 7e0a0d65..585edde3 100644 --- a/src/load_balancer/device/hip/replicated_hip_load_balancer.hpp +++ b/src/load_balancer/device/hip/replicated_hip_load_balancer.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/load_balancer/device/load_balancer_device_factory.cxx b/src/load_balancer/device/load_balancer_device_factory.cxx index 606a6b57..e481f4f2 100644 --- a/src/load_balancer/device/load_balancer_device_factory.cxx +++ b/src/load_balancer/device/load_balancer_device_factory.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/load_balancer/device/load_balancer_device_factory.hpp b/src/load_balancer/device/load_balancer_device_factory.hpp index c28535c2..f61a3f44 100644 --- a/src/load_balancer/device/load_balancer_device_factory.hpp +++ b/src/load_balancer/device/load_balancer_device_factory.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/load_balancer/host/fillin_replicated_load_balancer.cxx b/src/load_balancer/host/fillin_replicated_load_balancer.cxx index 04e2f908..a84d40d3 100644 --- a/src/load_balancer/host/fillin_replicated_load_balancer.cxx +++ b/src/load_balancer/host/fillin_replicated_load_balancer.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/load_balancer/host/fillin_replicated_load_balancer.hpp b/src/load_balancer/host/fillin_replicated_load_balancer.hpp index c6df5f94..eb40cefb 100644 --- a/src/load_balancer/host/fillin_replicated_load_balancer.hpp +++ b/src/load_balancer/host/fillin_replicated_load_balancer.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). 
+ * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/load_balancer/host/load_balancer_host_factory.cxx b/src/load_balancer/host/load_balancer_host_factory.cxx index 94db35de..f69d7fd9 100644 --- a/src/load_balancer/host/load_balancer_host_factory.cxx +++ b/src/load_balancer/host/load_balancer_host_factory.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/load_balancer/host/load_balancer_host_factory.hpp b/src/load_balancer/host/load_balancer_host_factory.hpp index d02e5c03..ae878678 100644 --- a/src/load_balancer/host/load_balancer_host_factory.hpp +++ b/src/load_balancer/host/load_balancer_host_factory.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/load_balancer/host/petite_replicated_load_balancer.cxx b/src/load_balancer/host/petite_replicated_load_balancer.cxx index 0bef2b83..3ecc53c1 100644 --- a/src/load_balancer/host/petite_replicated_load_balancer.cxx +++ b/src/load_balancer/host/petite_replicated_load_balancer.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/load_balancer/host/petite_replicated_load_balancer.hpp b/src/load_balancer/host/petite_replicated_load_balancer.hpp index 02b8a8bf..8c339699 100644 --- a/src/load_balancer/host/petite_replicated_load_balancer.hpp +++ b/src/load_balancer/host/petite_replicated_load_balancer.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/load_balancer/host/replicated_host_load_balancer.cxx b/src/load_balancer/host/replicated_host_load_balancer.cxx index 5a3bb9c9..8f05f186 100644 --- a/src/load_balancer/host/replicated_host_load_balancer.cxx +++ b/src/load_balancer/host/replicated_host_load_balancer.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). 
+ * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -123,7 +127,7 @@ std::vector< XCTask > HostReplicatedLoadBalancer::create_local_tasks_() const { } // Loop over Atoms -//return local_work; +// return local_work; // Lexicographic ordering of tasks auto task_order = []( const auto& a, const auto& b ) { diff --git a/src/load_balancer/host/replicated_host_load_balancer.hpp b/src/load_balancer/host/replicated_host_load_balancer.hpp index dfd8f319..9b4d0a08 100644 --- a/src/load_balancer/host/replicated_host_load_balancer.hpp +++ b/src/load_balancer/host/replicated_host_load_balancer.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/load_balancer/load_balancer.cxx b/src/load_balancer/load_balancer.cxx index 9329fef8..637e1d8b 100644 --- a/src/load_balancer/load_balancer.cxx +++ b/src/load_balancer/load_balancer.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -51,6 +55,10 @@ const util::Timer& LoadBalancer::get_timings() const { return pimpl_->get_timings(); } +size_t LoadBalancer::total_npts() const { + if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); + return pimpl_->total_npts(); +} size_t LoadBalancer::max_npts() const { if( not pimpl_ ) GAUXC_PIMPL_NOT_INITIALIZED(); return pimpl_->max_npts(); diff --git a/src/load_balancer/load_balancer_factory.cxx b/src/load_balancer/load_balancer_factory.cxx index b14ddbee..bdc2898e 100644 --- a/src/load_balancer/load_balancer_factory.cxx +++ b/src/load_balancer/load_balancer_factory.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/load_balancer/load_balancer_impl.cxx b/src/load_balancer/load_balancer_impl.cxx index 06dbbd19..f6b853da 100644 --- a/src/load_balancer/load_balancer_impl.cxx +++ b/src/load_balancer/load_balancer_impl.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -59,6 +63,14 @@ const util::Timer& LoadBalancerImpl::get_timings() const { } +size_t LoadBalancerImpl::total_npts() const { + + return std::accumulate( local_tasks_.cbegin(), local_tasks_.cend(), 0ul, + []( const auto& a, const auto& b ) { + return a + b.points.size(); + }); + +} size_t LoadBalancerImpl::max_npts() const { if( not local_tasks_.size() ) return 0ul; diff --git a/src/load_balancer/load_balancer_impl.hpp b/src/load_balancer/load_balancer_impl.hpp index 566279a1..53c75865 100644 --- a/src/load_balancer/load_balancer_impl.hpp +++ b/src/load_balancer/load_balancer_impl.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -63,6 +67,7 @@ class LoadBalancerImpl { const util::Timer& get_timings() const; + size_t total_npts() const; size_t max_npts() const; size_t max_nbe() const; size_t max_npts_x_nbe() const; diff --git a/src/load_balancer/rebalance.cxx b/src/load_balancer/rebalance.cxx index 91898b24..3879f199 100644 --- a/src/load_balancer/rebalance.cxx +++ b/src/load_balancer/rebalance.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/molecular_weights/CMakeLists.txt b/src/molecular_weights/CMakeLists.txt index be0102c9..e9ce4e19 100644 --- a/src/molecular_weights/CMakeLists.txt +++ b/src/molecular_weights/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/molecular_weights/device/CMakeLists.txt b/src/molecular_weights/device/CMakeLists.txt index 89f2e279..15f5fe67 100644 --- a/src/molecular_weights/device/CMakeLists.txt +++ b/src/molecular_weights/device/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. 
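LoadBalancer::total_npts and its LoadBalancerImpl counterpart above sum the point counts of all local tasks with std::accumulate. The same idiom is shown in isolation below, with a stand-in Task type in place of GauXC::XCTask.

    // Self-contained illustration of the accumulate pattern used by total_npts above.
    #include <cstddef>
    #include <numeric>
    #include <vector>

    struct Task { std::vector<double> points; };   // stand-in for GauXC::XCTask (assumption)

    std::size_t total_npts( const std::vector<Task>& tasks ) {
      return std::accumulate( tasks.cbegin(), tasks.cend(), std::size_t(0),
        []( std::size_t acc, const Task& t ) { return acc + t.points.size(); } );
    }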
# # See LICENSE.txt for details # diff --git a/src/molecular_weights/device/device_molecular_weights.cxx b/src/molecular_weights/device/device_molecular_weights.cxx index 42d81d7b..c5bcce5c 100644 --- a/src/molecular_weights/device/device_molecular_weights.cxx +++ b/src/molecular_weights/device/device_molecular_weights.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -37,7 +41,7 @@ void DeviceMolecularWeights::modify_weights( LoadBalancer& lb ) const { auto task_comparator = []( const XCTask& a, const XCTask& b ) { return (a.points.size() * a.bfn_screening.nbe) > (b.points.size() * b.bfn_screening.nbe); }; - std::sort(task_begin, task_end, task_comparator ); + std::stable_sort(task_begin, task_end, task_comparator ); const auto& mol = lb.molecule(); const auto natoms = mol.natoms(); @@ -79,6 +83,7 @@ void DeviceMolecularWeights::modify_weights( LoadBalancer& lb ) const { rt.device_backend()->master_queue_synchronize(); lb.state().modified_weights_are_stored = true; + lb.state().weight_alg = this->settings_.weight_alg; } diff --git a/src/molecular_weights/device/device_molecular_weights.hpp b/src/molecular_weights/device/device_molecular_weights.hpp index 69c5da11..d4cd202d 100644 --- a/src/molecular_weights/device/device_molecular_weights.hpp +++ b/src/molecular_weights/device/device_molecular_weights.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/molecular_weights/host/CMakeLists.txt b/src/molecular_weights/host/CMakeLists.txt index d4399b51..889f10e5 100644 --- a/src/molecular_weights/host/CMakeLists.txt +++ b/src/molecular_weights/host/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/molecular_weights/host/host_molecular_weights.cxx b/src/molecular_weights/host/host_molecular_weights.cxx index efe075fb..e722d22b 100644 --- a/src/molecular_weights/host/host_molecular_weights.cxx +++ b/src/molecular_weights/host/host_molecular_weights.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -25,7 +29,7 @@ void HostMolecularWeights::modify_weights( LoadBalancer& lb ) const { auto task_comparator = []( const XCTask& a, const XCTask& b ) { return (a.points.size() * a.bfn_screening.nbe) > (b.points.size() * b.bfn_screening.nbe); }; - std::sort( tasks.begin(), tasks.end(), task_comparator ); + std::stable_sort( tasks.begin(), tasks.end(), task_comparator ); // Modify the weights const auto& mol = lb.molecule(); @@ -34,6 +38,7 @@ void HostMolecularWeights::modify_weights( LoadBalancer& lb ) const { tasks.begin(), tasks.end() ); lb.state().modified_weights_are_stored = true; + lb.state().weight_alg = this->settings_.weight_alg; } } diff --git a/src/molecular_weights/host/host_molecular_weights.hpp b/src/molecular_weights/host/host_molecular_weights.hpp index 4ce87b0d..2a037951 100644 --- a/src/molecular_weights/host/host_molecular_weights.hpp +++ b/src/molecular_weights/host/host_molecular_weights.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/molecular_weights/molecular_weights.cxx b/src/molecular_weights/molecular_weights.cxx index e0c4a660..d65ccd90 100644 --- a/src/molecular_weights/molecular_weights.cxx +++ b/src/molecular_weights/molecular_weights.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/molecular_weights/molecular_weights_impl.hpp b/src/molecular_weights/molecular_weights_impl.hpp index ba838e34..7c4b5ed6 100644 --- a/src/molecular_weights/molecular_weights_impl.hpp +++ b/src/molecular_weights/molecular_weights_impl.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/molgrid.cxx b/src/molgrid.cxx index cc138b6b..f9ee6e51 100644 --- a/src/molgrid.cxx +++ b/src/molgrid.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
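Both weight drivers above switch from std::sort to std::stable_sort when ordering tasks by estimated cost, so tasks with identical cost keep their original relative order and the weight pass becomes reproducible across runs. A small demonstration using the same cost metric as the patch; the TaskCost type is illustrative.

    // Why stable_sort: ties in (npts * nbe) keep their input order, unlike with std::sort.
    #include <algorithm>
    #include <cstddef>
    #include <vector>

    struct TaskCost { std::size_t npts, nbe, id; };

    void order_by_cost( std::vector<TaskCost>& tasks ) {
      auto task_comparator = []( const TaskCost& a, const TaskCost& b ) {
        return (a.npts * a.nbe) > (b.npts * b.nbe);   // cost metric from the patch
      };
      // std::sort may permute equal-cost tasks arbitrarily between runs and implementations;
      // std::stable_sort preserves their original order, giving deterministic output.
      std::stable_sort( tasks.begin(), tasks.end(), task_comparator );
    }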
* * See LICENSE.txt for details */ diff --git a/src/molgrid_defaults.cxx b/src/molgrid_defaults.cxx index c1d0fc52..61b66830 100644 --- a/src/molgrid_defaults.cxx +++ b/src/molgrid_defaults.cxx @@ -1,13 +1,17 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ #include #include -#include +#include namespace GauXC { @@ -83,11 +87,19 @@ RadialScale default_mhl_radial_scaling_factor( AtomicNumber _Z ) { return RadialScale( default_atomic_radius(_Z) * fac ); } +RadialScale default_bk_radial_scaling_factor( AtomicNumber _Z ) { + auto Z = _Z.get(); + const double fac = (Z!=1) ? 0.5 : 1.0; + return RadialScale( default_atomic_radius(_Z) * fac ); +} + RadialScale default_radial_scaling_factor(RadialQuad rq, AtomicNumber Z) { if( rq == RadialQuad::MuraKnowles ) return default_mk_radial_scaling_factor(Z); - else if( rq == RadialQuad::TreutlerAldrichs ) + else if( rq == RadialQuad::TreutlerAhlrichs ) return default_ta_radial_scaling_factor(Z); + else if( rq == RadialQuad::Becke ) + return default_bk_radial_scaling_factor(Z); else // MHL return default_mhl_radial_scaling_factor(Z); } diff --git a/src/molgrid_impl.cxx b/src/molgrid_impl.cxx index 188f6473..c6939e3e 100644 --- a/src/molgrid_impl.cxx +++ b/src/molgrid_impl.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/molgrid_impl.hpp b/src/molgrid_impl.hpp index 5455f21a..e8c0590e 100644 --- a/src/molgrid_impl.hpp +++ b/src/molgrid_impl.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/molmeta.cxx b/src/molmeta.cxx index fc770a9d..3bad9987 100644 --- a/src/molmeta.cxx +++ b/src/molmeta.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/reduction_driver/CMakeLists.txt b/src/reduction_driver/CMakeLists.txt index c991ad18..07ca9494 100644 --- a/src/reduction_driver/CMakeLists.txt +++ b/src/reduction_driver/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. 
+# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/reduction_driver/device/CMakeLists.txt b/src/reduction_driver/device/CMakeLists.txt index 68f55c86..95d6575d 100644 --- a/src/reduction_driver/device/CMakeLists.txt +++ b/src/reduction_driver/device/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/reduction_driver/device/device_reduction_driver.cxx b/src/reduction_driver/device/device_reduction_driver.cxx index ea014857..2395e722 100644 --- a/src/reduction_driver/device/device_reduction_driver.cxx +++ b/src/reduction_driver/device/device_reduction_driver.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/reduction_driver/device/device_reduction_driver.hpp b/src/reduction_driver/device/device_reduction_driver.hpp index 5b63615e..b26dafe6 100644 --- a/src/reduction_driver/device/device_reduction_driver.hpp +++ b/src/reduction_driver/device/device_reduction_driver.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/reduction_driver/device/nccl_reduction_driver.cxx b/src/reduction_driver/device/nccl_reduction_driver.cxx index 20848f4a..7c314805 100644 --- a/src/reduction_driver/device/nccl_reduction_driver.cxx +++ b/src/reduction_driver/device/nccl_reduction_driver.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/reduction_driver/device/nccl_reduction_driver.hpp b/src/reduction_driver/device/nccl_reduction_driver.hpp index 9db4e40f..529c6c9a 100644 --- a/src/reduction_driver/device/nccl_reduction_driver.hpp +++ b/src/reduction_driver/device/nccl_reduction_driver.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/reduction_driver/host/CMakeLists.txt b/src/reduction_driver/host/CMakeLists.txt index 52729332..a3bc5fe8 100644 --- a/src/reduction_driver/host/CMakeLists.txt +++ b/src/reduction_driver/host/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/reduction_driver/host/basic_mpi_reduction_driver.cxx b/src/reduction_driver/host/basic_mpi_reduction_driver.cxx index a8de7975..904f7caf 100644 --- a/src/reduction_driver/host/basic_mpi_reduction_driver.cxx +++ b/src/reduction_driver/host/basic_mpi_reduction_driver.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/reduction_driver/host/basic_mpi_reduction_driver.hpp b/src/reduction_driver/host/basic_mpi_reduction_driver.hpp index 7c3b231f..8172edc8 100644 --- a/src/reduction_driver/host/basic_mpi_reduction_driver.hpp +++ b/src/reduction_driver/host/basic_mpi_reduction_driver.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/reduction_driver/host/host_reduction_driver.cxx b/src/reduction_driver/host/host_reduction_driver.cxx index f6accde8..fe288602 100644 --- a/src/reduction_driver/host/host_reduction_driver.cxx +++ b/src/reduction_driver/host/host_reduction_driver.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/reduction_driver/host/host_reduction_driver.hpp b/src/reduction_driver/host/host_reduction_driver.hpp index 382d62e2..fe661de6 100644 --- a/src/reduction_driver/host/host_reduction_driver.hpp +++ b/src/reduction_driver/host/host_reduction_driver.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/reduction_driver/reduction_driver.cxx b/src/reduction_driver/reduction_driver.cxx index bc09d6a2..26a57415 100644 --- a/src/reduction_driver/reduction_driver.cxx +++ b/src/reduction_driver/reduction_driver.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/reduction_driver/reduction_driver_factory.cxx b/src/reduction_driver/reduction_driver_factory.cxx index d11a492c..8b3d5f34 100644 --- a/src/reduction_driver/reduction_driver_factory.cxx +++ b/src/reduction_driver/reduction_driver_factory.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/reduction_driver/reduction_driver_impl.cxx b/src/reduction_driver/reduction_driver_impl.cxx index 4f4e24c6..96351265 100644 --- a/src/reduction_driver/reduction_driver_impl.cxx +++ b/src/reduction_driver/reduction_driver_impl.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/reduction_driver/reduction_driver_impl.hpp b/src/reduction_driver/reduction_driver_impl.hpp index d5f8bafb..e4980a9a 100644 --- a/src/reduction_driver/reduction_driver_impl.hpp +++ b/src/reduction_driver/reduction_driver_impl.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/runtime_environment/CMakeLists.txt b/src/runtime_environment/CMakeLists.txt index dd43e5c8..2ef10327 100644 --- a/src/runtime_environment/CMakeLists.txt +++ b/src/runtime_environment/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. 
# # See LICENSE.txt for details # diff --git a/src/runtime_environment/device/CMakeLists.txt b/src/runtime_environment/device/CMakeLists.txt index 360783db..151a5889 100644 --- a/src/runtime_environment/device/CMakeLists.txt +++ b/src/runtime_environment/device/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/runtime_environment/device/cuda/CMakeLists.txt b/src/runtime_environment/device/cuda/CMakeLists.txt index f397a2ca..50ea945c 100644 --- a/src/runtime_environment/device/cuda/CMakeLists.txt +++ b/src/runtime_environment/device/cuda/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/runtime_environment/device/cuda/cuda_backend.cxx b/src/runtime_environment/device/cuda/cuda_backend.cxx index 84bd457f..610f33f4 100644 --- a/src/runtime_environment/device/cuda/cuda_backend.cxx +++ b/src/runtime_environment/device/cuda/cuda_backend.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/runtime_environment/device/cuda/cuda_backend.hpp b/src/runtime_environment/device/cuda/cuda_backend.hpp index 6401803b..8e47a6a9 100644 --- a/src/runtime_environment/device/cuda/cuda_backend.hpp +++ b/src/runtime_environment/device/cuda/cuda_backend.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/runtime_environment/device/device_backend.hpp b/src/runtime_environment/device/device_backend.hpp index 5c10bf90..594b7988 100644 --- a/src/runtime_environment/device/device_backend.hpp +++ b/src/runtime_environment/device/device_backend.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/runtime_environment/device/device_blas_handle.hpp b/src/runtime_environment/device/device_blas_handle.hpp index e0faaea7..76368f34 100644 --- a/src/runtime_environment/device/device_blas_handle.hpp +++ b/src/runtime_environment/device/device_blas_handle.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/runtime_environment/device/device_queue.hpp b/src/runtime_environment/device/device_queue.hpp index 9fdbacff..51eba1c5 100644 --- a/src/runtime_environment/device/device_queue.hpp +++ b/src/runtime_environment/device/device_queue.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/runtime_environment/device/device_runtime_environment.cxx b/src/runtime_environment/device/device_runtime_environment.cxx index 41d841b8..88998bd9 100644 --- a/src/runtime_environment/device/device_runtime_environment.cxx +++ b/src/runtime_environment/device/device_runtime_environment.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -67,5 +71,11 @@ DeviceBackend* DeviceRuntimeEnvironment::device_backend() const { bool DeviceRuntimeEnvironment::owns_memory() const { return device_runtime_pimpl_cast(pimpl_.get())->owns_memory(); } +void DeviceRuntimeEnvironment::release_buffer() { + device_runtime_pimpl_cast(pimpl_.get())->release_buffer(); +} +void DeviceRuntimeEnvironment::set_buffer(void* p, size_t sz) { + device_runtime_pimpl_cast(pimpl_.get())->set_buffer(p, sz); +} } diff --git a/src/runtime_environment/device/device_runtime_environment_impl.hpp b/src/runtime_environment/device/device_runtime_environment_impl.hpp index 6489e11c..9831c5c2 100644 --- a/src/runtime_environment/device/device_runtime_environment_impl.hpp +++ b/src/runtime_environment/device/device_runtime_environment_impl.hpp @@ -1,13 +1,18 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ #pragma once #include "../runtime_environment_impl.hpp" #include "device_backend.hpp" +#include namespace GauXC::detail { @@ -63,6 +68,23 @@ class DeviceRuntimeEnvironmentImpl : public RuntimeEnvironmentImpl { inline size_t device_memory_size() const { return device_memory_size_; } inline bool owns_memory() const { return i_own_this_memory_; } + inline void release_buffer() { + if(i_own_this_memory_ and device_memory_ and device_memory_size_) { + device_backend_->free_device_buffer(device_memory_); + } else { + GAUXC_GENERIC_EXCEPTION("GauXC Cannot Release A Buffer It Does Not Own"); + } + } + + inline void set_buffer(void* p, size_t sz) { + if(owns_memory()) { + release_buffer(); + i_own_this_memory_ = false; + } + + device_memory_ = p; + device_memory_size_ = sz; + } }; diff --git a/src/runtime_environment/device/hip/CMakeLists.txt b/src/runtime_environment/device/hip/CMakeLists.txt index df97901b..5fd50fc9 100644 --- a/src/runtime_environment/device/hip/CMakeLists.txt +++ b/src/runtime_environment/device/hip/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/runtime_environment/device/hip/hip_backend.cxx b/src/runtime_environment/device/hip/hip_backend.cxx index 48c6732a..69c3fd28 100644 --- a/src/runtime_environment/device/hip/hip_backend.cxx +++ b/src/runtime_environment/device/hip/hip_backend.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/runtime_environment/device/hip/hip_backend.hpp b/src/runtime_environment/device/hip/hip_backend.hpp index 38919bec..6b90063f 100644 --- a/src/runtime_environment/device/hip/hip_backend.hpp +++ b/src/runtime_environment/device/hip/hip_backend.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/runtime_environment/device_specific/cublas_util.hpp b/src/runtime_environment/device_specific/cublas_util.hpp index 1955740d..10fa35be 100644 --- a/src/runtime_environment/device_specific/cublas_util.hpp +++ b/src/runtime_environment/device_specific/cublas_util.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/runtime_environment/device_specific/cuda_device_constants.hpp b/src/runtime_environment/device_specific/cuda_device_constants.hpp index 53c86ac5..3b4ac8e4 100644 --- a/src/runtime_environment/device_specific/cuda_device_constants.hpp +++ b/src/runtime_environment/device_specific/cuda_device_constants.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/runtime_environment/device_specific/cuda_util.hpp b/src/runtime_environment/device_specific/cuda_util.hpp index d5c632ef..7d133e5d 100644 --- a/src/runtime_environment/device_specific/cuda_util.hpp +++ b/src/runtime_environment/device_specific/cuda_util.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/runtime_environment/device_specific/hip_device_constants.hpp b/src/runtime_environment/device_specific/hip_device_constants.hpp index 3a79fdf3..38ff3878 100644 --- a/src/runtime_environment/device_specific/hip_device_constants.hpp +++ b/src/runtime_environment/device_specific/hip_device_constants.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/runtime_environment/device_specific/hip_util.hpp b/src/runtime_environment/device_specific/hip_util.hpp index 797c5b8f..61ea9e0c 100644 --- a/src/runtime_environment/device_specific/hip_util.hpp +++ b/src/runtime_environment/device_specific/hip_util.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/runtime_environment/device_specific/hipblas_util.hpp b/src/runtime_environment/device_specific/hipblas_util.hpp index 38351ee1..d9b324df 100644 --- a/src/runtime_environment/device_specific/hipblas_util.hpp +++ b/src/runtime_environment/device_specific/hipblas_util.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). 
+ * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/runtime_environment/device_specific/magma_util.hpp b/src/runtime_environment/device_specific/magma_util.hpp index 7cf79369..ca4f4f17 100644 --- a/src/runtime_environment/device_specific/magma_util.hpp +++ b/src/runtime_environment/device_specific/magma_util.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/runtime_environment/device_specific/nccl_util.hpp b/src/runtime_environment/device_specific/nccl_util.hpp index 78938a0b..f4b87839 100644 --- a/src/runtime_environment/device_specific/nccl_util.hpp +++ b/src/runtime_environment/device_specific/nccl_util.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/runtime_environment/runtime_environment.cxx b/src/runtime_environment/runtime_environment.cxx index 83d9e365..21d3910e 100644 --- a/src/runtime_environment/runtime_environment.cxx +++ b/src/runtime_environment/runtime_environment.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/runtime_environment/runtime_environment_impl.hpp b/src/runtime_environment/runtime_environment_impl.hpp index a68129f8..6afa888f 100644 --- a/src/runtime_environment/runtime_environment_impl.hpp +++ b/src/runtime_environment/runtime_environment_impl.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/CMakeLists.txt b/src/xc_integrator/CMakeLists.txt index c8adc6ad..9bc36d25 100644 --- a/src/xc_integrator/CMakeLists.txt +++ b/src/xc_integrator/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. 
# # See LICENSE.txt for details # diff --git a/src/xc_integrator/integrator_util/CMakeLists.txt b/src/xc_integrator/integrator_util/CMakeLists.txt index 92b91a52..0a0edbe8 100644 --- a/src/xc_integrator/integrator_util/CMakeLists.txt +++ b/src/xc_integrator/integrator_util/CMakeLists.txt @@ -1,8 +1,12 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # -target_sources( gauxc PRIVATE integrator_common.cxx integral_bounds.cxx exx_screening.cxx ) +target_sources( gauxc PRIVATE integrator_common.cxx integral_bounds.cxx exx_screening.cxx spherical_harmonics.cxx ) diff --git a/src/xc_integrator/integrator_util/exx_screening.cxx b/src/xc_integrator/integrator_util/exx_screening.cxx index ccd716eb..5c7efcd1 100644 --- a/src/xc_integrator/integrator_util/exx_screening.cxx +++ b/src/xc_integrator/integrator_util/exx_screening.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/integrator_util/exx_screening.hpp b/src/xc_integrator/integrator_util/exx_screening.hpp index 176301ca..5c55c3a4 100644 --- a/src/xc_integrator/integrator_util/exx_screening.hpp +++ b/src/xc_integrator/integrator_util/exx_screening.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/integrator_util/integral_bounds.cxx b/src/xc_integrator/integrator_util/integral_bounds.cxx index e78adb05..680c3538 100644 --- a/src/xc_integrator/integrator_util/integral_bounds.cxx +++ b/src/xc_integrator/integrator_util/integral_bounds.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/integrator_util/integral_bounds.hpp b/src/xc_integrator/integrator_util/integral_bounds.hpp index 1f1e055a..02c6cae5 100644 --- a/src/xc_integrator/integrator_util/integral_bounds.hpp +++ b/src/xc_integrator/integrator_util/integral_bounds.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). 
+ * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/integrator_util/integrator_common.cxx b/src/xc_integrator/integrator_util/integrator_common.cxx index b2f2a359..e919d917 100644 --- a/src/xc_integrator/integrator_util/integrator_common.cxx +++ b/src/xc_integrator/integrator_util/integrator_common.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/integrator_util/integrator_common.hpp b/src/xc_integrator/integrator_util/integrator_common.hpp index 67e4bab9..079e3d60 100644 --- a/src/xc_integrator/integrator_util/integrator_common.hpp +++ b/src/xc_integrator/integrator_util/integrator_common.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */
diff --git a/src/xc_integrator/integrator_util/spherical_harmonics.cxx b/src/xc_integrator/integrator_util/spherical_harmonics.cxx
new file mode 100644
index 00000000..be7cf84a
--- /dev/null
+++ b/src/xc_integrator/integrator_util/spherical_harmonics.cxx
@@ -0,0 +1,166 @@
+#include "spherical_harmonics.hpp"
+// Computes the normalization constants N(l,m) for spherical harmonics up to degree lmax
+// N(l,m) = sqrt((2l + 1) / (4π) * ( (l - m)! / (l + m)! ) )
+// for m = 0, N(l,0) = sqrt((2l + 1) / (4π))
+// for m > 0, N(l,m) = -N(l,m-1) / sqrt((l - m + 1) * (l + m))
+std::vector<double> sph_nlm(const int lmax) {
+  std::vector<double> nlm((lmax + 1) * (lmax + 1), 0.0);
+  for (int l = 0; l <= lmax; ++l) {
+    // For m = 0
+    int ind = l*l+l;
+    double tmp = std::sqrt( 4.0 * M_PI / (2 * l + 1) );
+    nlm[ind] = 1 / tmp;
+    // For m != 0
+    tmp = nlm[ind] * std::sqrt(2.0);
+    for (int m = 1; m <= l; ++m) {
+      tmp = -tmp / std::sqrt(static_cast<double>((l - m + 1) * (l + m)));
+      nlm[ind + m ] = tmp;
+    }
+  }
+  return nlm;
+}
+
+// Computes associated Legendre polynomials P_l^m(cos(theta)) up to degree lmax
+// Input:
+//  - cos_theta: cos(theta), where -1 <= cos_theta <= 1
+//  - sin_theta: sin(theta), where 0 <= sin_theta <= 1
+//  - lmax: maximum degree of the polynomials to compute, lmax >= 0
+// Output:
+//  - Returns a vector with values of associated Legendre polynomials, flattened to 1D with size (lmax+1)*(lmax+1)
+std::vector<double> sph_plm (const double cos_theta, const double sin_theta, const int lmax) {
+  std::vector<double> plms((lmax + 1) * (lmax + 1), 0.0);
+
+  // Base cases
+  plms[0] = 1.0;        // P_0^0 = 1
+  if (lmax == 0) return plms;
+
+  plms[2] = cos_theta;  // P_1^0 = cos(theta)
+  plms[3] = -sin_theta; // P_1^1 = -sin(theta)
+  if (lmax == 1) return plms;
+
+  double cos_theta2 = cos_theta * cos_theta;
+  plms[6] = 1.5 * cos_theta2 - 0.5;      // P_2^0 (cos(theta)) = 1.5 * cos^2(theta) - 0.5, idx = 2*2 + 2 + 0 = 6
+  plms[7] = -3 * sin_theta * cos_theta;  // P_2^1 (cos(theta)) = -3 * sin(theta) * cos(theta)
+  plms[8] = 3 * sin_theta * sin_theta;   // P_2^2 (cos(theta)) = 3 * sin^2(theta)
+  if (lmax == 2) return plms;
+
+  plms[12] = 2.5 * cos_theta2 * cos_theta - 1.5 * cos_theta;   // P_3^0 (cos(theta)) = 2.5 * cos^3(theta) - 1.5 * cos(theta)
+  plms[13] = -7.5 * cos_theta2 * sin_theta + 1.5 * sin_theta ; // P_3^1 (cos(theta)) = -7.5 * cos^2(theta) * sin(theta) + 1.5 * sin(theta)
+  plms[14] = -5.0 * sin_theta * plms[7];                       // P_3^2 (cos(theta)) = -5.0 * sin(theta) * P_2^1 (cos(theta))
+  plms[15] = -5.0 * sin_theta * plms[8];                       // P_3^3 (cos(theta)) = -5.0 * sin(theta) * P_2^2 (cos(theta))
+  if (lmax == 3) return plms;
+  // Recurrence calculation for larger l
+  for (int l = 4; l <= lmax; ++l) {
+    double work = (2.0 * l - 1) * cos_theta;
+    for (int m = 0; m < l; ++m) {
+      int ind = l * l + l + m;
+      int pl1m_ind = (l - 1) * (l - 1) + l - 1 + m;
+      int pl2m_ind = (l - 2) * (l - 2) + l - 2 + m;
+      plms[ind] = (work * plms[pl1m_ind] - (l + m - 1) * plms[pl2m_ind]) / (l - m);
+    }
+    // Special case for m = l, P_l^l = -sin_theta * (2*l - 1) * P_{l-1}^{l-1}
+    plms[(l+1)*(l+1) - 1] = -sin_theta * (2 * (l - 1) + 1) * plms[l*l-1];
+  }
+  return plms;
+}
+
+// Computes spherical harmonics Y_l^m(theta, phi) = N(l,m) P_l^m(cos(theta)) e^(i*m*phi)
+// up to degree lmax at point x, with scaling factors nlm
+//  - Writes (lmax+1)*(lmax+1) values into ylms
+void sph_legendre(const int lmax, const std::array<double, 3> x, const std::vector<double>& nlm, double* ylms) {
+  assert(x.size() == 3);
+  double rho = sqrt(x[0] * x[0] + x[1] * x[1] + x[2] * x[2]);
+  if (rho == 0.0) {
+    return;
+  }
+  double sin_theta = sqrt(x[0] * x[0] + x[1] * x[1]) / rho; // sin(theta) = r_xy / rho
+  if (sin_theta != 0.0) {
+    double cos_theta = x[2] / rho;
+    std::vector<double> plm = sph_plm(cos_theta, sin_theta, lmax);
+    for (int l = 0; l <= lmax; l++) {
+      int ind = l * l + l;
+      ylms[ind] = plm[ind] * nlm[ind]; // m = 0 (cos(0*phi) = 1)
+      for (int m = 1; m <= l; ++m) {
+        ylms[ind + m] = plm[ind + m] * nlm[ind + m];
+        ylms[ind - m] = ylms[ind + m];
+      }
+    }
+  } else {
+    // x = 0, y = 0, z != 0
+    double cos_theta = (x[2] > 0.0) ? 1.0 : -1.0;
+    for (int l = 0; l <= lmax; l ++) {
+      int ind = l * l + l;
+      ylms[ind] = nlm[ind];
+      if (l % 2 != 0) {
+        ylms[ind] *= cos_theta;
+      }
+    }
+  }
+}
+
+// compute scaled spherical harmonics, with precomputed normalization factors
+//    4π     |x - a|^l
+//  ------  ----------- Y_l^m(x - a)
+//  2l + 1      r^l
+void scaled_ylm_new(const int lmax, const std::array<double, 3> x, const std::array<double, 3> a, const double r, const std::vector<double>& nlm, double* ylm) {
+  std::array<double, 3> delta = {x[0] - a[0], x[1] - a[1], x[2] - a[2]};
+  double dnorm = sqrt(delta[0]*delta[0] + delta[1]*delta[1] + delta[2]*delta[2]);
+  assert(dnorm != 0.0);
+  std::array<double, 3> delta_norm = {delta[0] / dnorm, delta[1] / dnorm, delta[2] / dnorm};
+  double phi = atan2(delta_norm[1], delta_norm[0]);
+  sph_legendre(lmax, delta_norm, nlm, ylm);
+  for (int l = 0; l <= lmax; l++) {
+    double ratio = pow(dnorm / r, l) * 4.0 * M_PI / (2 * l + 1);
+    for (int m = -l; m <= l; m++) {
+      int ind = l * l + l + m;
+      if (m == 0) {
+        ylm[ind] *= ratio;
+      } else if (m < 0) {
+        ylm[ind] *= - ratio * sin(m * phi);
+      } else {
+        ylm[ind] *= ratio * cos(m * phi);
+      }
+    }
+  }
+}
+
+// compute scaled spherical harmonics, with standard library functions
+std::vector<double> scaled_ylm_std(int lmax, std::array<double, 3> x, std::array<double, 3> a, double r) {
+
+  std::vector<double> delta = {x[0] - a[0], x[1] - a[1], x[2] - a[2]};
+  double dnorm = sqrt(delta[0]*delta[0] + delta[1]*delta[1] + delta[2]*delta[2]);
+  assert(dnorm != 0.0);
+  std::vector<double> delta_norm = {delta[0] / dnorm, delta[1] / dnorm, delta[2] / dnorm};
+
+  double rho = sqrt(delta_norm[0] * delta_norm[0] + delta_norm[1] * delta_norm[1] + delta_norm[2] * delta_norm[2]);
+  double theta = acos(delta_norm[2] / rho);
+  double phi = atan2(delta_norm[1], delta_norm[0]);
+
+  std::vector<double> ylm((lmax + 1) * (lmax + 1), 0.0);
+  for (int l = 0; l <= lmax; l++) {
+    double ratio = pow(dnorm / r, l) * 4.0 * M_PI / (2 * l + 1);
+    for (int m = 0; m <= l; m++) {
+      double sph = std::sph_legendre(l, m, theta) * ratio;
+      if (m == 0) {
+        ylm[l * l + l] = sph;
+      } else {
+        if (m % 2 != 0) {
+          sph *= -1;
+        }
+        sph *= sqrt(2.0);
+        ylm[l * l + l - m ] = sph * sin(m * phi);
+        ylm[l * l + l + m ] = sph * cos(m * phi);
+      }
+    }
+  }
+  return ylm;
+}
+
+void scaled_ylm_matrix(const int lmax, const double* points, const int32_t npts, const std::array<double, 3> center, const double radius, double* ylm_matrix) {
+  int nharmonics = (lmax + 1) * (lmax + 1);
+  auto nlm = sph_nlm(lmax);
+  for (int i = 0; i < npts; ++i) {
+    const std::array<double, 3> x = {points[3 * i], points[3 * i + 1], points[3 * i + 2]};
+    scaled_ylm_new(lmax, x, center, radius, nlm, ylm_matrix + i * nharmonics);
+  }
+}
\ No newline at end of file
diff --git a/src/xc_integrator/integrator_util/spherical_harmonics.hpp b/src/xc_integrator/integrator_util/spherical_harmonics.hpp
new file mode 100644
index 00000000..7ce495d8
--- /dev/null
+++ b/src/xc_integrator/integrator_util/spherical_harmonics.hpp
@@ -0,0 +1,7 @@
+#include <vector>
+#include <array>
+#include <cmath>
+#include <cstdint>
+
+
+void scaled_ylm_matrix(const int lmax, const double* points, const int32_t npts, const std::array<double, 3> center, const double radius, double* ylm_matrix);
\ No newline at end of file
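(Editor's sketch, not part of the patch.) The following minimal C++ program shows one way the new scaled_ylm_matrix entry point could be exercised in isolation; the grid points, center, radius, and lmax are illustrative values, and the program is assumed to be compiled and linked against the spherical_harmonics.cxx translation unit added above.

#include <array>
#include <cstdint>
#include <cstdio>
#include <vector>

// Declaration taken from the new spherical_harmonics.hpp header in this patch.
void scaled_ylm_matrix(const int lmax, const double* points, const int32_t npts,
                       const std::array<double, 3> center, const double radius,
                       double* ylm_matrix);

int main() {
  const int lmax = 2;                                   // expansion degree (illustrative)
  const int nharmonics = (lmax + 1) * (lmax + 1);       // (lmax+1)^2 real harmonics per point
  const std::array<double, 3> center = {0.0, 0.0, 0.0}; // expansion center a
  const double radius = 1.0;                            // scaling radius r in (|x - a| / r)^l

  // Three sample grid points; none may coincide with the center (asserted in scaled_ylm_new).
  std::vector<double> points = {
    0.5,  0.0, 0.0,
    0.0,  0.5, 0.5,
    0.3, -0.2, 0.7
  };
  const int32_t npts = static_cast<int32_t>(points.size() / 3);

  // One row of (lmax+1)^2 scaled harmonics per grid point.
  std::vector<double> ylm(npts * nharmonics, 0.0);
  scaled_ylm_matrix(lmax, points.data(), npts, center, radius, ylm.data());

  // Print each row, ordered (l,m) = (0,0), (1,-1), (1,0), (1,1), (2,-2), ...
  for (int32_t i = 0; i < npts; ++i) {
    std::printf("point %d:", i);
    for (int lm = 0; lm < nharmonics; ++lm)
      std::printf(" % .6f", ylm[i * nharmonics + lm]);
    std::printf("\n");
  }
  return 0;
}

The same rows could be cross-checked against the scaled_ylm_std reference path in the new file, which evaluates the identical quantity through std::sph_legendre.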
} + } else { + // x = 0, y = 0, z != 0 + double cos_theta = (x[2] > 0.0) ? 1.0 : -1.0; + for (int l = 0; l <= lmax; l ++) { + int ind = l * l + l; + ylms[ind] = nlm[ind]; + if (l % 2 != 0) { + ylms[ind] *= cos_theta; + } + } + } +} + +// compute scaled spherical harmonics, with precomputed normalization factors +// 4π |x - a|^l +// ------ ----------- Y_l^m(|x - a|) +// 2l + 1 r^l +void scaled_ylm_new(const int lmax, const std::array x, const std::array a, const double r, const std::vector& nlm, double* ylm) { + std::array delta = {x[0] - a[0], x[1] - a[1], x[2] - a[2]}; + double dnorm = sqrt(delta[0]*delta[0] + delta[1]*delta[1] + delta[2]*delta[2]); + assert(dnorm != 0.0); + std::array delta_norm = {delta[0] / dnorm, delta[1] / dnorm, delta[2] / dnorm}; + double phi = atan2(delta_norm[1], delta_norm[0]); + sph_legendre(lmax, delta_norm, nlm, ylm); + for (int l = 0; l <= lmax; l++) { + double ratio = pow(dnorm / r, l) * 4.0 * M_PI / (2 * l + 1); + for (int m = -l; m <= l; m++) { + int ind = l * l + l + m; + if (m == 0) { + ylm[ind] *= ratio; + } else if (m < 0) { + ylm[ind] *= - ratio * sin(m * phi); + } else { + ylm[ind] *= ratio * cos(m * phi); + } + } + } +} + +// compute scaled spherical harmonics, with standard library functions +std::vector scaled_ylm_std(int lmax, std::array x, std::array a, double r) { + + std::vector delta = {x[0] - a[0], x[1] - a[1], x[2] - a[2]}; + double dnorm = sqrt(delta[0]*delta[0] + delta[1]*delta[1] + delta[2]*delta[2]); + assert(dnorm != 0.0); + std::vector delta_norm = {delta[0] / dnorm, delta[1] / dnorm, delta[2] / dnorm}; + + double rho = sqrt(delta_norm[0] * delta_norm[0] + delta_norm[1] * delta_norm[1] + delta_norm[2] * delta_norm[2]); + double theta = acos(delta_norm[2] / rho); + double phi = atan2(delta_norm[1], delta_norm[0]); + + std::vector ylm((lmax + 1) * (lmax + 1), 0.0); + for (int l = 0; l <= lmax; l++) { + double ratio = pow(dnorm / r, l) * 4.0 * M_PI / (2 * l + 1); + for (int m = 0; m <= l; m++) { + double sph = std::sph_legendre(l, m, theta) * ratio; + if (m == 0) { + ylm[l * l + l] = sph; + } else { + if (m % 2 != 0) { + sph *= -1; + } + sph *= sqrt(2.0); + ylm[l * l + l - m ] = sph * sin(m * phi); + ylm[l * l + l + m ] = sph * cos(m * phi); + } + } + } + return ylm; +} + +void scaled_ylm_matrix(const int lmax, const double* points, const int32_t npts, const std::array center, const double radius, double* ylm_matrix) { + int nharmonics = (lmax + 1) * (lmax + 1); + auto nlm = sph_nlm(lmax); + for (int i = 0; i < npts; ++i) { + const std::array x = {points[3 * i], points[3 * i + 1], points[3 * i + 2]}; + scaled_ylm_new(lmax, x, center, radius, nlm, ylm_matrix + i * nharmonics); + } +} \ No newline at end of file diff --git a/src/xc_integrator/integrator_util/spherical_harmonics.hpp b/src/xc_integrator/integrator_util/spherical_harmonics.hpp new file mode 100644 index 00000000..7ce495d8 --- /dev/null +++ b/src/xc_integrator/integrator_util/spherical_harmonics.hpp @@ -0,0 +1,7 @@ +#include +#include +#include +#include + + +void scaled_ylm_matrix(const int lmax, const double* points, const int32_t npts, const std::array center, const double radius, double* ylm_matrix); \ No newline at end of file diff --git a/src/xc_integrator/local_work_driver/CMakeLists.txt b/src/xc_integrator/local_work_driver/CMakeLists.txt index 4705cfab..e0e5385f 100644 --- a/src/xc_integrator/local_work_driver/CMakeLists.txt +++ b/src/xc_integrator/local_work_driver/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of 
California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/xc_integrator/local_work_driver/common/integrator_constants.hpp b/src/xc_integrator/local_work_driver/common/integrator_constants.hpp index c86e2438..6229db7d 100644 --- a/src/xc_integrator/local_work_driver/common/integrator_constants.hpp +++ b/src/xc_integrator/local_work_driver/common/integrator_constants.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -13,7 +17,7 @@ namespace integrator { template constexpr F magic_ssf_factor = 0.64; -constexpr double ssf_weight_tol = 1e-10; +constexpr double ssf_weight_tol = 1e-13; } } diff --git a/src/xc_integrator/local_work_driver/device/CMakeLists.txt b/src/xc_integrator/local_work_driver/device/CMakeLists.txt index 3c0fd8b6..0911dff2 100644 --- a/src/xc_integrator/local_work_driver/device/CMakeLists.txt +++ b/src/xc_integrator/local_work_driver/device/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/xc_integrator/local_work_driver/device/common/collocation_device.hpp b/src/xc_integrator/local_work_driver/device/common/collocation_device.hpp index 92144ad0..ef705d0c 100644 --- a/src/xc_integrator/local_work_driver/device/common/collocation_device.hpp +++ b/src/xc_integrator/local_work_driver/device/common/collocation_device.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -95,4 +99,10 @@ void eval_collocation_shell_to_task_laplacian( XCDeviceTask* device_tasks, device_queue queue ); +void eval_collocation_shell_to_task_lapgrad( + uint32_t max_l, + AngularMomentumShellToTaskBatch* l_batched_shell_to_task, + XCDeviceTask* device_tasks, + device_queue queue ); + } // namespace GauXC diff --git a/src/xc_integrator/local_work_driver/device/common/device_blas.hpp b/src/xc_integrator/local_work_driver/device/common/device_blas.hpp index e590bb90..dc1f0d8f 100644 --- a/src/xc_integrator/local_work_driver/device/common/device_blas.hpp +++ b/src/xc_integrator/local_work_driver/device/common/device_blas.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -20,6 +24,10 @@ enum class DeviceBlasUplo : unsigned char { Lower }; +template +void increment( device_blas_handle generic_handle, const T* X, T* Y, int N ); + + template void dot( device_blas_handle handle, int N, diff --git a/src/xc_integrator/local_work_driver/device/common/exx_ek_screening.hpp b/src/xc_integrator/local_work_driver/device/common/exx_ek_screening.hpp index 05e925b6..cb069a70 100644 --- a/src/xc_integrator/local_work_driver/device/common/exx_ek_screening.hpp +++ b/src/xc_integrator/local_work_driver/device/common/exx_ek_screening.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/common/inc_potential.hpp b/src/xc_integrator/local_work_driver/device/common/inc_potential.hpp index e278e23c..c7c84b3a 100644 --- a/src/xc_integrator/local_work_driver/device/common/inc_potential.hpp +++ b/src/xc_integrator/local_work_driver/device/common/inc_potential.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/common/increment_exc_grad.hpp b/src/xc_integrator/local_work_driver/device/common/increment_exc_grad.hpp index be924bb8..7f78ebc8 100644 --- a/src/xc_integrator/local_work_driver/device/common/increment_exc_grad.hpp +++ b/src/xc_integrator/local_work_driver/device/common/increment_exc_grad.hpp @@ -1,22 +1,29 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). 
+ * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ #pragma once #include #include "device/xc_device_task.hpp" +#include "device/xc_device_data.hpp" #include "device/device_queue.hpp" #include "shell_to_task.hpp" namespace GauXC { -void increment_exc_grad_lda( size_t nshell, ShellToTaskDevice* shell_to_task, - XCDeviceTask* device_tasks, double* EXC_GRAD, device_queue ); -void increment_exc_grad_gga( size_t nshell, ShellToTaskDevice* shell_to_task, - XCDeviceTask* device_tasks, double* EXC_GRAD, device_queue ); +void increment_exc_grad_lda( integrator_ks_scheme ks_scheme, size_t nshell, ShellToTaskDevice* shell_to_task, + XCDeviceTask* device_tasks, double* EXC_GRAD, bool with_weight_derivatives, device_queue ); +void increment_exc_grad_gga( integrator_ks_scheme ks_scheme, size_t nshell, ShellToTaskDevice* shell_to_task, + XCDeviceTask* device_tasks, double* EXC_GRAD, bool with_weight_derivatives, device_queue ); +void increment_exc_grad_mgga( integrator_ks_scheme ks_scheme, size_t nshell, bool need_lapl, ShellToTaskDevice* shell_to_task, + XCDeviceTask* device_tasks, double* EXC_GRAD, bool with_weight_derivatives, device_queue ); } diff --git a/src/xc_integrator/local_work_driver/device/common/pack_submat.hpp b/src/xc_integrator/local_work_driver/device/common/pack_submat.hpp index 9a978f55..8664c4b2 100644 --- a/src/xc_integrator/local_work_driver/device/common/pack_submat.hpp +++ b/src/xc_integrator/local_work_driver/device/common/pack_submat.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/common/shell_pair_to_task.hpp b/src/xc_integrator/local_work_driver/device/common/shell_pair_to_task.hpp index 8902da75..28517b0b 100644 --- a/src/xc_integrator/local_work_driver/device/common/shell_pair_to_task.hpp +++ b/src/xc_integrator/local_work_driver/device/common/shell_pair_to_task.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/common/shell_to_task.hpp b/src/xc_integrator/local_work_driver/device/common/shell_to_task.hpp index 63020fee..38a0fb03 100644 --- a/src/xc_integrator/local_work_driver/device/common/shell_to_task.hpp +++ b/src/xc_integrator/local_work_driver/device/common/shell_to_task.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/common/symmetrize_mat.hpp b/src/xc_integrator/local_work_driver/device/common/symmetrize_mat.hpp index 1e06e5fc..c26059f3 100644 --- a/src/xc_integrator/local_work_driver/device/common/symmetrize_mat.hpp +++ b/src/xc_integrator/local_work_driver/device/common/symmetrize_mat.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/common/uvvars.hpp b/src/xc_integrator/local_work_driver/device/common/uvvars.hpp index 78ce5324..25057228 100644 --- a/src/xc_integrator/local_work_driver/device/common/uvvars.hpp +++ b/src/xc_integrator/local_work_driver/device/common/uvvars.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -15,15 +19,34 @@ namespace GauXC { void eval_uvars_lda( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme, XCDeviceTask* device_tasks, device_queue queue ); - void eval_uvars_gga( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme, XCDeviceTask* device_tasks, device_queue queue ); +void eval_uvars_mgga( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme, + bool need_lapl, XCDeviceTask* device_tasks, device_queue queue ); + + +void eval_vvars_lda( size_t ntasks, int32_t nbf_max, int32_t npts_max, density_id den_select, + XCDeviceTask* device_tasks, device_queue queue ); +void eval_vvars_gga( size_t ntasks, int32_t nbf_max, int32_t npts_max, density_id den_select, + XCDeviceTask* device_tasks, device_queue queue ); +void eval_vvars_mgga( size_t ntasks, int32_t nbf_max, int32_t npts_max, density_id den_select, + bool need_lapl, XCDeviceTask* device_tasks, device_queue queue ); + + + +void eval_tmat_lda( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme, + XCDeviceTask* device_tasks, device_queue queue ); +void eval_tmat_gga( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme, + XCDeviceTask* device_tasks, device_queue queue ); +void eval_tmat_mgga( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme, + bool need_lapl, XCDeviceTask* device_tasks, device_queue queue ); -void eval_uvars_mgga( size_t ntasks, size_t npts_total, int32_t nbf_max, - int32_t npts_max, bool do_lapl, XCDeviceTask* device_tasks, - device_queue queue ); -void eval_vvar( size_t ntasks, int32_t nbf_max, int32_t npts_max, bool do_grad, density_id den_select, +void eval_vvars_lda_trial( size_t ntasks, int32_t nbf_max, int32_t npts_max, density_id den_select, + XCDeviceTask* device_tasks, device_queue queue ); +void eval_vvars_gga_trial( size_t ntasks, int32_t nbf_max, int32_t npts_max, density_id den_select, XCDeviceTask* device_tasks, device_queue queue ); +void eval_vvars_mgga_trial( size_t ntasks, int32_t nbf_max, int32_t npts_max, density_id den_select, + bool 
need_lapl, XCDeviceTask* device_tasks, device_queue queue ); } diff --git a/src/xc_integrator/local_work_driver/device/common/xc_functional_eval_wrapper.hpp b/src/xc_integrator/local_work_driver/device/common/xc_functional_eval_wrapper.hpp index 99f1c99a..18b189f3 100644 --- a/src/xc_integrator/local_work_driver/device/common/xc_functional_eval_wrapper.hpp +++ b/src/xc_integrator/local_work_driver/device/common/xc_functional_eval_wrapper.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -21,4 +25,16 @@ void eval_kern_exc_vxc_mgga( const functional_type& func, size_t npts, double* eps, double* vrho, double* vgamma, double* vtau, double* vlapl, device_queue queue ); +void eval_kern_vxc_fxc_lda( const functional_type& func, size_t npts, + const double* rho, double* vrho, double* v2rho2, device_queue queue ); +void eval_kern_vxc_fxc_gga( const functional_type& func, size_t npts, + const double* rho, const double* gamma, double* vrho, double* vgamma, + double* v2rho2, double* v2rhogamma, double* v2gamma2, device_queue queue ); +void eval_kern_vxc_fxc_mgga( const functional_type& func, size_t npts, + const double* rho, const double* gamma, const double* lapl, const double* tau, + double* vrho, double* vgamma, double* vlapl, double* vtau, + double* v2rho2, double* v2rhogamma, double* v2rholapl, double* v2rhotau, + double* v2gamma2, double* v2gammalapl, double* v2gammatau, double* v2lapl2, + double* v2lapltau, double* v2tau2, device_queue queue ); + } diff --git a/src/xc_integrator/local_work_driver/device/common/zmat_fxc.hpp b/src/xc_integrator/local_work_driver/device/common/zmat_fxc.hpp new file mode 100644 index 00000000..739afc81 --- /dev/null +++ b/src/xc_integrator/local_work_driver/device/common/zmat_fxc.hpp @@ -0,0 +1,47 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
+ * + * See LICENSE.txt for details + */ +#include "device/xc_device_task.hpp" +#include "device/xc_device_data.hpp" +#include "device/device_queue.hpp" + +namespace GauXC { + +void zmat_lda_fxc( size_t ntasks, + int32_t max_nbf, + int32_t max_npts, + XCDeviceTask* tasks_device, + density_id sel, + device_queue queue ); + +void zmat_gga_fxc( size_t ntasks, + int32_t max_nbf, + int32_t max_npts, + XCDeviceTask* tasks_device, + density_id sel, + device_queue queue ); + +void zmat_mgga_fxc( size_t ntasks, + int32_t max_nbf, + int32_t max_npts, + XCDeviceTask* tasks_device, + bool do_lapl, + density_id sel, + device_queue queue ); + +void mmat_mgga_fxc( size_t ntasks, + int32_t max_nbf, + int32_t max_npts, + XCDeviceTask* tasks_device, + bool do_lapl, + density_id sel, + device_queue queue ); +} diff --git a/src/xc_integrator/local_work_driver/device/common/zmat_vxc.hpp b/src/xc_integrator/local_work_driver/device/common/zmat_vxc.hpp index 514b6d38..3c48967f 100644 --- a/src/xc_integrator/local_work_driver/device/common/zmat_vxc.hpp +++ b/src/xc_integrator/local_work_driver/device/common/zmat_vxc.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -32,6 +36,8 @@ void zmat_mgga_vxc( size_t ntasks, int32_t max_npts, XCDeviceTask* tasks_device, bool do_lapl, + integrator_ks_scheme s, + density_id sel, device_queue queue ); void mmat_mgga_vxc( size_t ntasks, @@ -39,5 +45,7 @@ void mmat_mgga_vxc( size_t ntasks, int32_t max_npts, XCDeviceTask* tasks_device, bool do_lapl, + integrator_ks_scheme s, + density_id sel, device_queue queue ); } diff --git a/src/xc_integrator/local_work_driver/device/cuda/CMakeLists.txt b/src/xc_integrator/local_work_driver/device/cuda/CMakeLists.txt index 7320d3c1..8c0608f7 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/CMakeLists.txt +++ b/src/xc_integrator/local_work_driver/device/cuda/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # @@ -20,6 +24,7 @@ target_sources(gauxc PRIVATE kernels/cublas_extensions.cu kernels/uvvars.cu kernels/zmat_vxc.cu + kernels/zmat_fxc.cu kernels/cuda_inc_potential.cu kernels/symmetrize_mat.cu kernels/increment_exc_grad.cu diff --git a/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1.cxx b/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1.cxx index fc081817..4e01b373 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1.cxx +++ b/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1.cxx @@ -1,13 +1,18 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). 
+ * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ #include "cuda_aos_scheme1.hpp" #include "device/cuda/cuda_backend.hpp" #include "cuda_aos_scheme1_weights.hpp" +#include "device/common/device_blas.hpp" namespace GauXC { @@ -37,6 +42,41 @@ void CudaAoSScheme1::partition_weights( XCDeviceData* _data ) { scheme1_stack.dist_nearest_device, base_stack.weights_device, *device_backend->master_stream ); } +template +void CudaAoSScheme1::eval_weight_1st_deriv_contracted( XCDeviceData* _data, XCWeightAlg alg ) { + if( alg != XCWeightAlg::SSF ) { + GAUXC_GENERIC_EXCEPTION("Weight Algorithm NYI for CUDA AoS Scheme1"); + } + auto* data = dynamic_cast(_data); + if( !data ) GAUXC_BAD_LWD_DATA_CAST(); + + auto device_backend = dynamic_cast(data->device_backend_); + if( !device_backend ) GAUXC_BAD_BACKEND_CAST(); + + // make w times f vector + const bool is_UKS = data->allocated_terms.ks_scheme == UKS; + const bool is_GKS = data->allocated_terms.ks_scheme == GKS; + const bool is_pol = is_UKS or is_GKS; + auto base_stack = data->base_stack; + if( is_pol ) + increment( data->device_backend_->master_blas_handle(), base_stack.den_z_eval_device, + base_stack.den_s_eval_device, data->total_npts_task_batch ); + + hadamard_product(data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, base_stack.den_s_eval_device, 1, + base_stack.eps_eval_device, 1); + + + // Compute distances from grid to atomic centers + const auto ldatoms = data->get_ldatoms(); + auto static_stack = data->static_stack; + auto scheme1_stack = data->scheme1_stack; + cuda_aos_scheme1_weight_1st_deriv_wrapper( data->total_npts_task_batch, data->global_dims.natoms, + base_stack.points_x_device, base_stack.points_y_device, base_stack.points_z_device, + static_stack.rab_device, ldatoms, static_stack.coords_device, + scheme1_stack.dist_scratch_device, ldatoms, scheme1_stack.iparent_device, + scheme1_stack.dist_nearest_device, base_stack.eps_eval_device, static_stack.exc_grad_device, *device_backend->master_stream ); +} + template struct CudaAoSScheme1; #ifdef GAUXC_HAS_MAGMA diff --git a/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1.hpp b/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1.hpp index aa431f72..cba52e14 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -33,6 +37,7 @@ struct CudaAoSScheme1 : public Base { // API Overrides void partition_weights( XCDeviceData* ) override final; + void eval_weight_1st_deriv_contracted( XCDeviceData*, XCWeightAlg ) override final; std::unique_ptr create_device_data(const DeviceRuntimeEnvironment&) override final; diff --git a/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1_data.cxx b/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1_data.cxx index 8640fffe..9da703a9 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1_data.cxx +++ b/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1_data.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1_weights.cu b/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1_weights.cu index 7e74225c..deeba830 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1_weights.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1_weights.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -33,7 +37,7 @@ void cuda_aos_scheme1_weights_wrapper( int32_t npts, int32_t natoms, compute_grid_to_center_dist( npts, natoms, coords, points_x, points_y, points_z, dist, lddist, stream ); -#if 1 +#if 0 // Get the number of SM's on the device int num_sm; int dev_id = 0; @@ -49,11 +53,32 @@ void cuda_aos_scheme1_weights_wrapper( int32_t npts, int32_t natoms, weights ); #else - partition_weights_ssf_1d( npts, natoms, RAB, natoms, coords, dist, lddist, + partition_weights_ssf_1d( npts, natoms, RAB, ldRAB, coords, dist, lddist, iparent, dist_nearest, weights, stream ); #endif +} + + +void cuda_aos_scheme1_weight_1st_deriv_wrapper( + int32_t npts, int32_t natoms, + const double* points_x, const double* points_y, const double* points_z, + const double* RAB, int32_t ldRAB, const double* coords, + double* dist, int32_t lddist, const int32_t* iparent, + const double* dist_nearest, const double* w_times_f, + double* exc_grad_w, cudaStream_t stream ){ + + // Compute distances from grid to atomic centers + compute_grid_to_center_dist( npts, natoms, coords, points_x, points_y, points_z, + dist, lddist, stream ); + + eval_weight_1st_deriv_contracted_ssf_1d( npts, natoms, RAB, ldRAB, coords, points_x, points_y, points_z, dist, lddist, + iparent, dist_nearest, w_times_f, exc_grad_w, stream ); } + + + + } diff --git a/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1_weights.hpp b/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1_weights.hpp index 74674587..affd940f 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1_weights.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/cuda_aos_scheme1_weights.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -13,4 +17,12 @@ void cuda_aos_scheme1_weights_wrapper( int32_t npts, int32_t natoms, double* dist, int32_t lddist, const int32_t* iparent, const double* dist_nearest, double* weights, cudaStream_t stream ); +void cuda_aos_scheme1_weight_1st_deriv_wrapper( + int32_t npts, int32_t natoms, + const double* points_x, const double* points_y, const double* points_z, + const double* RAB, int32_t ldRAB, const double* coords, + double* dist, int32_t lddist, const int32_t* iparent, + const double* dist_nearest, const double* w_times_f, + double* exc_grad_w, cudaStream_t stream ); + } diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_angular_cartesian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_angular_cartesian.hpp index 08015c07..2ef18899 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_angular_cartesian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_angular_cartesian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). 
+ * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_angular_spherical_unnorm.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_angular_spherical_unnorm.hpp index f948c4fb..71b17b60 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_angular_spherical_unnorm.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_angular_spherical_unnorm.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_device_constants.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_device_constants.hpp index b6f4ad63..216a6326 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_device_constants.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_device_constants.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_radial.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_radial.hpp index 39062a03..81968490 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_radial.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_radial.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l0.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l0.hpp index f8a97e8c..17b201a4 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l0.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l0.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. 
+ * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -64,7 +68,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_eval = task->bf + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -93,7 +96,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel } - + // Common Subexpressions + // Evaluate basis function basis_eval[ipt + 0*npts] = radial_eval; @@ -105,6 +109,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel + + #if 0 // Evaluate the angular part of bfn diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l0_gradient.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l0_gradient.hpp index 9c6dcbfe..627a7936 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l0_gradient.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l0_gradient.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -67,7 +71,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -99,7 +102,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; - + // Common Subexpressions + // Evaluate basis function basis_eval[ipt + 0*npts] = radial_eval; @@ -119,6 +123,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel + + #if 0 // Evaluate the angular part of bfn diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l0_hessian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l0_hessian.hpp index f6b3e63d..7543270b 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l0_hessian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l0_hessian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_cartesian_hessian_0( +__global__ __launch_bounds__(256,2) void collocation_device_shell_to_task_kernel_cartesian_hessian_0( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[8][detail::shell_nprim_max + 1]; + __shared__ double coeff[8][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; @@ -108,7 +111,12 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = radial_eval_alpha_squared*(x*x); + const auto x1 = radial_eval_alpha_squared*x; + const auto x2 = radial_eval_alpha_squared*(y*y); + const auto x3 = radial_eval_alpha_squared*(z*z); + // Evaluate basis function basis_eval[ipt + 0*npts] = radial_eval; @@ -125,22 +133,24 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_z_eval[ipt + 0*npts] = radial_eval_alpha*z; // Evaluate second derivative of bfn wrt xx - basis_xx_eval[ipt + 0*npts] = radial_eval_alpha + radial_eval_alpha_squared*x*x; + basis_xx_eval[ipt + 0*npts] = radial_eval_alpha + x0; // Evaluate second derivative of bfn wrt xy - basis_xy_eval[ipt + 0*npts] = radial_eval_alpha_squared*x*y; + basis_xy_eval[ipt + 0*npts] = x1*y; // Evaluate second derivative of bfn wrt xz - basis_xz_eval[ipt + 0*npts] = radial_eval_alpha_squared*x*z; + basis_xz_eval[ipt + 0*npts] = x1*z; // Evaluate second derivative of bfn wrt yy - basis_yy_eval[ipt + 0*npts] = radial_eval_alpha + radial_eval_alpha_squared*y*y; + basis_yy_eval[ipt + 0*npts] = radial_eval_alpha + x2; // Evaluate second derivative of bfn wrt yz basis_yz_eval[ipt + 0*npts] = radial_eval_alpha_squared*y*z; // Evaluate second derivative of bfn wrt zz - basis_zz_eval[ipt + 0*npts] = radial_eval_alpha + radial_eval_alpha_squared*z*z; + basis_zz_eval[ipt + 0*npts] = radial_eval_alpha + x3; + + diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l0_lapgrad.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l0_lapgrad.hpp new file mode 100644 index 00000000..d12623d1 --- /dev/null +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l0_lapgrad.hpp @@ -0,0 +1,208 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
+ * + * See LICENSE.txt for details + */ +#pragma once +#include "collocation_device_constants.hpp" +#include "device/xc_device_task.hpp" +#include "device_specific/cuda_device_constants.hpp" +#include "device/common/shell_to_task.hpp" +#include + +namespace GauXC { + + +__global__ __launch_bounds__(256,2) void collocation_device_shell_to_task_kernel_cartesian_lapgrad_0( + uint32_t nshell, + ShellToTaskDevice* __restrict__ shell_to_task, + XCDeviceTask* __restrict__ device_tasks +) { + + + __shared__ double alpha[8][detail::shell_nprim_max + 1]; + __shared__ double coeff[8][detail::shell_nprim_max + 1]; + double* my_alpha = alpha[threadIdx.x/32]; + double* my_coeff = coeff[threadIdx.x/32]; + + for( auto ish = blockIdx.z; ish < nshell; ish += gridDim.z ) { + const uint32_t ntasks = shell_to_task[ish].ntask; + const auto shell = shell_to_task[ish].shell_device; + const auto task_idx = shell_to_task[ish].task_idx_device; + const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device; + + + // Load Shell Data into registers / SM + const uint32_t nprim = shell->nprim(); + const double3 O = *reinterpret_cast<const double3*>(shell->O_data()); + + const int global_warp_id = (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size; + const int nwarp_global = max((blockDim.x*gridDim.x) / cuda::warp_size,1); + + // Read in coeffs/exps into SM on first warp + { + auto* coeff_gm = shell->coeff_data(); + auto* alpha_gm = shell->alpha_data(); + static_assert( detail::shell_nprim_max == cuda::warp_size ); + const int warp_rank = threadIdx.x % cuda::warp_size; + my_alpha[warp_rank] = alpha_gm[warp_rank]; + my_coeff[warp_rank] = coeff_gm[warp_rank]; + } + + // Loop over tasks assigned to shells + // Place each task on a different warp + schedule across blocks + for( int itask = global_warp_id; itask < ntasks; itask += nwarp_global ) { + + const auto* task = device_tasks + task_idx[itask]; + const auto* __restrict__ points_x = task->points_x; + const auto* __restrict__ points_y = task->points_y; + const auto* __restrict__ points_z = task->points_z; + const uint32_t npts = task->npts; + const size_t shoff = task_shell_offs[itask] * npts; + + auto* __restrict__ basis_eval = task->bf + shoff; + auto* __restrict__ basis_x_eval = task->dbfx + shoff; + auto* __restrict__ basis_y_eval = task->dbfy + shoff; + auto* __restrict__ basis_z_eval = task->dbfz + shoff; + auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; + auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; + auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; + auto* __restrict__ basis_yy_eval = task->d2bfyy + shoff; + auto* __restrict__ basis_yz_eval = task->d2bfyz + shoff; + auto* __restrict__ basis_zz_eval = task->d2bfzz + shoff; + auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; + auto* __restrict__ basis_lapl_x_eval = task->d3bflapl_x + shoff; + auto* __restrict__ basis_lapl_y_eval = task->d3bflapl_y + shoff; + auto* __restrict__ basis_lapl_z_eval = task->d3bflapl_z + shoff; + + // Loop over points in task + // Assign each point to separate thread within the warp + #pragma unroll 1 + for( int ipt = threadIdx.x % cuda::warp_size; ipt < npts; ipt += cuda::warp_size ) { + //const double3 point = points[ipt]; + double3 point; + point.x = points_x[ipt]; + point.y = points_y[ipt]; + point.z = points_z[ipt]; + + + const auto x = point.x - O.x; + const auto y = point.y - O.y; + const auto z = point.z - O.z; + const auto rsq = x*x + y*y + z*z; + + // Evaluate radial part of bfn + double radial_eval = 0.; + double radial_eval_alpha = 0.; + 
double radial_eval_alpha_squared = 0.; + double radial_eval_alpha_cubed = 0.; + + #pragma unroll 1 + for( uint32_t i = 0; i < nprim; ++i ) { + const auto a = my_alpha[i]; + const auto e = my_coeff[i] * std::exp( - a * rsq ); + + radial_eval += e; + radial_eval_alpha += a * e; + radial_eval_alpha_squared += a * a * e; + radial_eval_alpha_cubed += a * a * a * e; + } + + radial_eval_alpha *= -2; + radial_eval_alpha_squared *= 4; + radial_eval_alpha_cubed *= -8; + + // Common Subexpressions + const auto x0 = x*x; + const auto x1 = radial_eval_alpha_squared*x0; + const auto x2 = radial_eval_alpha_squared*x; + const auto x3 = y*y; + const auto x4 = radial_eval_alpha_squared*x3; + const auto x5 = radial_eval_alpha_squared*y; + const auto x6 = z*z; + const auto x7 = radial_eval_alpha_squared*x6; + const auto x8 = radial_eval_alpha_cubed*x; + const auto x9 = radial_eval_alpha_cubed*y; + const auto x10 = radial_eval_alpha_cubed*z; + + + // Evaluate basis function + basis_eval[ipt + 0*npts] = radial_eval; + + + + // Evaluate first derivative of bfn wrt x + basis_x_eval[ipt + 0*npts] = radial_eval_alpha*x; + + // Evaluate first derivative of bfn wrt y + basis_y_eval[ipt + 0*npts] = radial_eval_alpha*y; + + // Evaluate first derivative of bfn wrt z + basis_z_eval[ipt + 0*npts] = radial_eval_alpha*z; + + // Evaluate second derivative of bfn wrt xx + basis_xx_eval[ipt + 0*npts] = radial_eval_alpha + x1; + + // Evaluate second derivative of bfn wrt xy + basis_xy_eval[ipt + 0*npts] = x2*y; + + // Evaluate second derivative of bfn wrt xz + basis_xz_eval[ipt + 0*npts] = x2*z; + + // Evaluate second derivative of bfn wrt yy + basis_yy_eval[ipt + 0*npts] = radial_eval_alpha + x4; + + // Evaluate second derivative of bfn wrt yz + basis_yz_eval[ipt + 0*npts] = x5*z; + + // Evaluate second derivative of bfn wrt zz + basis_zz_eval[ipt + 0*npts] = radial_eval_alpha + x7; + + // Evaluate Laplacian of bfn + basis_lapl_eval[ipt + 0*npts] = 3.0*radial_eval_alpha + x1 + x4 + x7; + + // Evaluate Laplacian gradient of bfn (dx) + basis_lapl_x_eval[ipt + 0*npts] = radial_eval_alpha_cubed*(x*x*x) + 5.0*x2 + x3*x8 + x6*x8; + // Evaluate Laplacian gradient of bfn (dy) + basis_lapl_y_eval[ipt + 0*npts] = radial_eval_alpha_cubed*(y*y*y) + x0*x9 + 5.0*x5 + x6*x9; + // Evaluate Laplacian gradient of bfn (dz) + basis_lapl_z_eval[ipt + 0*npts] = radial_eval_alpha_cubed*(z*z*z) + 5.0*radial_eval_alpha_squared*z + x0*x10 + x10*x3; + + + + +#if 0 + // Evaluate the angular part of bfn + + + + double ang_eval_0; + + + ang_eval_0 = radial_eval; + basis_eval[ipt + 0*npts] = ang_eval_0; + + + double dang_eval_x_0, dang_eval_y_0, dang_eval_z_0; + + dang_eval_x_0 = radial_eval_alpha*x; + dang_eval_y_0 = radial_eval_alpha*y; + dang_eval_z_0 = radial_eval_alpha*z; + basis_x_eval[ipt + 0*npts] = dang_eval_x_0; + basis_y_eval[ipt + 0*npts] = dang_eval_y_0; + basis_z_eval[ipt + 0*npts] = dang_eval_z_0; + +#endif + } // Loop over points within task + } // Loop over tasks + + } // Loop over shells +} // end kernel + +} // namespace GauXC diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l0_laplacian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l0_laplacian.hpp index 06bfc86a..6fe1ec2d 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l0_laplacian.hpp +++ 
b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l0_laplacian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_cartesian_laplacian_0( +__global__ __launch_bounds__(256,2) void collocation_device_shell_to_task_kernel_cartesian_laplacian_0( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[8][detail::shell_nprim_max + 1]; + __shared__ double coeff[8][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; // Loop over points in task @@ -103,7 +106,12 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = radial_eval_alpha_squared*(x*x); + const auto x1 = radial_eval_alpha_squared*x; + const auto x2 = radial_eval_alpha_squared*(y*y); + const auto x3 = radial_eval_alpha_squared*(z*z); + // Evaluate basis function basis_eval[ipt + 0*npts] = radial_eval; @@ -119,8 +127,10 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel // Evaluate first derivative of bfn wrt z basis_z_eval[ipt + 0*npts] = radial_eval_alpha*z; + // Evaluate Laplacian of bfn - basis_lapl_eval[ipt + 0*npts] = 3*radial_eval_alpha + radial_eval_alpha_squared*x*x + radial_eval_alpha_squared*y*y + radial_eval_alpha_squared*z*z; + basis_lapl_eval[ipt + 0*npts] = 3.0*radial_eval_alpha + x0 + x2 + x3; + diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l1.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l1.hpp index db904d1c..b2b4672d 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l1.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l1.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -64,7 +68,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_eval = task->bf + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -93,7 +96,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel } - + // Common Subexpressions + // Evaluate basis function basis_eval[ipt + 0*npts] = radial_eval*x; @@ -107,6 +111,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel + + #if 0 // Evaluate the angular part of bfn diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l1_gradient.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l1_gradient.hpp index 6838f2fa..2aaabc26 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l1_gradient.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l1_gradient.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_cartesian_gradient_1( +__global__ __launch_bounds__(256,2) void collocation_device_shell_to_task_kernel_cartesian_gradient_1( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[8][detail::shell_nprim_max + 1]; + __shared__ double coeff[8][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -67,7 +71,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -99,7 +102,12 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; - + // Common Subexpressions + const auto x0 = radial_eval_alpha*x; + const auto x1 = x0*y; + const auto x2 = x0*z; + const auto x3 = radial_eval_alpha*y*z; + // Evaluate basis function basis_eval[ipt + 0*npts] = radial_eval*x; @@ -109,19 +117,21 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = radial_eval + radial_eval_alpha*x*x; - basis_x_eval[ipt + 1*npts] = radial_eval_alpha*x*y; - basis_x_eval[ipt + 2*npts] = radial_eval_alpha*x*z; + basis_x_eval[ipt + 0*npts] = radial_eval + radial_eval_alpha*(x*x); + basis_x_eval[ipt + 1*npts] = x1; + basis_x_eval[ipt + 2*npts] = x2; // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = radial_eval_alpha*x*y; - basis_y_eval[ipt + 1*npts] = 
radial_eval + radial_eval_alpha*y*y; - basis_y_eval[ipt + 2*npts] = radial_eval_alpha*y*z; + basis_y_eval[ipt + 0*npts] = x1; + basis_y_eval[ipt + 1*npts] = radial_eval + radial_eval_alpha*(y*y); + basis_y_eval[ipt + 2*npts] = x3; // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = radial_eval_alpha*x*z; - basis_z_eval[ipt + 1*npts] = radial_eval_alpha*y*z; - basis_z_eval[ipt + 2*npts] = radial_eval + radial_eval_alpha*z*z; + basis_z_eval[ipt + 0*npts] = x2; + basis_z_eval[ipt + 1*npts] = x3; + basis_z_eval[ipt + 2*npts] = radial_eval + radial_eval_alpha*(z*z); + + @@ -149,15 +159,15 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_1, dang_eval_y_1, dang_eval_z_1; double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; - dang_eval_x_0 = radial_eval + radial_eval_alpha*x*x; - dang_eval_y_0 = radial_eval_alpha*x*y; - dang_eval_z_0 = radial_eval_alpha*x*z; - dang_eval_x_1 = radial_eval_alpha*x*y; - dang_eval_y_1 = radial_eval + radial_eval_alpha*y*y; - dang_eval_z_1 = radial_eval_alpha*y*z; - dang_eval_x_2 = radial_eval_alpha*x*z; - dang_eval_y_2 = radial_eval_alpha*y*z; - dang_eval_z_2 = radial_eval + radial_eval_alpha*z*z; + dang_eval_x_0 = radial_eval + radial_eval_alpha*(x*x); + dang_eval_y_0 = x1; + dang_eval_z_0 = x2; + dang_eval_x_1 = x1; + dang_eval_y_1 = radial_eval + radial_eval_alpha*(y*y); + dang_eval_z_1 = x3; + dang_eval_x_2 = x2; + dang_eval_y_2 = x3; + dang_eval_z_2 = radial_eval + radial_eval_alpha*(z*z); basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l1_hessian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l1_hessian.hpp index 26f5bc7c..2047493d 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l1_hessian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l1_hessian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_cartesian_hessian_1( +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_cartesian_hessian_1( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; @@ -108,7 +111,31 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = x*x; + const auto x1 = radial_eval_alpha*x; + const auto x2 = x1*y; + const auto x3 = x1*z; + const auto x4 = y*y; + const auto x5 = y*z; + const auto x6 = radial_eval_alpha*x5; + const auto x7 = z*z; + const auto x8 = 3.0*radial_eval_alpha; + const auto x9 = radial_eval_alpha_squared*x0; + const auto x10 = radial_eval_alpha + x9; + const auto x11 = x10*y; + const auto x12 = x10*z; + const auto x13 = radial_eval_alpha_squared*x4; + const auto x14 = radial_eval_alpha + x13; + const auto x15 = x*x14; + const auto x16 = radial_eval_alpha_squared*x*x5; + const auto x17 = radial_eval_alpha_squared*x7; + const auto x18 = radial_eval_alpha + x17; + const auto x19 = x*x18; + const auto x20 = x14*z; + const auto x21 = x18*y; + const auto x22 = 5.0*radial_eval_alpha + x13 + x17 + x9; + // Evaluate basis function basis_eval[ipt + 0*npts] = radial_eval*x; @@ -118,49 +145,51 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = radial_eval + radial_eval_alpha*x*x; - basis_x_eval[ipt + 1*npts] = radial_eval_alpha*x*y; - basis_x_eval[ipt + 2*npts] = radial_eval_alpha*x*z; + basis_x_eval[ipt + 0*npts] = radial_eval + radial_eval_alpha*x0; + basis_x_eval[ipt + 1*npts] = x2; + basis_x_eval[ipt + 2*npts] = x3; // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = radial_eval_alpha*x*y; - basis_y_eval[ipt + 1*npts] = radial_eval + radial_eval_alpha*y*y; - basis_y_eval[ipt + 2*npts] = radial_eval_alpha*y*z; + basis_y_eval[ipt + 0*npts] = x2; + basis_y_eval[ipt + 1*npts] = radial_eval + radial_eval_alpha*x4; + basis_y_eval[ipt + 2*npts] = x6; // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = radial_eval_alpha*x*z; - basis_z_eval[ipt + 1*npts] = radial_eval_alpha*y*z; - basis_z_eval[ipt + 2*npts] = radial_eval + radial_eval_alpha*z*z; + basis_z_eval[ipt + 0*npts] = x3; + basis_z_eval[ipt + 1*npts] = x6; + basis_z_eval[ipt + 2*npts] = radial_eval + radial_eval_alpha*x7; // Evaluate second derivative of bfn wrt xx - basis_xx_eval[ipt + 0*npts] = x*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 1*npts] = 
y*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 2*npts] = z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xx_eval[ipt + 0*npts] = x*(x8 + x9); + basis_xx_eval[ipt + 1*npts] = x11; + basis_xx_eval[ipt + 2*npts] = x12; // Evaluate second derivative of bfn wrt xy - basis_xy_eval[ipt + 0*npts] = y*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xy_eval[ipt + 1*npts] = x*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_xy_eval[ipt + 2*npts] = radial_eval_alpha_squared*x*y*z; + basis_xy_eval[ipt + 0*npts] = x11; + basis_xy_eval[ipt + 1*npts] = x15; + basis_xy_eval[ipt + 2*npts] = x16; // Evaluate second derivative of bfn wrt xz - basis_xz_eval[ipt + 0*npts] = z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xz_eval[ipt + 1*npts] = radial_eval_alpha_squared*x*y*z; - basis_xz_eval[ipt + 2*npts] = x*(radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_xz_eval[ipt + 0*npts] = x12; + basis_xz_eval[ipt + 1*npts] = x16; + basis_xz_eval[ipt + 2*npts] = x19; // Evaluate second derivative of bfn wrt yy - basis_yy_eval[ipt + 0*npts] = x*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 1*npts] = y*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 2*npts] = z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_yy_eval[ipt + 0*npts] = x15; + basis_yy_eval[ipt + 1*npts] = y*(x13 + x8); + basis_yy_eval[ipt + 2*npts] = x20; // Evaluate second derivative of bfn wrt yz - basis_yz_eval[ipt + 0*npts] = radial_eval_alpha_squared*x*y*z; - basis_yz_eval[ipt + 1*npts] = z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yz_eval[ipt + 2*npts] = y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_yz_eval[ipt + 0*npts] = x16; + basis_yz_eval[ipt + 1*npts] = x20; + basis_yz_eval[ipt + 2*npts] = x21; // Evaluate second derivative of bfn wrt zz - basis_zz_eval[ipt + 0*npts] = x*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 1*npts] = y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 2*npts] = z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_zz_eval[ipt + 0*npts] = x19; + basis_zz_eval[ipt + 1*npts] = x21; + basis_zz_eval[ipt + 2*npts] = z*(x17 + x8); + + @@ -187,15 +216,15 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_1, dang_eval_y_1, dang_eval_z_1; double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; - dang_eval_x_0 = radial_eval + radial_eval_alpha*x*x; - dang_eval_y_0 = radial_eval_alpha*x*y; - dang_eval_z_0 = radial_eval_alpha*x*z; - dang_eval_x_1 = radial_eval_alpha*x*y; - dang_eval_y_1 = radial_eval + radial_eval_alpha*y*y; - dang_eval_z_1 = radial_eval_alpha*y*z; - dang_eval_x_2 = radial_eval_alpha*x*z; - dang_eval_y_2 = radial_eval_alpha*y*z; - dang_eval_z_2 = radial_eval + radial_eval_alpha*z*z; + dang_eval_x_0 = radial_eval + radial_eval_alpha*x0; + dang_eval_y_0 = x2; + dang_eval_z_0 = x3; + dang_eval_x_1 = x2; + dang_eval_y_1 = radial_eval + radial_eval_alpha*x4; + dang_eval_z_1 = x6; + dang_eval_x_2 = x3; + dang_eval_y_2 = x6; + dang_eval_z_2 = radial_eval + radial_eval_alpha*x7; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l1_lapgrad.hpp 
b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l1_lapgrad.hpp new file mode 100644 index 00000000..9e6ea4c5 --- /dev/null +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l1_lapgrad.hpp @@ -0,0 +1,285 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#pragma once +#include "collocation_device_constants.hpp" +#include "device/xc_device_task.hpp" +#include "device_specific/cuda_device_constants.hpp" +#include "device/common/shell_to_task.hpp" +#include + +namespace GauXC { + + +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_cartesian_lapgrad_1( + uint32_t nshell, + ShellToTaskDevice* __restrict__ shell_to_task, + XCDeviceTask* __restrict__ device_tasks +) { + + + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; + double* my_alpha = alpha[threadIdx.x/32]; + double* my_coeff = coeff[threadIdx.x/32]; + + for( auto ish = blockIdx.z; ish < nshell; ish += gridDim.z ) { + const uint32_t ntasks = shell_to_task[ish].ntask; + const auto shell = shell_to_task[ish].shell_device; + const auto task_idx = shell_to_task[ish].task_idx_device; + const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device; + + + // Load Shell Data into registers / SM + const uint32_t nprim = shell->nprim(); + const double3 O = *reinterpret_cast<const double3*>(shell->O_data()); + + const int global_warp_id = (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size; + const int nwarp_global = max((blockDim.x*gridDim.x) / cuda::warp_size,1); + + // Read in coeffs/exps into SM on first warp + { + auto* coeff_gm = shell->coeff_data(); + auto* alpha_gm = shell->alpha_data(); + static_assert( detail::shell_nprim_max == cuda::warp_size ); + const int warp_rank = threadIdx.x % cuda::warp_size; + my_alpha[warp_rank] = alpha_gm[warp_rank]; + my_coeff[warp_rank] = coeff_gm[warp_rank]; + } + + // Loop over tasks assigned to shells + // Place each task on a different warp + schedule across blocks + for( int itask = global_warp_id; itask < ntasks; itask += nwarp_global ) { + + const auto* task = device_tasks + task_idx[itask]; + const auto* __restrict__ points_x = task->points_x; + const auto* __restrict__ points_y = task->points_y; + const auto* __restrict__ points_z = task->points_z; + const uint32_t npts = task->npts; + const size_t shoff = task_shell_offs[itask] * npts; + + auto* __restrict__ basis_eval = task->bf + shoff; + auto* __restrict__ basis_x_eval = task->dbfx + shoff; + auto* __restrict__ basis_y_eval = task->dbfy + shoff; + auto* __restrict__ basis_z_eval = task->dbfz + shoff; + auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; + auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; + auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; + auto* __restrict__ basis_yy_eval = task->d2bfyy + shoff; + auto* __restrict__ basis_yz_eval = task->d2bfyz + shoff; + auto* __restrict__ basis_zz_eval = task->d2bfzz + shoff; + auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; + auto* __restrict__ basis_lapl_x_eval = task->d3bflapl_x + shoff; + auto* __restrict__ basis_lapl_y_eval = task->d3bflapl_y + shoff; + 
auto* __restrict__ basis_lapl_z_eval = task->d3bflapl_z + shoff; + + // Loop over points in task + // Assign each point to separate thread within the warp + #pragma unroll 1 + for( int ipt = threadIdx.x % cuda::warp_size; ipt < npts; ipt += cuda::warp_size ) { + //const double3 point = points[ipt]; + double3 point; + point.x = points_x[ipt]; + point.y = points_y[ipt]; + point.z = points_z[ipt]; + + + const auto x = point.x - O.x; + const auto y = point.y - O.y; + const auto z = point.z - O.z; + const auto rsq = x*x + y*y + z*z; + + // Evaluate radial part of bfn + double radial_eval = 0.; + double radial_eval_alpha = 0.; + double radial_eval_alpha_squared = 0.; + double radial_eval_alpha_cubed = 0.; + + #pragma unroll 1 + for( uint32_t i = 0; i < nprim; ++i ) { + const auto a = my_alpha[i]; + const auto e = my_coeff[i] * std::exp( - a * rsq ); + + radial_eval += e; + radial_eval_alpha += a * e; + radial_eval_alpha_squared += a * a * e; + radial_eval_alpha_cubed += a * a * a * e; + } + + radial_eval_alpha *= -2; + radial_eval_alpha_squared *= 4; + radial_eval_alpha_cubed *= -8; + + // Common Subexpressions + const auto x0 = x*x; + const auto x1 = x0; + const auto x2 = radial_eval_alpha*x; + const auto x3 = x2*y; + const auto x4 = x2*z; + const auto x5 = y*y; + const auto x6 = x5; + const auto x7 = y*z; + const auto x8 = radial_eval_alpha*x7; + const auto x9 = z*z; + const auto x10 = x9; + const auto x11 = 3.0*radial_eval_alpha; + const auto x12 = radial_eval_alpha_squared*x1; + const auto x13 = radial_eval_alpha + x12; + const auto x14 = x13*y; + const auto x15 = x13*z; + const auto x16 = radial_eval_alpha_squared*x6; + const auto x17 = radial_eval_alpha + x16; + const auto x18 = x*x17; + const auto x19 = radial_eval_alpha_squared*x*x7; + const auto x20 = radial_eval_alpha_squared*x10; + const auto x21 = radial_eval_alpha + x20; + const auto x22 = x*x21; + const auto x23 = x17*z; + const auto x24 = x21*y; + const auto x25 = 5.0*radial_eval_alpha; + const auto x26 = x16 + x20 + x25; + const auto x27 = x12 + x26; + const auto x28 = 3.0*radial_eval_alpha_squared; + const auto x29 = radial_eval_alpha_cubed*(x*x*x); + const auto x30 = radial_eval_alpha_cubed*x6 + radial_eval_alpha_squared; + const auto x31 = radial_eval_alpha_cubed*x10 + radial_eval_alpha_squared; + const auto x32 = 5.0*radial_eval_alpha_squared; + const auto x33 = x*x30 + x*x31 + x*x32 + x29; + const auto x34 = radial_eval_alpha_cubed*(y*y*y); + const auto x35 = radial_eval_alpha_cubed*x1 + radial_eval_alpha_squared; + const auto x36 = x31*y + x32*y + x34 + x35*y; + const auto x37 = x12 + x25; + const auto x38 = radial_eval_alpha_cubed*(z*z*z); + const auto x39 = x30*z + x32*z + x35*z + x38; + + + // Evaluate basis function + basis_eval[ipt + 0*npts] = radial_eval*x; + basis_eval[ipt + 1*npts] = radial_eval*y; + basis_eval[ipt + 2*npts] = radial_eval*z; + + + + // Evaluate first derivative of bfn wrt x + basis_x_eval[ipt + 0*npts] = radial_eval + radial_eval_alpha*x1; + basis_x_eval[ipt + 1*npts] = x3; + basis_x_eval[ipt + 2*npts] = x4; + + // Evaluate first derivative of bfn wrt y + basis_y_eval[ipt + 0*npts] = x3; + basis_y_eval[ipt + 1*npts] = radial_eval + radial_eval_alpha*x6; + basis_y_eval[ipt + 2*npts] = x8; + + // Evaluate first derivative of bfn wrt z + basis_z_eval[ipt + 0*npts] = x4; + basis_z_eval[ipt + 1*npts] = x8; + basis_z_eval[ipt + 2*npts] = radial_eval + radial_eval_alpha*x10; + + // Evaluate second derivative of bfn wrt xx + basis_xx_eval[ipt + 0*npts] = x*(x11 + x12); + basis_xx_eval[ipt + 1*npts] = x14; + 
basis_xx_eval[ipt + 2*npts] = x15; + + // Evaluate second derivative of bfn wrt xy + basis_xy_eval[ipt + 0*npts] = x14; + basis_xy_eval[ipt + 1*npts] = x18; + basis_xy_eval[ipt + 2*npts] = x19; + + // Evaluate second derivative of bfn wrt xz + basis_xz_eval[ipt + 0*npts] = x15; + basis_xz_eval[ipt + 1*npts] = x19; + basis_xz_eval[ipt + 2*npts] = x22; + + // Evaluate second derivative of bfn wrt yy + basis_yy_eval[ipt + 0*npts] = x18; + basis_yy_eval[ipt + 1*npts] = y*(x11 + x16); + basis_yy_eval[ipt + 2*npts] = x23; + + // Evaluate second derivative of bfn wrt yz + basis_yz_eval[ipt + 0*npts] = x19; + basis_yz_eval[ipt + 1*npts] = x23; + basis_yz_eval[ipt + 2*npts] = x24; + + // Evaluate second derivative of bfn wrt zz + basis_zz_eval[ipt + 0*npts] = x22; + basis_zz_eval[ipt + 1*npts] = x24; + basis_zz_eval[ipt + 2*npts] = z*(x11 + x20); + + // Evaluate Laplacian of bfn + basis_lapl_eval[ipt + 0*npts] = x*x27; + basis_lapl_eval[ipt + 1*npts] = x27*y; + basis_lapl_eval[ipt + 2*npts] = x27*z; + + // Evaluate Laplacian gradient of bfn (dx) + basis_lapl_x_eval[ipt + 0*npts] = x*(x*x28 + x29) + x0*x30 + x0*x31 + x1*x28 + x26; + basis_lapl_x_eval[ipt + 1*npts] = x33*y; + basis_lapl_x_eval[ipt + 2*npts] = x33*z; + // Evaluate Laplacian gradient of bfn (dy) + basis_lapl_y_eval[ipt + 0*npts] = x*x36; + basis_lapl_y_eval[ipt + 1*npts] = x20 + x28*x6 + x31*x5 + x35*x5 + x37 + y*(x28*y + x34); + basis_lapl_y_eval[ipt + 2*npts] = x36*z; + // Evaluate Laplacian gradient of bfn (dz) + basis_lapl_z_eval[ipt + 0*npts] = x*x39; + basis_lapl_z_eval[ipt + 1*npts] = x39*y; + basis_lapl_z_eval[ipt + 2*npts] = x10*x28 + x16 + x30*x9 + x35*x9 + x37 + z*(x28*z + x38); + + + + +#if 0 + // Evaluate the angular part of bfn + + + + double ang_eval_0; + double ang_eval_1; + double ang_eval_2; + + + ang_eval_0 = radial_eval*x; + ang_eval_1 = radial_eval*y; + ang_eval_2 = radial_eval*z; + basis_eval[ipt + 0*npts] = ang_eval_0; + basis_eval[ipt + 1*npts] = ang_eval_1; + basis_eval[ipt + 2*npts] = ang_eval_2; + + + double dang_eval_x_0, dang_eval_y_0, dang_eval_z_0; + double dang_eval_x_1, dang_eval_y_1, dang_eval_z_1; + double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; + + dang_eval_x_0 = radial_eval + radial_eval_alpha*x1; + dang_eval_y_0 = x3; + dang_eval_z_0 = x4; + dang_eval_x_1 = x3; + dang_eval_y_1 = radial_eval + radial_eval_alpha*x6; + dang_eval_z_1 = x8; + dang_eval_x_2 = x4; + dang_eval_y_2 = x8; + dang_eval_z_2 = radial_eval + radial_eval_alpha*x10; + basis_x_eval[ipt + 0*npts] = dang_eval_x_0; + basis_y_eval[ipt + 0*npts] = dang_eval_y_0; + basis_z_eval[ipt + 0*npts] = dang_eval_z_0; + basis_x_eval[ipt + 1*npts] = dang_eval_x_1; + basis_y_eval[ipt + 1*npts] = dang_eval_y_1; + basis_z_eval[ipt + 1*npts] = dang_eval_z_1; + basis_x_eval[ipt + 2*npts] = dang_eval_x_2; + basis_y_eval[ipt + 2*npts] = dang_eval_y_2; + basis_z_eval[ipt + 2*npts] = dang_eval_z_2; + +#endif + } // Loop over points within task + } // Loop over tasks + + } // Loop over shells +} // end kernel + +} // namespace GauXC diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l1_laplacian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l1_laplacian.hpp index aa5cb4de..ae4d6cc5 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l1_laplacian.hpp +++ 
b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l1_laplacian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_cartesian_laplacian_1( +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_cartesian_laplacian_1( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; // Loop over points in task @@ -103,7 +106,31 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = x*x; + const auto x1 = radial_eval_alpha*x; + const auto x2 = x1*y; + const auto x3 = x1*z; + const auto x4 = y*y; + const auto x5 = y*z; + const auto x6 = radial_eval_alpha*x5; + const auto x7 = z*z; + const auto x8 = 3.0*radial_eval_alpha; + const auto x9 = radial_eval_alpha_squared*x0; + const auto x10 = radial_eval_alpha + x9; + const auto x11 = x10*y; + const auto x12 = x10*z; + const auto x13 = radial_eval_alpha_squared*x4; + const auto x14 = radial_eval_alpha + x13; + const auto x15 = x*x14; + const auto x16 = radial_eval_alpha_squared*x*x5; + const auto x17 = radial_eval_alpha_squared*x7; + const auto x18 = radial_eval_alpha + x17; + const auto x19 = x*x18; + const auto x20 = x14*z; + const auto x21 = x18*y; + const auto x22 = 5.0*radial_eval_alpha + x13 + x17 + x9; + // Evaluate basis function basis_eval[ipt + 0*npts] = radial_eval*x; @@ -113,24 +140,26 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = radial_eval + radial_eval_alpha*x*x; - basis_x_eval[ipt + 1*npts] = radial_eval_alpha*x*y; - basis_x_eval[ipt + 2*npts] = radial_eval_alpha*x*z; + basis_x_eval[ipt + 0*npts] = radial_eval + radial_eval_alpha*x0; + basis_x_eval[ipt + 1*npts] = x2; + basis_x_eval[ipt + 2*npts] = x3; // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = radial_eval_alpha*x*y; - basis_y_eval[ipt + 1*npts] = radial_eval + radial_eval_alpha*y*y; - basis_y_eval[ipt + 2*npts] = radial_eval_alpha*y*z; + basis_y_eval[ipt + 0*npts] = x2; + basis_y_eval[ipt + 1*npts] = radial_eval + radial_eval_alpha*x4; + basis_y_eval[ipt + 2*npts] = x6; // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = radial_eval_alpha*x*z; - 
basis_z_eval[ipt + 1*npts] = radial_eval_alpha*y*z; - basis_z_eval[ipt + 2*npts] = radial_eval + radial_eval_alpha*z*z; + basis_z_eval[ipt + 0*npts] = x3; + basis_z_eval[ipt + 1*npts] = x6; + basis_z_eval[ipt + 2*npts] = radial_eval + radial_eval_alpha*x7; + // Evaluate Laplacian of bfn - basis_lapl_eval[ipt + 0*npts] = x*(5*radial_eval_alpha + radial_eval_alpha_squared*x*x + radial_eval_alpha_squared*y*y + radial_eval_alpha_squared*z*z); - basis_lapl_eval[ipt + 1*npts] = y*(5*radial_eval_alpha + radial_eval_alpha_squared*x*x + radial_eval_alpha_squared*y*y + radial_eval_alpha_squared*z*z); - basis_lapl_eval[ipt + 2*npts] = z*(5*radial_eval_alpha + radial_eval_alpha_squared*x*x + radial_eval_alpha_squared*y*y + radial_eval_alpha_squared*z*z); + basis_lapl_eval[ipt + 0*npts] = x*x22; + basis_lapl_eval[ipt + 1*npts] = x22*y; + basis_lapl_eval[ipt + 2*npts] = x22*z; + @@ -157,15 +186,15 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_1, dang_eval_y_1, dang_eval_z_1; double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; - dang_eval_x_0 = radial_eval + radial_eval_alpha*x*x; - dang_eval_y_0 = radial_eval_alpha*x*y; - dang_eval_z_0 = radial_eval_alpha*x*z; - dang_eval_x_1 = radial_eval_alpha*x*y; - dang_eval_y_1 = radial_eval + radial_eval_alpha*y*y; - dang_eval_z_1 = radial_eval_alpha*y*z; - dang_eval_x_2 = radial_eval_alpha*x*z; - dang_eval_y_2 = radial_eval_alpha*y*z; - dang_eval_z_2 = radial_eval + radial_eval_alpha*z*z; + dang_eval_x_0 = radial_eval + radial_eval_alpha*x0; + dang_eval_y_0 = x2; + dang_eval_z_0 = x3; + dang_eval_x_1 = x2; + dang_eval_y_1 = radial_eval + radial_eval_alpha*x4; + dang_eval_z_1 = x6; + dang_eval_x_2 = x3; + dang_eval_y_2 = x6; + dang_eval_z_2 = radial_eval + radial_eval_alpha*x7; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l2.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l2.hpp index b50b7c21..504a0c4a 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l2.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l2.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -64,7 +68,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_eval = task->bf + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -93,15 +96,17 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel } - + // Common Subexpressions + const auto x0 = radial_eval*x; + // Evaluate basis function - basis_eval[ipt + 0*npts] = radial_eval*x*x; - basis_eval[ipt + 1*npts] = radial_eval*x*y; - basis_eval[ipt + 2*npts] = radial_eval*x*z; - basis_eval[ipt + 3*npts] = radial_eval*y*y; + basis_eval[ipt + 0*npts] = radial_eval*(x*x); + basis_eval[ipt + 1*npts] = x0*y; + basis_eval[ipt + 2*npts] = x0*z; + basis_eval[ipt + 3*npts] = radial_eval*(y*y); basis_eval[ipt + 4*npts] = radial_eval*y*z; - basis_eval[ipt + 5*npts] = radial_eval*z*z; + basis_eval[ipt + 5*npts] = radial_eval*(z*z); @@ -110,6 +115,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel + + #if 0 // Evaluate the angular part of bfn @@ -121,17 +128,17 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = radial_eval*x*x; - ang_eval_1 = radial_eval*x*y; - ang_eval_2 = radial_eval*x*z; - ang_eval_3 = radial_eval*y*y; + ang_eval_0 = radial_eval*(x*x); + ang_eval_1 = x0*y; + ang_eval_2 = x0*z; + ang_eval_3 = radial_eval*(y*y); basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; ang_eval_0 = radial_eval*y*z; - ang_eval_1 = radial_eval*z*z; + ang_eval_1 = radial_eval*(z*z); basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l2_gradient.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l2_gradient.hpp index 4c640ddd..8eb1cdc6 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l2_gradient.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l2_gradient.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_cartesian_gradient_2( +__global__ __launch_bounds__(256,2) void collocation_device_shell_to_task_kernel_cartesian_gradient_2( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[8][detail::shell_nprim_max + 1]; + __shared__ double coeff[8][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -67,7 +71,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -99,41 +102,56 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; - + // Common Subexpressions + const auto x0 = x*x; + const auto x1 = radial_eval*x; + const auto x2 = y*y; + const auto x3 = radial_eval*y; + const auto x4 = z*z; + const auto x5 = radial_eval + radial_eval_alpha*x0; + const auto x6 = radial_eval_alpha*x; + const auto x7 = x6*y*z; + const auto x8 = radial_eval_alpha*y; + const auto x9 = radial_eval + radial_eval_alpha*x2; + const auto x10 = radial_eval_alpha*z; + const auto x11 = radial_eval + radial_eval_alpha*x4; + // Evaluate basis function - basis_eval[ipt + 0*npts] = radial_eval*x*x; - basis_eval[ipt + 1*npts] = radial_eval*x*y; - basis_eval[ipt + 2*npts] = radial_eval*x*z; - basis_eval[ipt + 3*npts] = radial_eval*y*y; - basis_eval[ipt + 4*npts] = radial_eval*y*z; - basis_eval[ipt + 5*npts] = radial_eval*z*z; + basis_eval[ipt + 0*npts] = radial_eval*x0; + basis_eval[ipt + 1*npts] = x1*y; + basis_eval[ipt + 2*npts] = x1*z; + basis_eval[ipt + 3*npts] = radial_eval*x2; + basis_eval[ipt + 4*npts] = x3*z; + basis_eval[ipt + 5*npts] = radial_eval*x4; // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = x*(2*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 1*npts] = y*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 2*npts] = z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 3*npts] = radial_eval_alpha*x*y*y; - basis_x_eval[ipt + 4*npts] = radial_eval_alpha*x*y*z; - basis_x_eval[ipt + 5*npts] = radial_eval_alpha*x*z*z; + basis_x_eval[ipt + 0*npts] = radial_eval_alpha*(x*x*x) + 2.0*x1; + basis_x_eval[ipt + 1*npts] = x5*y; + basis_x_eval[ipt + 2*npts] = x5*z; + basis_x_eval[ipt + 3*npts] = x2*x6; + basis_x_eval[ipt + 4*npts] = x7; + basis_x_eval[ipt + 5*npts] = x4*x6; // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = radial_eval_alpha*x*x*y; - basis_y_eval[ipt + 1*npts] = x*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 2*npts] = radial_eval_alpha*x*y*z; - basis_y_eval[ipt + 3*npts] = y*(2*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 4*npts] = z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 5*npts] = radial_eval_alpha*y*z*z; + basis_y_eval[ipt + 0*npts] = x0*x8; + basis_y_eval[ipt + 1*npts] = x*x9; + basis_y_eval[ipt + 2*npts] = x7; + basis_y_eval[ipt + 3*npts] = radial_eval_alpha*(y*y*y) + 2.0*x3; + basis_y_eval[ipt + 4*npts] = x9*z; + basis_y_eval[ipt + 5*npts] = x4*x8; // Evaluate first derivative 
of bfn wrt z - basis_z_eval[ipt + 0*npts] = radial_eval_alpha*x*x*z; - basis_z_eval[ipt + 1*npts] = radial_eval_alpha*x*y*z; - basis_z_eval[ipt + 2*npts] = x*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 3*npts] = radial_eval_alpha*y*y*z; - basis_z_eval[ipt + 4*npts] = y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 5*npts] = z*(2*radial_eval + radial_eval_alpha*z*z); + basis_z_eval[ipt + 0*npts] = x0*x10; + basis_z_eval[ipt + 1*npts] = x7; + basis_z_eval[ipt + 2*npts] = x*x11; + basis_z_eval[ipt + 3*npts] = x10*x2; + basis_z_eval[ipt + 4*npts] = x11*y; + basis_z_eval[ipt + 5*npts] = z*(2.0*radial_eval + radial_eval_alpha*(z*z)); + + @@ -150,17 +168,17 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = radial_eval*x*x; - ang_eval_1 = radial_eval*x*y; - ang_eval_2 = radial_eval*x*z; - ang_eval_3 = radial_eval*y*y; + ang_eval_0 = radial_eval*x0; + ang_eval_1 = x1*y; + ang_eval_2 = x1*z; + ang_eval_3 = radial_eval*x2; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = radial_eval*y*z; - ang_eval_1 = radial_eval*z*z; + ang_eval_0 = x3*z; + ang_eval_1 = radial_eval*x4; basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; @@ -170,18 +188,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; - dang_eval_x_0 = x*(2*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = radial_eval_alpha*x*x*y; - dang_eval_z_0 = radial_eval_alpha*x*x*z; - dang_eval_x_1 = y*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = x*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_1 = radial_eval_alpha*x*y*z; - dang_eval_x_2 = z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_2 = radial_eval_alpha*x*y*z; - dang_eval_z_2 = x*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_3 = radial_eval_alpha*x*y*y; - dang_eval_y_3 = y*(2*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_3 = radial_eval_alpha*y*y*z; + dang_eval_x_0 = radial_eval_alpha*(x*x*x) + 2.0*x1; + dang_eval_y_0 = x0*x8; + dang_eval_z_0 = x0*x10; + dang_eval_x_1 = x5*y; + dang_eval_y_1 = x*x9; + dang_eval_z_1 = x7; + dang_eval_x_2 = x5*z; + dang_eval_y_2 = x7; + dang_eval_z_2 = x*x11; + dang_eval_x_3 = x2*x6; + dang_eval_y_3 = radial_eval_alpha*(y*y*y) + 2.0*x3; + dang_eval_z_3 = x10*x2; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; @@ -195,12 +213,12 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = radial_eval_alpha*x*y*z; - dang_eval_y_0 = z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = y*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_1 = radial_eval_alpha*x*z*z; - dang_eval_y_1 = radial_eval_alpha*y*z*z; - dang_eval_z_1 = z*(2*radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x7; + dang_eval_y_0 = x9*z; + dang_eval_z_0 = x11*y; + dang_eval_x_1 = x4*x6; + dang_eval_y_1 = x4*x8; + dang_eval_z_1 = z*(2.0*radial_eval + radial_eval_alpha*(z*z)); basis_x_eval[ipt + 4*npts] = dang_eval_x_0; basis_y_eval[ipt + 4*npts] = dang_eval_y_0; basis_z_eval[ipt + 4*npts] = dang_eval_z_0; diff --git 
a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l2_hessian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l2_hessian.hpp index d4b05d5e..8c76ac54 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l2_hessian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l2_hessian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_cartesian_hessian_2( +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_cartesian_hessian_2( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; @@ -108,89 +111,138 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = x*x; + const auto x1 = x*y; + const auto x2 = x*z; + const auto x3 = y*y; + const auto x4 = y*z; + const auto x5 = z*z; + const auto x6 = 2.0*radial_eval; + const auto x7 = x*x*x; + const auto x8 = radial_eval + radial_eval_alpha*x0; + const auto x9 = radial_eval_alpha*x; + const auto x10 = x4*x9; + const auto x11 = radial_eval_alpha*y; + const auto x12 = radial_eval_alpha*x3; + const auto x13 = radial_eval + x12; + const auto x14 = y*y*y; + const auto x15 = radial_eval_alpha*z; + const auto x16 = radial_eval_alpha*x5; + const auto x17 = radial_eval + x16; + const auto x18 = z*z*z; + const auto x19 = 4.0*radial_eval_alpha; + const auto x20 = radial_eval_alpha_squared*x0; + const auto x21 = radial_eval_alpha + x20; + const auto x22 = x0*x19 + x0*x21 + x6; + const auto x23 = 3.0*radial_eval_alpha; + const auto x24 = x20 + x23; + const auto x25 = x21*x3; + const auto x26 = x21*x4; + const auto x27 = x21*x5; + const auto x28 = radial_eval_alpha_squared*x7 + 2.0*x9; + const auto x29 = radial_eval_alpha_squared*x14 + 2.0*x11; + const auto x30 = radial_eval_alpha_squared*x3; + const auto x31 = radial_eval_alpha + x30; + const auto x32 = x2*x31; + const auto x33 = radial_eval_alpha_squared*x5; + const auto x34 = radial_eval_alpha + x33; + const auto 
x35 = x1*x34; + const auto x36 = radial_eval_alpha_squared*x18 + 2.0*x15; + const auto x37 = x0*x31; + const auto x38 = x23 + x30; + const auto x39 = x19*x3 + x3*x31 + x6; + const auto x40 = x31*x5; + const auto x41 = x0*x34; + const auto x42 = x23 + x33; + const auto x43 = x3*x34; + const auto x44 = x19*x5 + x34*x5 + x6; + const auto x45 = 7.0*radial_eval_alpha + x20 + x30 + x33; + // Evaluate basis function - basis_eval[ipt + 0*npts] = radial_eval*x*x; - basis_eval[ipt + 1*npts] = radial_eval*x*y; - basis_eval[ipt + 2*npts] = radial_eval*x*z; - basis_eval[ipt + 3*npts] = radial_eval*y*y; - basis_eval[ipt + 4*npts] = radial_eval*y*z; - basis_eval[ipt + 5*npts] = radial_eval*z*z; + basis_eval[ipt + 0*npts] = radial_eval*x0; + basis_eval[ipt + 1*npts] = radial_eval*x1; + basis_eval[ipt + 2*npts] = radial_eval*x2; + basis_eval[ipt + 3*npts] = radial_eval*x3; + basis_eval[ipt + 4*npts] = radial_eval*x4; + basis_eval[ipt + 5*npts] = radial_eval*x5; // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = x*(2*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 1*npts] = y*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 2*npts] = z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 3*npts] = radial_eval_alpha*x*y*y; - basis_x_eval[ipt + 4*npts] = radial_eval_alpha*x*y*z; - basis_x_eval[ipt + 5*npts] = radial_eval_alpha*x*z*z; + basis_x_eval[ipt + 0*npts] = radial_eval_alpha*x7 + x*x6; + basis_x_eval[ipt + 1*npts] = x8*y; + basis_x_eval[ipt + 2*npts] = x8*z; + basis_x_eval[ipt + 3*npts] = x3*x9; + basis_x_eval[ipt + 4*npts] = x10; + basis_x_eval[ipt + 5*npts] = x5*x9; // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = radial_eval_alpha*x*x*y; - basis_y_eval[ipt + 1*npts] = x*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 2*npts] = radial_eval_alpha*x*y*z; - basis_y_eval[ipt + 3*npts] = y*(2*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 4*npts] = z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 5*npts] = radial_eval_alpha*y*z*z; + basis_y_eval[ipt + 0*npts] = x0*x11; + basis_y_eval[ipt + 1*npts] = x*x13; + basis_y_eval[ipt + 2*npts] = x10; + basis_y_eval[ipt + 3*npts] = radial_eval_alpha*x14 + x6*y; + basis_y_eval[ipt + 4*npts] = x13*z; + basis_y_eval[ipt + 5*npts] = x11*x5; // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = radial_eval_alpha*x*x*z; - basis_z_eval[ipt + 1*npts] = radial_eval_alpha*x*y*z; - basis_z_eval[ipt + 2*npts] = x*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 3*npts] = radial_eval_alpha*y*y*z; - basis_z_eval[ipt + 4*npts] = y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 5*npts] = z*(2*radial_eval + radial_eval_alpha*z*z); + basis_z_eval[ipt + 0*npts] = x0*x15; + basis_z_eval[ipt + 1*npts] = x10; + basis_z_eval[ipt + 2*npts] = x*x17; + basis_z_eval[ipt + 3*npts] = x15*x3; + basis_z_eval[ipt + 4*npts] = x17*y; + basis_z_eval[ipt + 5*npts] = radial_eval_alpha*x18 + x6*z; // Evaluate second derivative of bfn wrt xx - basis_xx_eval[ipt + 0*npts] = 2*radial_eval + 5*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x; - basis_xx_eval[ipt + 1*npts] = x*y*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 2*npts] = x*z*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 3*npts] = y*y*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 4*npts] = y*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 5*npts] = 
z*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xx_eval[ipt + 0*npts] = x22; + basis_xx_eval[ipt + 1*npts] = x1*x24; + basis_xx_eval[ipt + 2*npts] = x2*x24; + basis_xx_eval[ipt + 3*npts] = x25; + basis_xx_eval[ipt + 4*npts] = x26; + basis_xx_eval[ipt + 5*npts] = x27; // Evaluate second derivative of bfn wrt xy - basis_xy_eval[ipt + 0*npts] = x*y*(2*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xy_eval[ipt + 1*npts] = radial_eval + radial_eval_alpha*x*x + radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y; - basis_xy_eval[ipt + 2*npts] = y*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xy_eval[ipt + 3*npts] = x*y*(2*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_xy_eval[ipt + 4*npts] = x*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_xy_eval[ipt + 5*npts] = radial_eval_alpha_squared*x*y*z*z; + basis_xy_eval[ipt + 0*npts] = x28*y; + basis_xy_eval[ipt + 1*npts] = radial_eval_alpha_squared*x0*x3 + x12 + x8; + basis_xy_eval[ipt + 2*npts] = x26; + basis_xy_eval[ipt + 3*npts] = x*x29; + basis_xy_eval[ipt + 4*npts] = x32; + basis_xy_eval[ipt + 5*npts] = radial_eval_alpha_squared*x1*x5; // Evaluate second derivative of bfn wrt xz - basis_xz_eval[ipt + 0*npts] = x*z*(2*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xz_eval[ipt + 1*npts] = y*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xz_eval[ipt + 2*npts] = radial_eval + radial_eval_alpha*x*x + radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z; - basis_xz_eval[ipt + 3*npts] = radial_eval_alpha_squared*x*y*y*z; - basis_xz_eval[ipt + 4*npts] = x*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_xz_eval[ipt + 5*npts] = x*z*(2*radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_xz_eval[ipt + 0*npts] = x28*z; + basis_xz_eval[ipt + 1*npts] = x26; + basis_xz_eval[ipt + 2*npts] = radial_eval_alpha_squared*x0*x5 + x16 + x8; + basis_xz_eval[ipt + 3*npts] = radial_eval_alpha_squared*x2*x3; + basis_xz_eval[ipt + 4*npts] = x35; + basis_xz_eval[ipt + 5*npts] = x*x36; // Evaluate second derivative of bfn wrt yy - basis_yy_eval[ipt + 0*npts] = x*x*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 1*npts] = x*y*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 2*npts] = x*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 3*npts] = 2*radial_eval + 5*radial_eval_alpha*y*y + radial_eval_alpha_squared*y*y*y*y; - basis_yy_eval[ipt + 4*npts] = y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 5*npts] = z*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_yy_eval[ipt + 0*npts] = x37; + basis_yy_eval[ipt + 1*npts] = x1*x38; + basis_yy_eval[ipt + 2*npts] = x32; + basis_yy_eval[ipt + 3*npts] = x39; + basis_yy_eval[ipt + 4*npts] = x38*x4; + basis_yy_eval[ipt + 5*npts] = x40; // Evaluate second derivative of bfn wrt yz - basis_yz_eval[ipt + 0*npts] = radial_eval_alpha_squared*x*x*y*z; - basis_yz_eval[ipt + 1*npts] = x*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yz_eval[ipt + 2*npts] = x*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_yz_eval[ipt + 3*npts] = y*z*(2*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yz_eval[ipt + 4*npts] = radial_eval + radial_eval_alpha*y*y + radial_eval_alpha*z*z + radial_eval_alpha_squared*y*y*z*z; - basis_yz_eval[ipt + 5*npts] = y*z*(2*radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_yz_eval[ipt + 0*npts] = 
radial_eval_alpha_squared*x0*x4; + basis_yz_eval[ipt + 1*npts] = x32; + basis_yz_eval[ipt + 2*npts] = x35; + basis_yz_eval[ipt + 3*npts] = x29*z; + basis_yz_eval[ipt + 4*npts] = radial_eval_alpha_squared*x3*x5 + x13 + x16; + basis_yz_eval[ipt + 5*npts] = x36*y; // Evaluate second derivative of bfn wrt zz - basis_zz_eval[ipt + 0*npts] = x*x*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 1*npts] = x*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 2*npts] = x*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 3*npts] = y*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 4*npts] = y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 5*npts] = 2*radial_eval + 5*radial_eval_alpha*z*z + radial_eval_alpha_squared*z*z*z*z; + basis_zz_eval[ipt + 0*npts] = x41; + basis_zz_eval[ipt + 1*npts] = x35; + basis_zz_eval[ipt + 2*npts] = x2*x42; + basis_zz_eval[ipt + 3*npts] = x43; + basis_zz_eval[ipt + 4*npts] = x4*x42; + basis_zz_eval[ipt + 5*npts] = x44; + + @@ -206,17 +258,17 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = radial_eval*x*x; - ang_eval_1 = radial_eval*x*y; - ang_eval_2 = radial_eval*x*z; - ang_eval_3 = radial_eval*y*y; + ang_eval_0 = radial_eval*x0; + ang_eval_1 = radial_eval*x1; + ang_eval_2 = radial_eval*x2; + ang_eval_3 = radial_eval*x3; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = radial_eval*y*z; - ang_eval_1 = radial_eval*z*z; + ang_eval_0 = radial_eval*x4; + ang_eval_1 = radial_eval*x5; basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; @@ -226,18 +278,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; - dang_eval_x_0 = x*(2*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = radial_eval_alpha*x*x*y; - dang_eval_z_0 = radial_eval_alpha*x*x*z; - dang_eval_x_1 = y*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = x*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_1 = radial_eval_alpha*x*y*z; - dang_eval_x_2 = z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_2 = radial_eval_alpha*x*y*z; - dang_eval_z_2 = x*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_3 = radial_eval_alpha*x*y*y; - dang_eval_y_3 = y*(2*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_3 = radial_eval_alpha*y*y*z; + dang_eval_x_0 = radial_eval_alpha*x7 + x*x6; + dang_eval_y_0 = x0*x11; + dang_eval_z_0 = x0*x15; + dang_eval_x_1 = x8*y; + dang_eval_y_1 = x*x13; + dang_eval_z_1 = x10; + dang_eval_x_2 = x8*z; + dang_eval_y_2 = x10; + dang_eval_z_2 = x*x17; + dang_eval_x_3 = x3*x9; + dang_eval_y_3 = radial_eval_alpha*x14 + x6*y; + dang_eval_z_3 = x15*x3; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; @@ -251,12 +303,12 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = radial_eval_alpha*x*y*z; - dang_eval_y_0 = z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = y*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_1 = radial_eval_alpha*x*z*z; - dang_eval_y_1 = 
radial_eval_alpha*y*z*z; - dang_eval_z_1 = z*(2*radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x10; + dang_eval_y_0 = x13*z; + dang_eval_z_0 = x17*y; + dang_eval_x_1 = x5*x9; + dang_eval_y_1 = x11*x5; + dang_eval_z_1 = radial_eval_alpha*x18 + x6*z; basis_x_eval[ipt + 4*npts] = dang_eval_x_0; basis_y_eval[ipt + 4*npts] = dang_eval_y_0; basis_z_eval[ipt + 4*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l2_lapgrad.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l2_lapgrad.hpp new file mode 100644 index 00000000..faa65ea8 --- /dev/null +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l2_lapgrad.hpp @@ -0,0 +1,400 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#pragma once +#include "collocation_device_constants.hpp" +#include "device/xc_device_task.hpp" +#include "device_specific/cuda_device_constants.hpp" +#include "device/common/shell_to_task.hpp" +#include + +namespace GauXC { + + +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_cartesian_lapgrad_2( + uint32_t nshell, + ShellToTaskDevice* __restrict__ shell_to_task, + XCDeviceTask* __restrict__ device_tasks +) { + + + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; + double* my_alpha = alpha[threadIdx.x/32]; + double* my_coeff = coeff[threadIdx.x/32]; + + for( auto ish = blockIdx.z; ish < nshell; ish += gridDim.z ) { + const uint32_t ntasks = shell_to_task[ish].ntask; + const auto shell = shell_to_task[ish].shell_device; + const auto task_idx = shell_to_task[ish].task_idx_device; + const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device; + + + // Load Shell Data into registers / SM + const uint32_t nprim = shell->nprim(); + const double3 O = *reinterpret_cast(shell->O_data()); + + const int global_warp_id = (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size; + const int nwarp_global = max((blockDim.x*gridDim.x) / cuda::warp_size,1); + + // Read in coeffs/exps into SM on first warp + { + auto* coeff_gm = shell->coeff_data(); + auto* alpha_gm = shell->alpha_data(); + static_assert( detail::shell_nprim_max == cuda::warp_size ); + const int warp_rank = threadIdx.x % cuda::warp_size; + my_alpha[warp_rank] = alpha_gm[warp_rank]; + my_coeff[warp_rank] = coeff_gm[warp_rank]; + } + + // Loop over tasks assigned to shells + // Place each task on a different warp + schedule across blocks + for( int itask = global_warp_id; itask < ntasks; itask += nwarp_global ) { + + const auto* task = device_tasks + task_idx[itask]; + const auto* __restrict__ points_x = task->points_x; + const auto* __restrict__ points_y = task->points_y; + const auto* __restrict__ points_z = task->points_z; + const uint32_t npts = task->npts; + const size_t shoff = task_shell_offs[itask] * npts; + + auto* __restrict__ basis_eval = task->bf + shoff; + auto* __restrict__ basis_x_eval = task->dbfx + shoff; + auto* __restrict__ basis_y_eval = task->dbfy + shoff; + auto* __restrict__ basis_z_eval = task->dbfz + shoff; + auto* __restrict__ 
basis_xx_eval = task->d2bfxx + shoff; + auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; + auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; + auto* __restrict__ basis_yy_eval = task->d2bfyy + shoff; + auto* __restrict__ basis_yz_eval = task->d2bfyz + shoff; + auto* __restrict__ basis_zz_eval = task->d2bfzz + shoff; + auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; + auto* __restrict__ basis_lapl_x_eval = task->d3bflapl_x + shoff; + auto* __restrict__ basis_lapl_y_eval = task->d3bflapl_y + shoff; + auto* __restrict__ basis_lapl_z_eval = task->d3bflapl_z + shoff; + + // Loop over points in task + // Assign each point to separate thread within the warp + #pragma unroll 1 + for( int ipt = threadIdx.x % cuda::warp_size; ipt < npts; ipt += cuda::warp_size ) { + //const double3 point = points[ipt]; + double3 point; + point.x = points_x[ipt]; + point.y = points_y[ipt]; + point.z = points_z[ipt]; + + + const auto x = point.x - O.x; + const auto y = point.y - O.y; + const auto z = point.z - O.z; + const auto rsq = x*x + y*y + z*z; + + // Evaluate radial part of bfn + double radial_eval = 0.; + double radial_eval_alpha = 0.; + double radial_eval_alpha_squared = 0.; + double radial_eval_alpha_cubed = 0.; + + #pragma unroll 1 + for( uint32_t i = 0; i < nprim; ++i ) { + const auto a = my_alpha[i]; + const auto e = my_coeff[i] * std::exp( - a * rsq ); + + radial_eval += e; + radial_eval_alpha += a * e; + radial_eval_alpha_squared += a * a * e; + radial_eval_alpha_cubed += a * a * a * e; + } + + radial_eval_alpha *= -2; + radial_eval_alpha_squared *= 4; + radial_eval_alpha_cubed *= -8; + + // Common Subexpressions + const auto x0 = x*x; + const auto x1 = x0; + const auto x2 = x*y; + const auto x3 = x*z; + const auto x4 = y*y; + const auto x5 = x4; + const auto x6 = y*z; + const auto x7 = z*z; + const auto x8 = x7; + const auto x9 = 2.0*radial_eval; + const auto x10 = x*x*x; + const auto x11 = radial_eval + radial_eval_alpha*x1; + const auto x12 = radial_eval_alpha*x; + const auto x13 = x12*x6; + const auto x14 = radial_eval_alpha*y; + const auto x15 = radial_eval_alpha*x5; + const auto x16 = radial_eval + x15; + const auto x17 = y*y*y; + const auto x18 = radial_eval_alpha*z; + const auto x19 = radial_eval_alpha*x8; + const auto x20 = radial_eval + x19; + const auto x21 = z*z*z; + const auto x22 = 4.0*radial_eval_alpha; + const auto x23 = radial_eval_alpha_squared*x1; + const auto x24 = radial_eval_alpha + x23; + const auto x25 = x1*x22 + x1*x24 + x9; + const auto x26 = 3.0*radial_eval_alpha; + const auto x27 = x23 + x26; + const auto x28 = x24*x5; + const auto x29 = x24*x6; + const auto x30 = x24*x8; + const auto x31 = 2.0*x12; + const auto x32 = radial_eval_alpha_squared*x10 + x31; + const auto x33 = 2.0*x14; + const auto x34 = radial_eval_alpha_squared*x17 + x33; + const auto x35 = radial_eval_alpha_squared*x5; + const auto x36 = radial_eval_alpha + x35; + const auto x37 = x3*x36; + const auto x38 = radial_eval_alpha_squared*x8; + const auto x39 = radial_eval_alpha + x38; + const auto x40 = x2*x39; + const auto x41 = 2.0*x18; + const auto x42 = radial_eval_alpha_squared*x21 + x41; + const auto x43 = x1*x36; + const auto x44 = x26 + x35; + const auto x45 = x22*x5 + x36*x5 + x9; + const auto x46 = x36*x8; + const auto x47 = x1*x39; + const auto x48 = x26 + x38; + const auto x49 = x39*x5; + const auto x50 = x22*x8 + x39*x8 + x9; + const auto x51 = x35 + x38; + const auto x52 = 7.0*radial_eval_alpha + x23 + x51; + const auto x53 = 2.0*x; + const auto x54 = 
radial_eval_alpha_cubed*x5 + radial_eval_alpha_squared; + const auto x55 = x1*x54; + const auto x56 = radial_eval_alpha_cubed*x8 + radial_eval_alpha_squared; + const auto x57 = x1*x56; + const auto x58 = radial_eval_alpha_squared*x; + const auto x59 = radial_eval_alpha_cubed*x10; + const auto x60 = 3.0*x58 + x59; + const auto x61 = 2.0*radial_eval_alpha_squared; + const auto x62 = x*x60 + x0*x54 + x0*x56 + x1*x61 + x22 + 3.0*x24 + x51; + const auto x63 = 4.0*x58; + const auto x64 = x5*x54; + const auto x65 = x5*x56; + const auto x66 = x54*x8; + const auto x67 = x56*x8; + const auto x68 = radial_eval_alpha_squared*y; + const auto x69 = 4.0*x68; + const auto x70 = radial_eval_alpha_cubed*x1 + radial_eval_alpha_squared; + const auto x71 = x1*x70; + const auto x72 = radial_eval_alpha_cubed*x17; + const auto x73 = 3.0*x68 + x72; + const auto x74 = x22 + x23; + const auto x75 = 3.0*x36 + x38 + x4*x56 + x4*x70 + x5*x61 + x73*y + x74; + const auto x76 = 2.0*y; + const auto x77 = x5*x70; + const auto x78 = x70*x8; + const auto x79 = radial_eval_alpha_squared*z; + const auto x80 = 4.0*x79; + const auto x81 = radial_eval_alpha_cubed*x21; + const auto x82 = 3.0*x79 + x81; + const auto x83 = x35 + 3.0*x39 + x54*x7 + x61*x8 + x7*x70 + x74 + x82*z; + const auto x84 = 2.0*z; + + + // Evaluate basis function + basis_eval[ipt + 0*npts] = radial_eval*x1; + basis_eval[ipt + 1*npts] = radial_eval*x2; + basis_eval[ipt + 2*npts] = radial_eval*x3; + basis_eval[ipt + 3*npts] = radial_eval*x5; + basis_eval[ipt + 4*npts] = radial_eval*x6; + basis_eval[ipt + 5*npts] = radial_eval*x8; + + + + // Evaluate first derivative of bfn wrt x + basis_x_eval[ipt + 0*npts] = radial_eval_alpha*x10 + x*x9; + basis_x_eval[ipt + 1*npts] = x11*y; + basis_x_eval[ipt + 2*npts] = x11*z; + basis_x_eval[ipt + 3*npts] = x12*x5; + basis_x_eval[ipt + 4*npts] = x13; + basis_x_eval[ipt + 5*npts] = x12*x8; + + // Evaluate first derivative of bfn wrt y + basis_y_eval[ipt + 0*npts] = x1*x14; + basis_y_eval[ipt + 1*npts] = x*x16; + basis_y_eval[ipt + 2*npts] = x13; + basis_y_eval[ipt + 3*npts] = radial_eval_alpha*x17 + x9*y; + basis_y_eval[ipt + 4*npts] = x16*z; + basis_y_eval[ipt + 5*npts] = x14*x8; + + // Evaluate first derivative of bfn wrt z + basis_z_eval[ipt + 0*npts] = x1*x18; + basis_z_eval[ipt + 1*npts] = x13; + basis_z_eval[ipt + 2*npts] = x*x20; + basis_z_eval[ipt + 3*npts] = x18*x5; + basis_z_eval[ipt + 4*npts] = x20*y; + basis_z_eval[ipt + 5*npts] = radial_eval_alpha*x21 + x9*z; + + // Evaluate second derivative of bfn wrt xx + basis_xx_eval[ipt + 0*npts] = x25; + basis_xx_eval[ipt + 1*npts] = x2*x27; + basis_xx_eval[ipt + 2*npts] = x27*x3; + basis_xx_eval[ipt + 3*npts] = x28; + basis_xx_eval[ipt + 4*npts] = x29; + basis_xx_eval[ipt + 5*npts] = x30; + + // Evaluate second derivative of bfn wrt xy + basis_xy_eval[ipt + 0*npts] = x32*y; + basis_xy_eval[ipt + 1*npts] = radial_eval_alpha_squared*x1*x5 + x11 + x15; + basis_xy_eval[ipt + 2*npts] = x29; + basis_xy_eval[ipt + 3*npts] = x*x34; + basis_xy_eval[ipt + 4*npts] = x37; + basis_xy_eval[ipt + 5*npts] = radial_eval_alpha_squared*x2*x8; + + // Evaluate second derivative of bfn wrt xz + basis_xz_eval[ipt + 0*npts] = x32*z; + basis_xz_eval[ipt + 1*npts] = x29; + basis_xz_eval[ipt + 2*npts] = radial_eval_alpha_squared*x1*x8 + x11 + x19; + basis_xz_eval[ipt + 3*npts] = radial_eval_alpha_squared*x3*x5; + basis_xz_eval[ipt + 4*npts] = x40; + basis_xz_eval[ipt + 5*npts] = x*x42; + + // Evaluate second derivative of bfn wrt yy + basis_yy_eval[ipt + 0*npts] = x43; + basis_yy_eval[ipt + 1*npts] = 
x2*x44; + basis_yy_eval[ipt + 2*npts] = x37; + basis_yy_eval[ipt + 3*npts] = x45; + basis_yy_eval[ipt + 4*npts] = x44*x6; + basis_yy_eval[ipt + 5*npts] = x46; + + // Evaluate second derivative of bfn wrt yz + basis_yz_eval[ipt + 0*npts] = radial_eval_alpha_squared*x1*x6; + basis_yz_eval[ipt + 1*npts] = x37; + basis_yz_eval[ipt + 2*npts] = x40; + basis_yz_eval[ipt + 3*npts] = x34*z; + basis_yz_eval[ipt + 4*npts] = radial_eval_alpha_squared*x5*x8 + x16 + x19; + basis_yz_eval[ipt + 5*npts] = x42*y; + + // Evaluate second derivative of bfn wrt zz + basis_zz_eval[ipt + 0*npts] = x47; + basis_zz_eval[ipt + 1*npts] = x40; + basis_zz_eval[ipt + 2*npts] = x3*x48; + basis_zz_eval[ipt + 3*npts] = x49; + basis_zz_eval[ipt + 4*npts] = x48*x6; + basis_zz_eval[ipt + 5*npts] = x50; + + // Evaluate Laplacian of bfn + basis_lapl_eval[ipt + 0*npts] = x25 + x43 + x47; + basis_lapl_eval[ipt + 1*npts] = x2*x52; + basis_lapl_eval[ipt + 2*npts] = x3*x52; + basis_lapl_eval[ipt + 3*npts] = x28 + x45 + x49; + basis_lapl_eval[ipt + 4*npts] = x52*x6; + basis_lapl_eval[ipt + 5*npts] = x30 + x46 + x50; + + // Evaluate Laplacian gradient of bfn (dx) + basis_lapl_x_eval[ipt + 0*npts] = 6.0*x*x24 + x*x55 + x*x57 + x1*x60 + 6.0*x12 + x36*x53 + x39*x53; + basis_lapl_x_eval[ipt + 1*npts] = x62*y; + basis_lapl_x_eval[ipt + 2*npts] = x62*z; + basis_lapl_x_eval[ipt + 3*npts] = x*x64 + x*x65 + x31 + x5*x60 + x5*x63; + basis_lapl_x_eval[ipt + 4*npts] = x6*(x*x54 + x*x56 + 7.0*x58 + x59); + basis_lapl_x_eval[ipt + 5*npts] = x*x66 + x*x67 + x31 + x60*x8 + x63*x8; + // Evaluate Laplacian gradient of bfn (dy) + basis_lapl_y_eval[ipt + 0*npts] = x1*x69 + x1*x73 + x33 + x57*y + x71*y; + basis_lapl_y_eval[ipt + 1*npts] = x*x75; + basis_lapl_y_eval[ipt + 2*npts] = x3*(x56*y + 7.0*x68 + x70*y + x72); + basis_lapl_y_eval[ipt + 3*npts] = 6.0*x14 + x24*x76 + 6.0*x36*y + x39*x76 + x5*x73 + x65*y + x77*y; + basis_lapl_y_eval[ipt + 4*npts] = x75*z; + basis_lapl_y_eval[ipt + 5*npts] = x33 + x67*y + x69*x8 + x73*x8 + x78*y; + // Evaluate Laplacian gradient of bfn (dz) + basis_lapl_z_eval[ipt + 0*npts] = x1*x80 + x1*x82 + x41 + x55*z + x71*z; + basis_lapl_z_eval[ipt + 1*npts] = x2*(x54*z + x70*z + 7.0*x79 + x81); + basis_lapl_z_eval[ipt + 2*npts] = x*x83; + basis_lapl_z_eval[ipt + 3*npts] = x41 + x5*x80 + x5*x82 + x64*z + x77*z; + basis_lapl_z_eval[ipt + 4*npts] = x83*y; + basis_lapl_z_eval[ipt + 5*npts] = 6.0*x18 + x24*x84 + x36*x84 + 6.0*x39*z + x66*z + x78*z + x8*x82; + + + + +#if 0 + // Evaluate the angular part of bfn + + + + double ang_eval_0; + double ang_eval_1; + double ang_eval_2; + double ang_eval_3; + + + ang_eval_0 = radial_eval*x1; + ang_eval_1 = radial_eval*x2; + ang_eval_2 = radial_eval*x3; + ang_eval_3 = radial_eval*x5; + basis_eval[ipt + 0*npts] = ang_eval_0; + basis_eval[ipt + 1*npts] = ang_eval_1; + basis_eval[ipt + 2*npts] = ang_eval_2; + basis_eval[ipt + 3*npts] = ang_eval_3; + + ang_eval_0 = radial_eval*x6; + ang_eval_1 = radial_eval*x8; + basis_eval[ipt + 4*npts] = ang_eval_0; + basis_eval[ipt + 5*npts] = ang_eval_1; + + + double dang_eval_x_0, dang_eval_y_0, dang_eval_z_0; + double dang_eval_x_1, dang_eval_y_1, dang_eval_z_1; + double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; + double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; + + dang_eval_x_0 = radial_eval_alpha*x10 + x*x9; + dang_eval_y_0 = x1*x14; + dang_eval_z_0 = x1*x18; + dang_eval_x_1 = x11*y; + dang_eval_y_1 = x*x16; + dang_eval_z_1 = x13; + dang_eval_x_2 = x11*z; + dang_eval_y_2 = x13; + dang_eval_z_2 = x*x20; + dang_eval_x_3 = x12*x5; + dang_eval_y_3 = 
radial_eval_alpha*x17 + x9*y; + dang_eval_z_3 = x18*x5; + basis_x_eval[ipt + 0*npts] = dang_eval_x_0; + basis_y_eval[ipt + 0*npts] = dang_eval_y_0; + basis_z_eval[ipt + 0*npts] = dang_eval_z_0; + basis_x_eval[ipt + 1*npts] = dang_eval_x_1; + basis_y_eval[ipt + 1*npts] = dang_eval_y_1; + basis_z_eval[ipt + 1*npts] = dang_eval_z_1; + basis_x_eval[ipt + 2*npts] = dang_eval_x_2; + basis_y_eval[ipt + 2*npts] = dang_eval_y_2; + basis_z_eval[ipt + 2*npts] = dang_eval_z_2; + basis_x_eval[ipt + 3*npts] = dang_eval_x_3; + basis_y_eval[ipt + 3*npts] = dang_eval_y_3; + basis_z_eval[ipt + 3*npts] = dang_eval_z_3; + + dang_eval_x_0 = x13; + dang_eval_y_0 = x16*z; + dang_eval_z_0 = x20*y; + dang_eval_x_1 = x12*x8; + dang_eval_y_1 = x14*x8; + dang_eval_z_1 = radial_eval_alpha*x21 + x9*z; + basis_x_eval[ipt + 4*npts] = dang_eval_x_0; + basis_y_eval[ipt + 4*npts] = dang_eval_y_0; + basis_z_eval[ipt + 4*npts] = dang_eval_z_0; + basis_x_eval[ipt + 5*npts] = dang_eval_x_1; + basis_y_eval[ipt + 5*npts] = dang_eval_y_1; + basis_z_eval[ipt + 5*npts] = dang_eval_z_1; + +#endif + } // Loop over points within task + } // Loop over tasks + + } // Loop over shells +} // end kernel + +} // namespace GauXC diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l2_laplacian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l2_laplacian.hpp index 7e3b759f..e789b5ff 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l2_laplacian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l2_laplacian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_cartesian_laplacian_2( +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_cartesian_laplacian_2( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; // Loop over points in task @@ -103,49 +106,98 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = x*x; + const auto x1 = x*y; + const auto x2 = x*z; + const auto x3 = y*y; + const auto x4 = y*z; + const auto x5 = z*z; + const auto x6 = 2.0*radial_eval; + const auto x7 = x*x*x; + const auto x8 = radial_eval + radial_eval_alpha*x0; + const auto x9 = radial_eval_alpha*x; + const auto x10 = x4*x9; + const auto x11 = radial_eval_alpha*y; + const auto x12 = radial_eval_alpha*x3; + const auto x13 = radial_eval + x12; + const auto x14 = y*y*y; + const auto x15 = radial_eval_alpha*z; + const auto x16 = radial_eval_alpha*x5; + const auto x17 = radial_eval + x16; + const auto x18 = z*z*z; + const auto x19 = 4.0*radial_eval_alpha; + const auto x20 = radial_eval_alpha_squared*x0; + const auto x21 = radial_eval_alpha + x20; + const auto x22 = x0*x19 + x0*x21 + x6; + const auto x23 = 3.0*radial_eval_alpha; + const auto x24 = x20 + x23; + const auto x25 = x21*x3; + const auto x26 = x21*x4; + const auto x27 = x21*x5; + const auto x28 = radial_eval_alpha_squared*x7 + 2.0*x9; + const auto x29 = radial_eval_alpha_squared*x14 + 2.0*x11; + const auto x30 = radial_eval_alpha_squared*x3; + const auto x31 = radial_eval_alpha + x30; + const auto x32 = x2*x31; + const auto x33 = radial_eval_alpha_squared*x5; + const auto x34 = radial_eval_alpha + x33; + const auto x35 = x1*x34; + const auto x36 = radial_eval_alpha_squared*x18 + 2.0*x15; + const auto x37 = x0*x31; + const auto x38 = x23 + x30; + const auto x39 = x19*x3 + x3*x31 + x6; + const auto x40 = x31*x5; + const auto x41 = x0*x34; + const auto x42 = x23 + x33; + const auto x43 = x3*x34; + const auto x44 = x19*x5 + x34*x5 + x6; + const auto x45 = 7.0*radial_eval_alpha + x20 + x30 + x33; + // Evaluate basis function - basis_eval[ipt + 0*npts] = radial_eval*x*x; - basis_eval[ipt + 1*npts] = radial_eval*x*y; - basis_eval[ipt + 2*npts] = radial_eval*x*z; - basis_eval[ipt + 3*npts] = radial_eval*y*y; - basis_eval[ipt + 4*npts] = radial_eval*y*z; - basis_eval[ipt + 5*npts] = radial_eval*z*z; + basis_eval[ipt + 0*npts] = radial_eval*x0; + basis_eval[ipt + 1*npts] = radial_eval*x1; + basis_eval[ipt + 2*npts] = radial_eval*x2; + basis_eval[ipt + 3*npts] = radial_eval*x3; + basis_eval[ipt + 4*npts] = radial_eval*x4; + basis_eval[ipt + 5*npts] = radial_eval*x5; // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt 
+ 0*npts] = x*(2*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 1*npts] = y*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 2*npts] = z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 3*npts] = radial_eval_alpha*x*y*y; - basis_x_eval[ipt + 4*npts] = radial_eval_alpha*x*y*z; - basis_x_eval[ipt + 5*npts] = radial_eval_alpha*x*z*z; + basis_x_eval[ipt + 0*npts] = radial_eval_alpha*x7 + x*x6; + basis_x_eval[ipt + 1*npts] = x8*y; + basis_x_eval[ipt + 2*npts] = x8*z; + basis_x_eval[ipt + 3*npts] = x3*x9; + basis_x_eval[ipt + 4*npts] = x10; + basis_x_eval[ipt + 5*npts] = x5*x9; // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = radial_eval_alpha*x*x*y; - basis_y_eval[ipt + 1*npts] = x*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 2*npts] = radial_eval_alpha*x*y*z; - basis_y_eval[ipt + 3*npts] = y*(2*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 4*npts] = z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 5*npts] = radial_eval_alpha*y*z*z; + basis_y_eval[ipt + 0*npts] = x0*x11; + basis_y_eval[ipt + 1*npts] = x*x13; + basis_y_eval[ipt + 2*npts] = x10; + basis_y_eval[ipt + 3*npts] = radial_eval_alpha*x14 + x6*y; + basis_y_eval[ipt + 4*npts] = x13*z; + basis_y_eval[ipt + 5*npts] = x11*x5; // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = radial_eval_alpha*x*x*z; - basis_z_eval[ipt + 1*npts] = radial_eval_alpha*x*y*z; - basis_z_eval[ipt + 2*npts] = x*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 3*npts] = radial_eval_alpha*y*y*z; - basis_z_eval[ipt + 4*npts] = y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 5*npts] = z*(2*radial_eval + radial_eval_alpha*z*z); + basis_z_eval[ipt + 0*npts] = x0*x15; + basis_z_eval[ipt + 1*npts] = x10; + basis_z_eval[ipt + 2*npts] = x*x17; + basis_z_eval[ipt + 3*npts] = x15*x3; + basis_z_eval[ipt + 4*npts] = x17*y; + basis_z_eval[ipt + 5*npts] = radial_eval_alpha*x18 + x6*z; + // Evaluate Laplacian of bfn - basis_lapl_eval[ipt + 0*npts] = 2*radial_eval + 7*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x + radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*x*x*z*z; - basis_lapl_eval[ipt + 1*npts] = x*y*(7*radial_eval_alpha + radial_eval_alpha_squared*x*x + radial_eval_alpha_squared*y*y + radial_eval_alpha_squared*z*z); - basis_lapl_eval[ipt + 2*npts] = x*z*(7*radial_eval_alpha + radial_eval_alpha_squared*x*x + radial_eval_alpha_squared*y*y + radial_eval_alpha_squared*z*z); - basis_lapl_eval[ipt + 3*npts] = 2*radial_eval + 7*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*y*y*y*y + radial_eval_alpha_squared*y*y*z*z; - basis_lapl_eval[ipt + 4*npts] = y*z*(7*radial_eval_alpha + radial_eval_alpha_squared*x*x + radial_eval_alpha_squared*y*y + radial_eval_alpha_squared*z*z); - basis_lapl_eval[ipt + 5*npts] = 2*radial_eval + 7*radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z + radial_eval_alpha_squared*y*y*z*z + radial_eval_alpha_squared*z*z*z*z; + basis_lapl_eval[ipt + 0*npts] = x22 + x37 + x41; + basis_lapl_eval[ipt + 1*npts] = x1*x45; + basis_lapl_eval[ipt + 2*npts] = x2*x45; + basis_lapl_eval[ipt + 3*npts] = x25 + x39 + x43; + basis_lapl_eval[ipt + 4*npts] = x4*x45; + basis_lapl_eval[ipt + 5*npts] = x27 + x40 + x44; + @@ -161,17 +213,17 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = radial_eval*x*x; - ang_eval_1 = radial_eval*x*y; - ang_eval_2 = radial_eval*x*z; - ang_eval_3 = radial_eval*y*y; + 
ang_eval_0 = radial_eval*x0; + ang_eval_1 = radial_eval*x1; + ang_eval_2 = radial_eval*x2; + ang_eval_3 = radial_eval*x3; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = radial_eval*y*z; - ang_eval_1 = radial_eval*z*z; + ang_eval_0 = radial_eval*x4; + ang_eval_1 = radial_eval*x5; basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; @@ -181,18 +233,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; - dang_eval_x_0 = x*(2*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = radial_eval_alpha*x*x*y; - dang_eval_z_0 = radial_eval_alpha*x*x*z; - dang_eval_x_1 = y*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = x*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_1 = radial_eval_alpha*x*y*z; - dang_eval_x_2 = z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_2 = radial_eval_alpha*x*y*z; - dang_eval_z_2 = x*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_3 = radial_eval_alpha*x*y*y; - dang_eval_y_3 = y*(2*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_3 = radial_eval_alpha*y*y*z; + dang_eval_x_0 = radial_eval_alpha*x7 + x*x6; + dang_eval_y_0 = x0*x11; + dang_eval_z_0 = x0*x15; + dang_eval_x_1 = x8*y; + dang_eval_y_1 = x*x13; + dang_eval_z_1 = x10; + dang_eval_x_2 = x8*z; + dang_eval_y_2 = x10; + dang_eval_z_2 = x*x17; + dang_eval_x_3 = x3*x9; + dang_eval_y_3 = radial_eval_alpha*x14 + x6*y; + dang_eval_z_3 = x15*x3; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; @@ -206,12 +258,12 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = radial_eval_alpha*x*y*z; - dang_eval_y_0 = z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = y*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_1 = radial_eval_alpha*x*z*z; - dang_eval_y_1 = radial_eval_alpha*y*z*z; - dang_eval_z_1 = z*(2*radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x10; + dang_eval_y_0 = x13*z; + dang_eval_z_0 = x17*y; + dang_eval_x_1 = x5*x9; + dang_eval_y_1 = x11*x5; + dang_eval_z_1 = radial_eval_alpha*x18 + x6*z; basis_x_eval[ipt + 4*npts] = dang_eval_x_0; basis_y_eval[ipt + 4*npts] = dang_eval_y_0; basis_z_eval[ipt + 4*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l3.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l3.hpp index 9b180257..38339b69 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l3.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l3.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -64,7 +68,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_eval = task->bf + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -93,19 +96,26 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel } - + // Common Subexpressions + const auto x0 = radial_eval*y; + const auto x1 = x*x; + const auto x2 = radial_eval*z; + const auto x3 = radial_eval*x; + const auto x4 = y*y; + const auto x5 = z*z; + // Evaluate basis function - basis_eval[ipt + 0*npts] = radial_eval*x*x*x; - basis_eval[ipt + 1*npts] = radial_eval*x*x*y; - basis_eval[ipt + 2*npts] = radial_eval*x*x*z; - basis_eval[ipt + 3*npts] = radial_eval*x*y*y; - basis_eval[ipt + 4*npts] = radial_eval*x*y*z; - basis_eval[ipt + 5*npts] = radial_eval*x*z*z; - basis_eval[ipt + 6*npts] = radial_eval*y*y*y; - basis_eval[ipt + 7*npts] = radial_eval*y*y*z; - basis_eval[ipt + 8*npts] = radial_eval*y*z*z; - basis_eval[ipt + 9*npts] = radial_eval*z*z*z; + basis_eval[ipt + 0*npts] = radial_eval*(x*x*x); + basis_eval[ipt + 1*npts] = x0*x1; + basis_eval[ipt + 2*npts] = x1*x2; + basis_eval[ipt + 3*npts] = x3*x4; + basis_eval[ipt + 4*npts] = x*x0*z; + basis_eval[ipt + 5*npts] = x3*x5; + basis_eval[ipt + 6*npts] = radial_eval*(y*y*y); + basis_eval[ipt + 7*npts] = x2*x4; + basis_eval[ipt + 8*npts] = x0*x5; + basis_eval[ipt + 9*npts] = radial_eval*(z*z*z); @@ -114,6 +124,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel + + #if 0 // Evaluate the angular part of bfn @@ -125,26 +137,26 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = radial_eval*x*x*x; - ang_eval_1 = radial_eval*x*x*y; - ang_eval_2 = radial_eval*x*x*z; - ang_eval_3 = radial_eval*x*y*y; + ang_eval_0 = radial_eval*(x*x*x); + ang_eval_1 = x0*x1; + ang_eval_2 = x1*x2; + ang_eval_3 = x3*x4; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = radial_eval*x*y*z; - ang_eval_1 = radial_eval*x*z*z; - ang_eval_2 = radial_eval*y*y*y; - ang_eval_3 = radial_eval*y*y*z; + ang_eval_0 = x*x0*z; + ang_eval_1 = x3*x5; + ang_eval_2 = radial_eval*(y*y*y); + ang_eval_3 = x2*x4; basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; basis_eval[ipt + 6*npts] = ang_eval_2; basis_eval[ipt + 7*npts] = ang_eval_3; - ang_eval_0 = radial_eval*y*z*z; - ang_eval_1 = radial_eval*z*z*z; + ang_eval_0 = x0*x5; + ang_eval_1 = radial_eval*(z*z*z); basis_eval[ipt + 8*npts] = ang_eval_0; basis_eval[ipt + 9*npts] = ang_eval_1; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l3_gradient.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l3_gradient.hpp index 459f84e9..633e1bb8 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l3_gradient.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l3_gradient.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. 
of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_cartesian_gradient_3( +__global__ __launch_bounds__(256,2) void collocation_device_shell_to_task_kernel_cartesian_gradient_3( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[8][detail::shell_nprim_max + 1]; + __shared__ double coeff[8][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -67,7 +71,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -99,57 +102,87 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; - + // Common Subexpressions + const auto x0 = x*x*x; + const auto x1 = radial_eval*y; + const auto x2 = x*x; + const auto x3 = radial_eval*z; + const auto x4 = radial_eval*x; + const auto x5 = y*y; + const auto x6 = x*z; + const auto x7 = z*z; + const auto x8 = y*y*y; + const auto x9 = z*z*z; + const auto x10 = 3.0*radial_eval; + const auto x11 = radial_eval_alpha*x0 + 2.0*x4; + const auto x12 = radial_eval*x5; + const auto x13 = radial_eval_alpha*x2*x5; + const auto x14 = y*z; + const auto x15 = radial_eval*x7; + const auto x16 = radial_eval_alpha*x2*x7; + const auto x17 = radial_eval_alpha*x; + const auto x18 = x17*x5*z; + const auto x19 = x17*x7*y; + const auto x20 = radial_eval_alpha*y; + const auto x21 = radial_eval*x2; + const auto x22 = radial_eval_alpha*x14*x2; + const auto x23 = radial_eval_alpha*x8 + 2.0*x1; + const auto x24 = radial_eval_alpha*x5*x7; + const auto x25 = radial_eval_alpha*z; + const auto x26 = radial_eval_alpha*x9 + 2.0*x3; + // Evaluate basis function - basis_eval[ipt + 0*npts] = radial_eval*x*x*x; - basis_eval[ipt + 1*npts] = radial_eval*x*x*y; - basis_eval[ipt + 2*npts] = radial_eval*x*x*z; - basis_eval[ipt + 3*npts] = radial_eval*x*y*y; - basis_eval[ipt + 4*npts] = radial_eval*x*y*z; - basis_eval[ipt + 5*npts] = radial_eval*x*z*z; - basis_eval[ipt + 6*npts] = radial_eval*y*y*y; - basis_eval[ipt + 7*npts] = radial_eval*y*y*z; - basis_eval[ipt + 8*npts] = radial_eval*y*z*z; - basis_eval[ipt + 9*npts] = radial_eval*z*z*z; + basis_eval[ipt + 0*npts] = radial_eval*x0; + basis_eval[ipt + 1*npts] = x1*x2; + basis_eval[ipt + 2*npts] = x2*x3; + basis_eval[ipt + 3*npts] = x4*x5; + basis_eval[ipt + 4*npts] = x1*x6; + basis_eval[ipt + 5*npts] = x4*x7; + basis_eval[ipt + 6*npts] = radial_eval*x8; + basis_eval[ipt + 7*npts] = x3*x5; + basis_eval[ipt + 8*npts] = x1*x7; + basis_eval[ipt + 9*npts] = radial_eval*x9; // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = x*x*(3*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 1*npts] = x*y*(2*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 2*npts] = x*z*(2*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 3*npts] = y*y*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 4*npts] = 
y*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 5*npts] = z*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 6*npts] = radial_eval_alpha*x*y*y*y; - basis_x_eval[ipt + 7*npts] = radial_eval_alpha*x*y*y*z; - basis_x_eval[ipt + 8*npts] = radial_eval_alpha*x*y*z*z; - basis_x_eval[ipt + 9*npts] = radial_eval_alpha*x*z*z*z; + basis_x_eval[ipt + 0*npts] = radial_eval_alpha*(x*x*x*x) + x10*x2; + basis_x_eval[ipt + 1*npts] = x11*y; + basis_x_eval[ipt + 2*npts] = x11*z; + basis_x_eval[ipt + 3*npts] = x12 + x13; + basis_x_eval[ipt + 4*npts] = x14*(radial_eval + radial_eval_alpha*x2); + basis_x_eval[ipt + 5*npts] = x15 + x16; + basis_x_eval[ipt + 6*npts] = x17*x8; + basis_x_eval[ipt + 7*npts] = x18; + basis_x_eval[ipt + 8*npts] = x19; + basis_x_eval[ipt + 9*npts] = x17*x9; // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = radial_eval_alpha*x*x*x*y; - basis_y_eval[ipt + 1*npts] = x*x*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 2*npts] = radial_eval_alpha*x*x*y*z; - basis_y_eval[ipt + 3*npts] = x*y*(2*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 4*npts] = x*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 5*npts] = radial_eval_alpha*x*y*z*z; - basis_y_eval[ipt + 6*npts] = y*y*(3*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 7*npts] = y*z*(2*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 8*npts] = z*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 9*npts] = radial_eval_alpha*y*z*z*z; + basis_y_eval[ipt + 0*npts] = x0*x20; + basis_y_eval[ipt + 1*npts] = x13 + x21; + basis_y_eval[ipt + 2*npts] = x22; + basis_y_eval[ipt + 3*npts] = x*x23; + basis_y_eval[ipt + 4*npts] = x6*(radial_eval + radial_eval_alpha*x5); + basis_y_eval[ipt + 5*npts] = x19; + basis_y_eval[ipt + 6*npts] = radial_eval_alpha*(y*y*y*y) + x10*x5; + basis_y_eval[ipt + 7*npts] = x23*z; + basis_y_eval[ipt + 8*npts] = x15 + x24; + basis_y_eval[ipt + 9*npts] = x20*x9; // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = radial_eval_alpha*x*x*x*z; - basis_z_eval[ipt + 1*npts] = radial_eval_alpha*x*x*y*z; - basis_z_eval[ipt + 2*npts] = x*x*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 3*npts] = radial_eval_alpha*x*y*y*z; - basis_z_eval[ipt + 4*npts] = x*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 5*npts] = x*z*(2*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 6*npts] = radial_eval_alpha*y*y*y*z; - basis_z_eval[ipt + 7*npts] = y*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 8*npts] = y*z*(2*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 9*npts] = z*z*(3*radial_eval + radial_eval_alpha*z*z); + basis_z_eval[ipt + 0*npts] = x0*x25; + basis_z_eval[ipt + 1*npts] = x22; + basis_z_eval[ipt + 2*npts] = x16 + x21; + basis_z_eval[ipt + 3*npts] = x18; + basis_z_eval[ipt + 4*npts] = x*y*(radial_eval + radial_eval_alpha*x7); + basis_z_eval[ipt + 5*npts] = x*x26; + basis_z_eval[ipt + 6*npts] = x25*x8; + basis_z_eval[ipt + 7*npts] = x12 + x24; + basis_z_eval[ipt + 8*npts] = x26*y; + basis_z_eval[ipt + 9*npts] = radial_eval_alpha*(z*z*z*z) + x10*x7; + + @@ -166,26 +199,26 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = radial_eval*x*x*x; - ang_eval_1 = radial_eval*x*x*y; - ang_eval_2 = radial_eval*x*x*z; - ang_eval_3 = radial_eval*x*y*y; + ang_eval_0 = radial_eval*x0; + ang_eval_1 = x1*x2; + ang_eval_2 = x2*x3; + ang_eval_3 = x4*x5; basis_eval[ipt + 0*npts] = ang_eval_0; 
basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = radial_eval*x*y*z; - ang_eval_1 = radial_eval*x*z*z; - ang_eval_2 = radial_eval*y*y*y; - ang_eval_3 = radial_eval*y*y*z; + ang_eval_0 = x1*x6; + ang_eval_1 = x4*x7; + ang_eval_2 = radial_eval*x8; + ang_eval_3 = x3*x5; basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; basis_eval[ipt + 6*npts] = ang_eval_2; basis_eval[ipt + 7*npts] = ang_eval_3; - ang_eval_0 = radial_eval*y*z*z; - ang_eval_1 = radial_eval*z*z*z; + ang_eval_0 = x1*x7; + ang_eval_1 = radial_eval*x9; basis_eval[ipt + 8*npts] = ang_eval_0; basis_eval[ipt + 9*npts] = ang_eval_1; @@ -195,18 +228,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; - dang_eval_x_0 = x*x*(3*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = radial_eval_alpha*x*x*x*y; - dang_eval_z_0 = radial_eval_alpha*x*x*x*z; - dang_eval_x_1 = x*y*(2*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = x*x*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_1 = radial_eval_alpha*x*x*y*z; - dang_eval_x_2 = x*z*(2*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_2 = radial_eval_alpha*x*x*y*z; - dang_eval_z_2 = x*x*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_3 = y*y*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_3 = x*y*(2*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_3 = radial_eval_alpha*x*y*y*z; + dang_eval_x_0 = radial_eval_alpha*(x*x*x*x) + x10*x2; + dang_eval_y_0 = x0*x20; + dang_eval_z_0 = x0*x25; + dang_eval_x_1 = x11*y; + dang_eval_y_1 = x13 + x21; + dang_eval_z_1 = x22; + dang_eval_x_2 = x11*z; + dang_eval_y_2 = x22; + dang_eval_z_2 = x16 + x21; + dang_eval_x_3 = x12 + x13; + dang_eval_y_3 = x*x23; + dang_eval_z_3 = x18; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; @@ -220,18 +253,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = y*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = x*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = x*y*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_1 = z*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = radial_eval_alpha*x*y*z*z; - dang_eval_z_1 = x*z*(2*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = radial_eval_alpha*x*y*y*y; - dang_eval_y_2 = y*y*(3*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_2 = radial_eval_alpha*y*y*y*z; - dang_eval_x_3 = radial_eval_alpha*x*y*y*z; - dang_eval_y_3 = y*z*(2*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_3 = y*y*(radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x14*(radial_eval + radial_eval_alpha*x2); + dang_eval_y_0 = x6*(radial_eval + radial_eval_alpha*x5); + dang_eval_z_0 = x*y*(radial_eval + radial_eval_alpha*x7); + dang_eval_x_1 = x15 + x16; + dang_eval_y_1 = x19; + dang_eval_z_1 = x*x26; + dang_eval_x_2 = x17*x8; + dang_eval_y_2 = radial_eval_alpha*(y*y*y*y) + x10*x5; + dang_eval_z_2 = x25*x8; + dang_eval_x_3 = x18; + dang_eval_y_3 = x23*z; + dang_eval_z_3 = x12 + x24; basis_x_eval[ipt + 4*npts] = dang_eval_x_0; basis_y_eval[ipt + 4*npts] = dang_eval_y_0; basis_z_eval[ipt + 4*npts] = dang_eval_z_0; @@ -245,12 +278,12 @@ __global__ __launch_bounds__(512,2) void 
collocation_device_shell_to_task_kernel basis_y_eval[ipt + 7*npts] = dang_eval_y_3; basis_z_eval[ipt + 7*npts] = dang_eval_z_3; - dang_eval_x_0 = radial_eval_alpha*x*y*z*z; - dang_eval_y_0 = z*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = y*z*(2*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_1 = radial_eval_alpha*x*z*z*z; - dang_eval_y_1 = radial_eval_alpha*y*z*z*z; - dang_eval_z_1 = z*z*(3*radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x19; + dang_eval_y_0 = x15 + x24; + dang_eval_z_0 = x26*y; + dang_eval_x_1 = x17*x9; + dang_eval_y_1 = x20*x9; + dang_eval_z_1 = radial_eval_alpha*(z*z*z*z) + x10*x7; basis_x_eval[ipt + 8*npts] = dang_eval_x_0; basis_y_eval[ipt + 8*npts] = dang_eval_y_0; basis_z_eval[ipt + 8*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l3_hessian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l3_hessian.hpp index 31178f04..6ce4a6c3 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l3_hessian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l3_hessian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_cartesian_hessian_3( +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_cartesian_hessian_3( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; @@ -108,129 +111,225 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = x*x*x; + const auto x1 = radial_eval*y; + const auto x2 = x*x; + const auto x3 = radial_eval*z; + const auto x4 = radial_eval*x; + const auto x5 = y*y; + const auto x6 = x*z; + const auto x7 = z*z; + const auto x8 = y*y*y; + const auto x9 = z*z*z; + const auto x10 = x*x*x*x; + const auto x11 = 3.0*radial_eval; + const auto x12 = radial_eval_alpha*x0 + 2.0*x4; + const auto x13 = radial_eval*x5; + const auto x14 = x2*x5; + const auto x15 = radial_eval_alpha*x14; + const auto x16 = 
y*z; + const auto x17 = radial_eval_alpha*x2; + const auto x18 = radial_eval + x17; + const auto x19 = radial_eval*x7; + const auto x20 = x2*x7; + const auto x21 = radial_eval_alpha*x20; + const auto x22 = radial_eval_alpha*x; + const auto x23 = x22*x5*z; + const auto x24 = x22*x7*y; + const auto x25 = radial_eval_alpha*y; + const auto x26 = radial_eval*x2; + const auto x27 = radial_eval_alpha*x16*x2; + const auto x28 = radial_eval_alpha*x8 + 2.0*x1; + const auto x29 = radial_eval_alpha*x5; + const auto x30 = radial_eval + x29; + const auto x31 = y*y*y*y; + const auto x32 = x5*x7; + const auto x33 = radial_eval_alpha*x32; + const auto x34 = radial_eval_alpha*z; + const auto x35 = x*y; + const auto x36 = radial_eval_alpha*x7; + const auto x37 = radial_eval_alpha*x9 + 2.0*x3; + const auto x38 = z*z*z*z; + const auto x39 = 6.0*radial_eval_alpha; + const auto x40 = radial_eval_alpha_squared*x2; + const auto x41 = radial_eval_alpha + x40; + const auto x42 = x0*x39 + x0*x41 + 6.0*x4; + const auto x43 = 4.0*radial_eval_alpha; + const auto x44 = 2.0*radial_eval; + const auto x45 = x2*x41 + x44; + const auto x46 = x2*x43 + x45; + const auto x47 = 2.0*radial_eval_alpha; + const auto x48 = x47*x5; + const auto x49 = x41*x5; + const auto x50 = x*x16; + const auto x51 = 3.0*radial_eval_alpha; + const auto x52 = x47*x7; + const auto x53 = x41*x7; + const auto x54 = x41*x8; + const auto x55 = x41*x9; + const auto x56 = radial_eval_alpha_squared*x10 + x2*x51; + const auto x57 = 2.0*x22; + const auto x58 = x16*(radial_eval_alpha_squared*x0 + x57); + const auto x59 = 2.0*x25; + const auto x60 = radial_eval_alpha_squared*x14; + const auto x61 = x29 + x60; + const auto x62 = radial_eval_alpha_squared*x20; + const auto x63 = x36 + x62; + const auto x64 = radial_eval_alpha_squared*x31 + x5*x51; + const auto x65 = x6*(radial_eval_alpha_squared*x8 + x59); + const auto x66 = radial_eval_alpha_squared*x32; + const auto x67 = x36 + x66; + const auto x68 = 2.0*x34; + const auto x69 = x35*(radial_eval_alpha_squared*x9 + x68); + const auto x70 = radial_eval_alpha_squared*x38 + x51*x7; + const auto x71 = radial_eval_alpha_squared*x5; + const auto x72 = radial_eval_alpha + x71; + const auto x73 = x0*x72; + const auto x74 = x2*x47; + const auto x75 = x2*x72; + const auto x76 = x44 + x5*x72; + const auto x77 = x43*x5 + x76; + const auto x78 = x7*x72; + const auto x79 = 6.0*x1 + x39*x8 + x72*x8; + const auto x80 = x72*x9; + const auto x81 = radial_eval_alpha_squared*x7; + const auto x82 = radial_eval_alpha + x81; + const auto x83 = x0*x82; + const auto x84 = x2*x82; + const auto x85 = x5*x82; + const auto x86 = x44 + x7*x82; + const auto x87 = x43*x7 + x86; + const auto x88 = x8*x82; + const auto x89 = 6.0*x3 + x39*x9 + x82*x9; + const auto x90 = x2*x39 + x45 + x75 + x84; + const auto x91 = x39*x5 + x49 + x76 + x85; + const auto x92 = x39*x7 + x53 + x78 + x86; + // Evaluate basis function - basis_eval[ipt + 0*npts] = radial_eval*x*x*x; - basis_eval[ipt + 1*npts] = radial_eval*x*x*y; - basis_eval[ipt + 2*npts] = radial_eval*x*x*z; - basis_eval[ipt + 3*npts] = radial_eval*x*y*y; - basis_eval[ipt + 4*npts] = radial_eval*x*y*z; - basis_eval[ipt + 5*npts] = radial_eval*x*z*z; - basis_eval[ipt + 6*npts] = radial_eval*y*y*y; - basis_eval[ipt + 7*npts] = radial_eval*y*y*z; - basis_eval[ipt + 8*npts] = radial_eval*y*z*z; - basis_eval[ipt + 9*npts] = radial_eval*z*z*z; + basis_eval[ipt + 0*npts] = radial_eval*x0; + basis_eval[ipt + 1*npts] = x1*x2; + basis_eval[ipt + 2*npts] = x2*x3; + basis_eval[ipt + 3*npts] = x4*x5; + 
basis_eval[ipt + 4*npts] = x1*x6; + basis_eval[ipt + 5*npts] = x4*x7; + basis_eval[ipt + 6*npts] = radial_eval*x8; + basis_eval[ipt + 7*npts] = x3*x5; + basis_eval[ipt + 8*npts] = x1*x7; + basis_eval[ipt + 9*npts] = radial_eval*x9; // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = x*x*(3*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 1*npts] = x*y*(2*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 2*npts] = x*z*(2*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 3*npts] = y*y*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 4*npts] = y*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 5*npts] = z*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 6*npts] = radial_eval_alpha*x*y*y*y; - basis_x_eval[ipt + 7*npts] = radial_eval_alpha*x*y*y*z; - basis_x_eval[ipt + 8*npts] = radial_eval_alpha*x*y*z*z; - basis_x_eval[ipt + 9*npts] = radial_eval_alpha*x*z*z*z; + basis_x_eval[ipt + 0*npts] = radial_eval_alpha*x10 + x11*x2; + basis_x_eval[ipt + 1*npts] = x12*y; + basis_x_eval[ipt + 2*npts] = x12*z; + basis_x_eval[ipt + 3*npts] = x13 + x15; + basis_x_eval[ipt + 4*npts] = x16*x18; + basis_x_eval[ipt + 5*npts] = x19 + x21; + basis_x_eval[ipt + 6*npts] = x22*x8; + basis_x_eval[ipt + 7*npts] = x23; + basis_x_eval[ipt + 8*npts] = x24; + basis_x_eval[ipt + 9*npts] = x22*x9; // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = radial_eval_alpha*x*x*x*y; - basis_y_eval[ipt + 1*npts] = x*x*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 2*npts] = radial_eval_alpha*x*x*y*z; - basis_y_eval[ipt + 3*npts] = x*y*(2*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 4*npts] = x*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 5*npts] = radial_eval_alpha*x*y*z*z; - basis_y_eval[ipt + 6*npts] = y*y*(3*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 7*npts] = y*z*(2*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 8*npts] = z*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 9*npts] = radial_eval_alpha*y*z*z*z; + basis_y_eval[ipt + 0*npts] = x0*x25; + basis_y_eval[ipt + 1*npts] = x15 + x26; + basis_y_eval[ipt + 2*npts] = x27; + basis_y_eval[ipt + 3*npts] = x*x28; + basis_y_eval[ipt + 4*npts] = x30*x6; + basis_y_eval[ipt + 5*npts] = x24; + basis_y_eval[ipt + 6*npts] = radial_eval_alpha*x31 + x11*x5; + basis_y_eval[ipt + 7*npts] = x28*z; + basis_y_eval[ipt + 8*npts] = x19 + x33; + basis_y_eval[ipt + 9*npts] = x25*x9; // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = radial_eval_alpha*x*x*x*z; - basis_z_eval[ipt + 1*npts] = radial_eval_alpha*x*x*y*z; - basis_z_eval[ipt + 2*npts] = x*x*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 3*npts] = radial_eval_alpha*x*y*y*z; - basis_z_eval[ipt + 4*npts] = x*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 5*npts] = x*z*(2*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 6*npts] = radial_eval_alpha*y*y*y*z; - basis_z_eval[ipt + 7*npts] = y*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 8*npts] = y*z*(2*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 9*npts] = z*z*(3*radial_eval + radial_eval_alpha*z*z); + basis_z_eval[ipt + 0*npts] = x0*x34; + basis_z_eval[ipt + 1*npts] = x27; + basis_z_eval[ipt + 2*npts] = x21 + x26; + basis_z_eval[ipt + 3*npts] = x23; + basis_z_eval[ipt + 4*npts] = x35*(radial_eval + x36); + basis_z_eval[ipt + 5*npts] = x*x37; + basis_z_eval[ipt + 6*npts] = x34*x8; + basis_z_eval[ipt + 7*npts] = x13 + 
x33; + basis_z_eval[ipt + 8*npts] = x37*y; + basis_z_eval[ipt + 9*npts] = radial_eval_alpha*x38 + x11*x7; // Evaluate second derivative of bfn wrt xx - basis_xx_eval[ipt + 0*npts] = x*(6*radial_eval + 7*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x); - basis_xx_eval[ipt + 1*npts] = y*(2*radial_eval + 5*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x); - basis_xx_eval[ipt + 2*npts] = z*(2*radial_eval + 5*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x); - basis_xx_eval[ipt + 3*npts] = x*y*y*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 4*npts] = x*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 5*npts] = x*z*z*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 6*npts] = y*y*y*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 7*npts] = y*y*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 8*npts] = y*z*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 9*npts] = z*z*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xx_eval[ipt + 0*npts] = x42; + basis_xx_eval[ipt + 1*npts] = x46*y; + basis_xx_eval[ipt + 2*npts] = x46*z; + basis_xx_eval[ipt + 3*npts] = x*(x48 + x49); + basis_xx_eval[ipt + 4*npts] = x50*(x40 + x51); + basis_xx_eval[ipt + 5*npts] = x*(x52 + x53); + basis_xx_eval[ipt + 6*npts] = x54; + basis_xx_eval[ipt + 7*npts] = x49*z; + basis_xx_eval[ipt + 8*npts] = x53*y; + basis_xx_eval[ipt + 9*npts] = x55; // Evaluate second derivative of bfn wrt xy - basis_xy_eval[ipt + 0*npts] = x*x*y*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xy_eval[ipt + 1*npts] = x*(2*radial_eval + radial_eval_alpha*x*x + 2*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y); - basis_xy_eval[ipt + 2*npts] = x*y*z*(2*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xy_eval[ipt + 3*npts] = y*(2*radial_eval + 2*radial_eval_alpha*x*x + radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y); - basis_xy_eval[ipt + 4*npts] = z*(radial_eval + radial_eval_alpha*x*x + radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y); - basis_xy_eval[ipt + 5*npts] = y*z*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xy_eval[ipt + 6*npts] = x*y*y*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_xy_eval[ipt + 7*npts] = x*y*z*(2*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_xy_eval[ipt + 8*npts] = x*z*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_xy_eval[ipt + 9*npts] = radial_eval_alpha_squared*x*y*z*z*z; + basis_xy_eval[ipt + 0*npts] = x56*y; + basis_xy_eval[ipt + 1*npts] = radial_eval_alpha_squared*x0*x5 + x12 + x5*x57; + basis_xy_eval[ipt + 2*npts] = x58; + basis_xy_eval[ipt + 3*npts] = radial_eval_alpha_squared*x2*x8 + x2*x59 + x28; + basis_xy_eval[ipt + 4*npts] = z*(x18 + x61); + basis_xy_eval[ipt + 5*npts] = x63*y; + basis_xy_eval[ipt + 6*npts] = x*x64; + basis_xy_eval[ipt + 7*npts] = x65; + basis_xy_eval[ipt + 8*npts] = x*x67; + basis_xy_eval[ipt + 9*npts] = radial_eval_alpha_squared*x35*x9; // Evaluate second derivative of bfn wrt xz - basis_xz_eval[ipt + 0*npts] = x*x*z*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xz_eval[ipt + 1*npts] = x*y*z*(2*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xz_eval[ipt + 2*npts] = x*(2*radial_eval + radial_eval_alpha*x*x + 2*radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z); - basis_xz_eval[ipt + 3*npts] = y*y*z*(radial_eval_alpha + 
radial_eval_alpha_squared*x*x); - basis_xz_eval[ipt + 4*npts] = y*(radial_eval + radial_eval_alpha*x*x + radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z); - basis_xz_eval[ipt + 5*npts] = z*(2*radial_eval + 2*radial_eval_alpha*x*x + radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z); - basis_xz_eval[ipt + 6*npts] = radial_eval_alpha_squared*x*y*y*y*z; - basis_xz_eval[ipt + 7*npts] = x*y*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_xz_eval[ipt + 8*npts] = x*y*z*(2*radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_xz_eval[ipt + 9*npts] = x*z*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_xz_eval[ipt + 0*npts] = x56*z; + basis_xz_eval[ipt + 1*npts] = x58; + basis_xz_eval[ipt + 2*npts] = radial_eval_alpha_squared*x0*x7 + x12 + x57*x7; + basis_xz_eval[ipt + 3*npts] = x61*z; + basis_xz_eval[ipt + 4*npts] = y*(x18 + x63); + basis_xz_eval[ipt + 5*npts] = radial_eval_alpha_squared*x2*x9 + x2*x68 + x37; + basis_xz_eval[ipt + 6*npts] = radial_eval_alpha_squared*x6*x8; + basis_xz_eval[ipt + 7*npts] = x*(x29 + x66); + basis_xz_eval[ipt + 8*npts] = x69; + basis_xz_eval[ipt + 9*npts] = x*x70; // Evaluate second derivative of bfn wrt yy - basis_yy_eval[ipt + 0*npts] = x*x*x*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 1*npts] = x*x*y*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 2*npts] = x*x*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 3*npts] = x*(2*radial_eval + 5*radial_eval_alpha*y*y + radial_eval_alpha_squared*y*y*y*y); - basis_yy_eval[ipt + 4*npts] = x*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 5*npts] = x*z*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 6*npts] = y*(6*radial_eval + 7*radial_eval_alpha*y*y + radial_eval_alpha_squared*y*y*y*y); - basis_yy_eval[ipt + 7*npts] = z*(2*radial_eval + 5*radial_eval_alpha*y*y + radial_eval_alpha_squared*y*y*y*y); - basis_yy_eval[ipt + 8*npts] = y*z*z*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 9*npts] = z*z*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_yy_eval[ipt + 0*npts] = x73; + basis_yy_eval[ipt + 1*npts] = y*(x74 + x75); + basis_yy_eval[ipt + 2*npts] = x75*z; + basis_yy_eval[ipt + 3*npts] = x*x77; + basis_yy_eval[ipt + 4*npts] = x50*(x51 + x71); + basis_yy_eval[ipt + 5*npts] = x*x78; + basis_yy_eval[ipt + 6*npts] = x79; + basis_yy_eval[ipt + 7*npts] = x77*z; + basis_yy_eval[ipt + 8*npts] = y*(x52 + x78); + basis_yy_eval[ipt + 9*npts] = x80; // Evaluate second derivative of bfn wrt yz - basis_yz_eval[ipt + 0*npts] = radial_eval_alpha_squared*x*x*x*y*z; - basis_yz_eval[ipt + 1*npts] = x*x*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yz_eval[ipt + 2*npts] = x*x*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_yz_eval[ipt + 3*npts] = x*y*z*(2*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yz_eval[ipt + 4*npts] = x*(radial_eval + radial_eval_alpha*y*y + radial_eval_alpha*z*z + radial_eval_alpha_squared*y*y*z*z); - basis_yz_eval[ipt + 5*npts] = x*y*z*(2*radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_yz_eval[ipt + 6*npts] = y*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yz_eval[ipt + 7*npts] = y*(2*radial_eval + radial_eval_alpha*y*y + 2*radial_eval_alpha*z*z + radial_eval_alpha_squared*y*y*z*z); - basis_yz_eval[ipt + 8*npts] = z*(2*radial_eval + 2*radial_eval_alpha*y*y + radial_eval_alpha*z*z + 
radial_eval_alpha_squared*y*y*z*z); - basis_yz_eval[ipt + 9*npts] = y*z*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_yz_eval[ipt + 0*npts] = radial_eval_alpha_squared*x0*x16; + basis_yz_eval[ipt + 1*npts] = z*(x17 + x60); + basis_yz_eval[ipt + 2*npts] = y*(x17 + x62); + basis_yz_eval[ipt + 3*npts] = x65; + basis_yz_eval[ipt + 4*npts] = x*(x30 + x67); + basis_yz_eval[ipt + 5*npts] = x69; + basis_yz_eval[ipt + 6*npts] = x64*z; + basis_yz_eval[ipt + 7*npts] = radial_eval_alpha_squared*x7*x8 + x28 + x59*x7; + basis_yz_eval[ipt + 8*npts] = radial_eval_alpha_squared*x5*x9 + x37 + x5*x68; + basis_yz_eval[ipt + 9*npts] = x70*y; // Evaluate second derivative of bfn wrt zz - basis_zz_eval[ipt + 0*npts] = x*x*x*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 1*npts] = x*x*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 2*npts] = x*x*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 3*npts] = x*y*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 4*npts] = x*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 5*npts] = x*(2*radial_eval + 5*radial_eval_alpha*z*z + radial_eval_alpha_squared*z*z*z*z); - basis_zz_eval[ipt + 6*npts] = y*y*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 7*npts] = y*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 8*npts] = y*(2*radial_eval + 5*radial_eval_alpha*z*z + radial_eval_alpha_squared*z*z*z*z); - basis_zz_eval[ipt + 9*npts] = z*(6*radial_eval + 7*radial_eval_alpha*z*z + radial_eval_alpha_squared*z*z*z*z); + basis_zz_eval[ipt + 0*npts] = x83; + basis_zz_eval[ipt + 1*npts] = x84*y; + basis_zz_eval[ipt + 2*npts] = z*(x74 + x84); + basis_zz_eval[ipt + 3*npts] = x*x85; + basis_zz_eval[ipt + 4*npts] = x50*(x51 + x81); + basis_zz_eval[ipt + 5*npts] = x*x87; + basis_zz_eval[ipt + 6*npts] = x88; + basis_zz_eval[ipt + 7*npts] = z*(x48 + x85); + basis_zz_eval[ipt + 8*npts] = x87*y; + basis_zz_eval[ipt + 9*npts] = x89; + + @@ -246,26 +345,26 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = radial_eval*x*x*x; - ang_eval_1 = radial_eval*x*x*y; - ang_eval_2 = radial_eval*x*x*z; - ang_eval_3 = radial_eval*x*y*y; + ang_eval_0 = radial_eval*x0; + ang_eval_1 = x1*x2; + ang_eval_2 = x2*x3; + ang_eval_3 = x4*x5; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = radial_eval*x*y*z; - ang_eval_1 = radial_eval*x*z*z; - ang_eval_2 = radial_eval*y*y*y; - ang_eval_3 = radial_eval*y*y*z; + ang_eval_0 = x1*x6; + ang_eval_1 = x4*x7; + ang_eval_2 = radial_eval*x8; + ang_eval_3 = x3*x5; basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; basis_eval[ipt + 6*npts] = ang_eval_2; basis_eval[ipt + 7*npts] = ang_eval_3; - ang_eval_0 = radial_eval*y*z*z; - ang_eval_1 = radial_eval*z*z*z; + ang_eval_0 = x1*x7; + ang_eval_1 = radial_eval*x9; basis_eval[ipt + 8*npts] = ang_eval_0; basis_eval[ipt + 9*npts] = ang_eval_1; @@ -275,18 +374,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; - dang_eval_x_0 = x*x*(3*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = radial_eval_alpha*x*x*x*y; - dang_eval_z_0 = radial_eval_alpha*x*x*x*z; - 
dang_eval_x_1 = x*y*(2*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = x*x*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_1 = radial_eval_alpha*x*x*y*z; - dang_eval_x_2 = x*z*(2*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_2 = radial_eval_alpha*x*x*y*z; - dang_eval_z_2 = x*x*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_3 = y*y*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_3 = x*y*(2*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_3 = radial_eval_alpha*x*y*y*z; + dang_eval_x_0 = radial_eval_alpha*x10 + x11*x2; + dang_eval_y_0 = x0*x25; + dang_eval_z_0 = x0*x34; + dang_eval_x_1 = x12*y; + dang_eval_y_1 = x15 + x26; + dang_eval_z_1 = x27; + dang_eval_x_2 = x12*z; + dang_eval_y_2 = x27; + dang_eval_z_2 = x21 + x26; + dang_eval_x_3 = x13 + x15; + dang_eval_y_3 = x*x28; + dang_eval_z_3 = x23; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; @@ -300,18 +399,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = y*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = x*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = x*y*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_1 = z*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = radial_eval_alpha*x*y*z*z; - dang_eval_z_1 = x*z*(2*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = radial_eval_alpha*x*y*y*y; - dang_eval_y_2 = y*y*(3*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_2 = radial_eval_alpha*y*y*y*z; - dang_eval_x_3 = radial_eval_alpha*x*y*y*z; - dang_eval_y_3 = y*z*(2*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_3 = y*y*(radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x16*x18; + dang_eval_y_0 = x30*x6; + dang_eval_z_0 = x35*(radial_eval + x36); + dang_eval_x_1 = x19 + x21; + dang_eval_y_1 = x24; + dang_eval_z_1 = x*x37; + dang_eval_x_2 = x22*x8; + dang_eval_y_2 = radial_eval_alpha*x31 + x11*x5; + dang_eval_z_2 = x34*x8; + dang_eval_x_3 = x23; + dang_eval_y_3 = x28*z; + dang_eval_z_3 = x13 + x33; basis_x_eval[ipt + 4*npts] = dang_eval_x_0; basis_y_eval[ipt + 4*npts] = dang_eval_y_0; basis_z_eval[ipt + 4*npts] = dang_eval_z_0; @@ -325,12 +424,12 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 7*npts] = dang_eval_y_3; basis_z_eval[ipt + 7*npts] = dang_eval_z_3; - dang_eval_x_0 = radial_eval_alpha*x*y*z*z; - dang_eval_y_0 = z*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = y*z*(2*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_1 = radial_eval_alpha*x*z*z*z; - dang_eval_y_1 = radial_eval_alpha*y*z*z*z; - dang_eval_z_1 = z*z*(3*radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x24; + dang_eval_y_0 = x19 + x33; + dang_eval_z_0 = x37*y; + dang_eval_x_1 = x22*x9; + dang_eval_y_1 = x25*x9; + dang_eval_z_1 = radial_eval_alpha*x38 + x11*x7; basis_x_eval[ipt + 8*npts] = dang_eval_x_0; basis_y_eval[ipt + 8*npts] = dang_eval_y_0; basis_z_eval[ipt + 8*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l3_lapgrad.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l3_lapgrad.hpp new file mode 100644 index 00000000..ebeee17b --- /dev/null +++ 
b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l3_lapgrad.hpp @@ -0,0 +1,565 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#pragma once +#include "collocation_device_constants.hpp" +#include "device/xc_device_task.hpp" +#include "device_specific/cuda_device_constants.hpp" +#include "device/common/shell_to_task.hpp" +#include + +namespace GauXC { + + +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_cartesian_lapgrad_3( + uint32_t nshell, + ShellToTaskDevice* __restrict__ shell_to_task, + XCDeviceTask* __restrict__ device_tasks +) { + + + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; + double* my_alpha = alpha[threadIdx.x/32]; + double* my_coeff = coeff[threadIdx.x/32]; + + for( auto ish = blockIdx.z; ish < nshell; ish += gridDim.z ) { + const uint32_t ntasks = shell_to_task[ish].ntask; + const auto shell = shell_to_task[ish].shell_device; + const auto task_idx = shell_to_task[ish].task_idx_device; + const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device; + + + // Load Shell Data into registers / SM + const uint32_t nprim = shell->nprim(); + const double3 O = *reinterpret_cast<const double3*>(shell->O_data()); + + const int global_warp_id = (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size; + const int nwarp_global = max((blockDim.x*gridDim.x) / cuda::warp_size,1); + + // Read in coeffs/exps into SM on first warp + { + auto* coeff_gm = shell->coeff_data(); + auto* alpha_gm = shell->alpha_data(); + static_assert( detail::shell_nprim_max == cuda::warp_size ); + const int warp_rank = threadIdx.x % cuda::warp_size; + my_alpha[warp_rank] = alpha_gm[warp_rank]; + my_coeff[warp_rank] = coeff_gm[warp_rank]; + } + + // Loop over tasks assigned to shells + // Place each task on a different warp + schedule across blocks + for( int itask = global_warp_id; itask < ntasks; itask += nwarp_global ) { + + const auto* task = device_tasks + task_idx[itask]; + const auto* __restrict__ points_x = task->points_x; + const auto* __restrict__ points_y = task->points_y; + const auto* __restrict__ points_z = task->points_z; + const uint32_t npts = task->npts; + const size_t shoff = task_shell_offs[itask] * npts; + + auto* __restrict__ basis_eval = task->bf + shoff; + auto* __restrict__ basis_x_eval = task->dbfx + shoff; + auto* __restrict__ basis_y_eval = task->dbfy + shoff; + auto* __restrict__ basis_z_eval = task->dbfz + shoff; + auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; + auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; + auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; + auto* __restrict__ basis_yy_eval = task->d2bfyy + shoff; + auto* __restrict__ basis_yz_eval = task->d2bfyz + shoff; + auto* __restrict__ basis_zz_eval = task->d2bfzz + shoff; + auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; + auto* __restrict__ basis_lapl_x_eval = task->d3bflapl_x + shoff; + auto* __restrict__ basis_lapl_y_eval = task->d3bflapl_y + shoff; + auto* __restrict__ basis_lapl_z_eval = task->d3bflapl_z + shoff; + + // Loop over points in task + // Assign each point to separate thread within the warp + #pragma unroll 1 + for( int ipt = 
threadIdx.x % cuda::warp_size; ipt < npts; ipt += cuda::warp_size ) { + //const double3 point = points[ipt]; + double3 point; + point.x = points_x[ipt]; + point.y = points_y[ipt]; + point.z = points_z[ipt]; + + + const auto x = point.x - O.x; + const auto y = point.y - O.y; + const auto z = point.z - O.z; + const auto rsq = x*x + y*y + z*z; + + // Evaluate radial part of bfn + double radial_eval = 0.; + double radial_eval_alpha = 0.; + double radial_eval_alpha_squared = 0.; + double radial_eval_alpha_cubed = 0.; + + #pragma unroll 1 + for( uint32_t i = 0; i < nprim; ++i ) { + const auto a = my_alpha[i]; + const auto e = my_coeff[i] * std::exp( - a * rsq ); + + radial_eval += e; + radial_eval_alpha += a * e; + radial_eval_alpha_squared += a * a * e; + radial_eval_alpha_cubed += a * a * a * e; + } + + radial_eval_alpha *= -2; + radial_eval_alpha_squared *= 4; + radial_eval_alpha_cubed *= -8; + + // Common Subexpressions + const auto x0 = x*x*x; + const auto x1 = radial_eval*y; + const auto x2 = x*x; + const auto x3 = x2; + const auto x4 = radial_eval*z; + const auto x5 = radial_eval*x; + const auto x6 = y*y; + const auto x7 = x6; + const auto x8 = x*z; + const auto x9 = z*z; + const auto x10 = x9; + const auto x11 = y*y*y; + const auto x12 = z*z*z; + const auto x13 = x*x*x*x; + const auto x14 = 3.0*radial_eval; + const auto x15 = radial_eval_alpha*x0 + 2.0*x5; + const auto x16 = radial_eval*x7; + const auto x17 = x3*x7; + const auto x18 = radial_eval_alpha*x17; + const auto x19 = y*z; + const auto x20 = radial_eval_alpha*x3; + const auto x21 = radial_eval + x20; + const auto x22 = radial_eval*x10; + const auto x23 = x10*x3; + const auto x24 = radial_eval_alpha*x23; + const auto x25 = radial_eval_alpha*x; + const auto x26 = x25*x7*z; + const auto x27 = x10*x25*y; + const auto x28 = radial_eval_alpha*y; + const auto x29 = radial_eval*x3; + const auto x30 = radial_eval_alpha*x19*x3; + const auto x31 = radial_eval_alpha*x11 + 2.0*x1; + const auto x32 = radial_eval_alpha*x7; + const auto x33 = radial_eval + x32; + const auto x34 = y*y*y*y; + const auto x35 = x10*x7; + const auto x36 = radial_eval_alpha*x35; + const auto x37 = radial_eval_alpha*z; + const auto x38 = x*y; + const auto x39 = radial_eval_alpha*x10; + const auto x40 = radial_eval_alpha*x12 + 2.0*x4; + const auto x41 = z*z*z*z; + const auto x42 = 6.0*radial_eval_alpha; + const auto x43 = radial_eval_alpha_squared*x3; + const auto x44 = radial_eval_alpha + x43; + const auto x45 = x0*x42 + x0*x44 + 6.0*x5; + const auto x46 = 4.0*radial_eval_alpha; + const auto x47 = 2.0*radial_eval; + const auto x48 = x3*x44; + const auto x49 = x47 + x48; + const auto x50 = x3*x46 + x49; + const auto x51 = 2.0*radial_eval_alpha; + const auto x52 = x51*x7; + const auto x53 = x44*x7; + const auto x54 = x*x19; + const auto x55 = 3.0*radial_eval_alpha; + const auto x56 = x10*x51; + const auto x57 = x10*x44; + const auto x58 = x11*x44; + const auto x59 = x12*x44; + const auto x60 = radial_eval_alpha_squared*x13 + x3*x55; + const auto x61 = 2.0*x25; + const auto x62 = x19*(radial_eval_alpha_squared*x0 + x61); + const auto x63 = 2.0*x28; + const auto x64 = radial_eval_alpha_squared*x17; + const auto x65 = x32 + x64; + const auto x66 = radial_eval_alpha_squared*x23; + const auto x67 = x39 + x66; + const auto x68 = radial_eval_alpha_squared*x34 + x55*x7; + const auto x69 = x8*(radial_eval_alpha_squared*x11 + x63); + const auto x70 = radial_eval_alpha_squared*x35; + const auto x71 = x39 + x70; + const auto x72 = 2.0*x37; + const auto x73 = 
x38*(radial_eval_alpha_squared*x12 + x72); + const auto x74 = radial_eval_alpha_squared*x41 + x10*x55; + const auto x75 = radial_eval_alpha_squared*x7; + const auto x76 = radial_eval_alpha + x75; + const auto x77 = x0*x76; + const auto x78 = x3*x51; + const auto x79 = x3*x76; + const auto x80 = x7*x76; + const auto x81 = x47 + x80; + const auto x82 = x46*x7 + x81; + const auto x83 = x10*x76; + const auto x84 = 6.0*x1 + x11*x42 + x11*x76; + const auto x85 = x12*x76; + const auto x86 = radial_eval_alpha_squared*x10; + const auto x87 = radial_eval_alpha + x86; + const auto x88 = x0*x87; + const auto x89 = x3*x87; + const auto x90 = x7*x87; + const auto x91 = x10*x87; + const auto x92 = x47 + x91; + const auto x93 = x10*x46 + x92; + const auto x94 = x11*x87; + const auto x95 = x12*x42 + x12*x87 + 6.0*x4; + const auto x96 = x3*x42 + x49 + x79 + x89; + const auto x97 = x42*x7 + x53 + x81 + x90; + const auto x98 = x75 + x86; + const auto x99 = x10*x42 + x57 + x83 + x92; + const auto x100 = 6.0*radial_eval; + const auto x101 = 18.0*radial_eval_alpha; + const auto x102 = 3.0*x79; + const auto x103 = 3.0*x89; + const auto x104 = radial_eval_alpha_cubed*x7 + radial_eval_alpha_squared; + const auto x105 = x0*x104; + const auto x106 = radial_eval_alpha_cubed*x10 + radial_eval_alpha_squared; + const auto x107 = x0*x106; + const auto x108 = 3.0*radial_eval_alpha_squared; + const auto x109 = radial_eval_alpha_cubed*x0 + x*x108; + const auto x110 = 2.0*radial_eval_alpha_squared; + const auto x111 = 6.0*x; + const auto x112 = 2.0*x; + const auto x113 = x104*x3; + const auto x114 = x106*x3; + const auto x115 = x*x113 + x*x114 + x0*x110 + x109*x3 + x111*x44 + x112*x76 + x112*x87 + 10.0*x25; + const auto x116 = 3.0*x53; + const auto x117 = x109*x7; + const auto x118 = x104*x7; + const auto x119 = x106*x7; + const auto x120 = 4.0*radial_eval_alpha_squared; + const auto x121 = x120*x17; + const auto x122 = 3.0*x57; + const auto x123 = x10*x109; + const auto x124 = x10*x104; + const auto x125 = x10*x106; + const auto x126 = x120*x23; + const auto x127 = 6.0*y; + const auto x128 = x127*x25; + const auto x129 = radial_eval_alpha_squared*x111; + const auto x130 = x104*x11; + const auto x131 = x106*x11; + const auto x132 = 6.0*z; + const auto x133 = x132*x25; + const auto x134 = x104*x12; + const auto x135 = x106*x12; + const auto x136 = radial_eval_alpha_squared*x127; + const auto x137 = radial_eval_alpha_cubed*x3 + radial_eval_alpha_squared; + const auto x138 = x0*x137; + const auto x139 = radial_eval_alpha_cubed*x11 + x108*y; + const auto x140 = x139*x3; + const auto x141 = x137*x3; + const auto x142 = 2.0*y; + const auto x143 = x137*x7; + const auto x144 = x11*x110 + x119*y + x127*x76 + x139*x7 + x142*x44 + x142*x87 + x143*y + 10.0*x28; + const auto x145 = x42 + x43; + const auto x146 = x10*x137; + const auto x147 = x10*x139; + const auto x148 = 3.0*x90; + const auto x149 = x11*x137; + const auto x150 = 3.0*x83; + const auto x151 = x120*x35; + const auto x152 = x19*x42; + const auto x153 = x12*x137; + const auto x154 = radial_eval_alpha_squared*x132; + const auto x155 = radial_eval_alpha_cubed*x12 + x108*z; + const auto x156 = x155*x3; + const auto x157 = x155*x7; + const auto x158 = 2.0*z; + const auto x159 = x10*x155 + x110*x12 + x124*z + x132*x87 + x146*z + x158*x44 + x158*x76 + 10.0*x37; + + + // Evaluate basis function + basis_eval[ipt + 0*npts] = radial_eval*x0; + basis_eval[ipt + 1*npts] = x1*x3; + basis_eval[ipt + 2*npts] = x3*x4; + basis_eval[ipt + 3*npts] = x5*x7; + basis_eval[ipt + 4*npts] = x1*x8; + 
basis_eval[ipt + 5*npts] = x10*x5; + basis_eval[ipt + 6*npts] = radial_eval*x11; + basis_eval[ipt + 7*npts] = x4*x7; + basis_eval[ipt + 8*npts] = x1*x10; + basis_eval[ipt + 9*npts] = radial_eval*x12; + + + + // Evaluate first derivative of bfn wrt x + basis_x_eval[ipt + 0*npts] = radial_eval_alpha*x13 + x14*x3; + basis_x_eval[ipt + 1*npts] = x15*y; + basis_x_eval[ipt + 2*npts] = x15*z; + basis_x_eval[ipt + 3*npts] = x16 + x18; + basis_x_eval[ipt + 4*npts] = x19*x21; + basis_x_eval[ipt + 5*npts] = x22 + x24; + basis_x_eval[ipt + 6*npts] = x11*x25; + basis_x_eval[ipt + 7*npts] = x26; + basis_x_eval[ipt + 8*npts] = x27; + basis_x_eval[ipt + 9*npts] = x12*x25; + + // Evaluate first derivative of bfn wrt y + basis_y_eval[ipt + 0*npts] = x0*x28; + basis_y_eval[ipt + 1*npts] = x18 + x29; + basis_y_eval[ipt + 2*npts] = x30; + basis_y_eval[ipt + 3*npts] = x*x31; + basis_y_eval[ipt + 4*npts] = x33*x8; + basis_y_eval[ipt + 5*npts] = x27; + basis_y_eval[ipt + 6*npts] = radial_eval_alpha*x34 + x14*x7; + basis_y_eval[ipt + 7*npts] = x31*z; + basis_y_eval[ipt + 8*npts] = x22 + x36; + basis_y_eval[ipt + 9*npts] = x12*x28; + + // Evaluate first derivative of bfn wrt z + basis_z_eval[ipt + 0*npts] = x0*x37; + basis_z_eval[ipt + 1*npts] = x30; + basis_z_eval[ipt + 2*npts] = x24 + x29; + basis_z_eval[ipt + 3*npts] = x26; + basis_z_eval[ipt + 4*npts] = x38*(radial_eval + x39); + basis_z_eval[ipt + 5*npts] = x*x40; + basis_z_eval[ipt + 6*npts] = x11*x37; + basis_z_eval[ipt + 7*npts] = x16 + x36; + basis_z_eval[ipt + 8*npts] = x40*y; + basis_z_eval[ipt + 9*npts] = radial_eval_alpha*x41 + x10*x14; + + // Evaluate second derivative of bfn wrt xx + basis_xx_eval[ipt + 0*npts] = x45; + basis_xx_eval[ipt + 1*npts] = x50*y; + basis_xx_eval[ipt + 2*npts] = x50*z; + basis_xx_eval[ipt + 3*npts] = x*(x52 + x53); + basis_xx_eval[ipt + 4*npts] = x54*(x43 + x55); + basis_xx_eval[ipt + 5*npts] = x*(x56 + x57); + basis_xx_eval[ipt + 6*npts] = x58; + basis_xx_eval[ipt + 7*npts] = x53*z; + basis_xx_eval[ipt + 8*npts] = x57*y; + basis_xx_eval[ipt + 9*npts] = x59; + + // Evaluate second derivative of bfn wrt xy + basis_xy_eval[ipt + 0*npts] = x60*y; + basis_xy_eval[ipt + 1*npts] = radial_eval_alpha_squared*x0*x7 + x15 + x61*x7; + basis_xy_eval[ipt + 2*npts] = x62; + basis_xy_eval[ipt + 3*npts] = radial_eval_alpha_squared*x11*x3 + x3*x63 + x31; + basis_xy_eval[ipt + 4*npts] = z*(x21 + x65); + basis_xy_eval[ipt + 5*npts] = x67*y; + basis_xy_eval[ipt + 6*npts] = x*x68; + basis_xy_eval[ipt + 7*npts] = x69; + basis_xy_eval[ipt + 8*npts] = x*x71; + basis_xy_eval[ipt + 9*npts] = radial_eval_alpha_squared*x12*x38; + + // Evaluate second derivative of bfn wrt xz + basis_xz_eval[ipt + 0*npts] = x60*z; + basis_xz_eval[ipt + 1*npts] = x62; + basis_xz_eval[ipt + 2*npts] = radial_eval_alpha_squared*x0*x10 + x10*x61 + x15; + basis_xz_eval[ipt + 3*npts] = x65*z; + basis_xz_eval[ipt + 4*npts] = y*(x21 + x67); + basis_xz_eval[ipt + 5*npts] = radial_eval_alpha_squared*x12*x3 + x3*x72 + x40; + basis_xz_eval[ipt + 6*npts] = radial_eval_alpha_squared*x11*x8; + basis_xz_eval[ipt + 7*npts] = x*(x32 + x70); + basis_xz_eval[ipt + 8*npts] = x73; + basis_xz_eval[ipt + 9*npts] = x*x74; + + // Evaluate second derivative of bfn wrt yy + basis_yy_eval[ipt + 0*npts] = x77; + basis_yy_eval[ipt + 1*npts] = y*(x78 + x79); + basis_yy_eval[ipt + 2*npts] = x79*z; + basis_yy_eval[ipt + 3*npts] = x*x82; + basis_yy_eval[ipt + 4*npts] = x54*(x55 + x75); + basis_yy_eval[ipt + 5*npts] = x*x83; + basis_yy_eval[ipt + 6*npts] = x84; + basis_yy_eval[ipt + 7*npts] = x82*z; + 
basis_yy_eval[ipt + 8*npts] = y*(x56 + x83); + basis_yy_eval[ipt + 9*npts] = x85; + + // Evaluate second derivative of bfn wrt yz + basis_yz_eval[ipt + 0*npts] = radial_eval_alpha_squared*x0*x19; + basis_yz_eval[ipt + 1*npts] = z*(x20 + x64); + basis_yz_eval[ipt + 2*npts] = y*(x20 + x66); + basis_yz_eval[ipt + 3*npts] = x69; + basis_yz_eval[ipt + 4*npts] = x*(x33 + x71); + basis_yz_eval[ipt + 5*npts] = x73; + basis_yz_eval[ipt + 6*npts] = x68*z; + basis_yz_eval[ipt + 7*npts] = radial_eval_alpha_squared*x10*x11 + x10*x63 + x31; + basis_yz_eval[ipt + 8*npts] = radial_eval_alpha_squared*x12*x7 + x40 + x7*x72; + basis_yz_eval[ipt + 9*npts] = x74*y; + + // Evaluate second derivative of bfn wrt zz + basis_zz_eval[ipt + 0*npts] = x88; + basis_zz_eval[ipt + 1*npts] = x89*y; + basis_zz_eval[ipt + 2*npts] = z*(x78 + x89); + basis_zz_eval[ipt + 3*npts] = x*x90; + basis_zz_eval[ipt + 4*npts] = x54*(x55 + x86); + basis_zz_eval[ipt + 5*npts] = x*x93; + basis_zz_eval[ipt + 6*npts] = x94; + basis_zz_eval[ipt + 7*npts] = z*(x52 + x90); + basis_zz_eval[ipt + 8*npts] = x93*y; + basis_zz_eval[ipt + 9*npts] = x95; + + // Evaluate Laplacian of bfn + basis_lapl_eval[ipt + 0*npts] = x45 + x77 + x88; + basis_lapl_eval[ipt + 1*npts] = x96*y; + basis_lapl_eval[ipt + 2*npts] = x96*z; + basis_lapl_eval[ipt + 3*npts] = x*x97; + basis_lapl_eval[ipt + 4*npts] = x54*(9.0*radial_eval_alpha + x43 + x98); + basis_lapl_eval[ipt + 5*npts] = x*x99; + basis_lapl_eval[ipt + 6*npts] = x58 + x84 + x94; + basis_lapl_eval[ipt + 7*npts] = x97*z; + basis_lapl_eval[ipt + 8*npts] = x99*y; + basis_lapl_eval[ipt + 9*npts] = x59 + x85 + x95; + + // Evaluate Laplacian gradient of bfn (dx) + basis_lapl_x_eval[ipt + 0*npts] = x*x105 + x*x107 + x0*x109 + x100 + x101*x3 + x102 + x103 + 9.0*x48; + basis_lapl_x_eval[ipt + 1*npts] = x115*y; + basis_lapl_x_eval[ipt + 2*npts] = x115*z; + basis_lapl_x_eval[ipt + 3*npts] = x*x117 + x116 + x118*x2 + x119*x2 + x121 + x78 + x82 + x90; + basis_lapl_x_eval[ipt + 4*npts] = x19*(x*x109 + x104*x2 + x106*x2 + x120*x3 + x42 + 3.0*x44 + x98); + basis_lapl_x_eval[ipt + 5*npts] = x*x123 + x122 + x124*x2 + x125*x2 + x126 + x78 + x83 + x93; + basis_lapl_x_eval[ipt + 6*npts] = x*x130 + x*x131 + x109*x11 + x11*x129 + x128; + basis_lapl_x_eval[ipt + 7*npts] = z*(x*x118 + x*x119 + x117 + x129*x7 + x61); + basis_lapl_x_eval[ipt + 8*npts] = y*(x*x124 + x*x125 + x10*x129 + x123 + x61); + basis_lapl_x_eval[ipt + 9*npts] = x*x134 + x*x135 + x109*x12 + x12*x129 + x133; + // Evaluate Laplacian gradient of bfn (dy) + basis_lapl_y_eval[ipt + 0*npts] = x0*x136 + x0*x139 + x107*y + x128 + x138*y; + basis_lapl_y_eval[ipt + 1*npts] = x102 + x114*x6 + x121 + x140*y + x141*x6 + x50 + x52 + x89; + basis_lapl_y_eval[ipt + 2*npts] = z*(x114*y + x136*x3 + x140 + x141*y + x63); + basis_lapl_y_eval[ipt + 3*npts] = x*x144; + basis_lapl_y_eval[ipt + 4*npts] = x8*(x106*x6 + x120*x7 + x137*x6 + x139*y + x145 + 3.0*x76 + x86); + basis_lapl_y_eval[ipt + 5*npts] = x*(x10*x136 + x125*y + x146*y + x147 + x63); + basis_lapl_y_eval[ipt + 6*npts] = x100 + x101*x7 + x11*x139 + x116 + x131*y + x148 + x149*y + 9.0*x80; + basis_lapl_y_eval[ipt + 7*npts] = x144*z; + basis_lapl_y_eval[ipt + 8*npts] = x125*x6 + x146*x6 + x147*y + x150 + x151 + x52 + x57 + x93; + basis_lapl_y_eval[ipt + 9*npts] = x12*x136 + x12*x139 + x135*y + x152 + x153*y; + // Evaluate Laplacian gradient of bfn (dz) + basis_lapl_z_eval[ipt + 0*npts] = x0*x154 + x0*x155 + x105*z + x133 + x138*z; + basis_lapl_z_eval[ipt + 1*npts] = y*(x113*z + x141*z + x154*x3 + x156 + x72); + 
basis_lapl_z_eval[ipt + 2*npts] = x103 + x113*x9 + x126 + x141*x9 + x156*z + x50 + x56 + x79; + basis_lapl_z_eval[ipt + 3*npts] = x*(x118*z + x143*z + x154*x7 + x157 + x72); + basis_lapl_z_eval[ipt + 4*npts] = x38*(x10*x120 + x104*x9 + x137*x9 + x145 + x155*z + x75 + 3.0*x87); + basis_lapl_z_eval[ipt + 5*npts] = x*x159; + basis_lapl_z_eval[ipt + 6*npts] = x11*x154 + x11*x155 + x130*z + x149*z + x152; + basis_lapl_z_eval[ipt + 7*npts] = x118*x9 + x143*x9 + x148 + x151 + x157*z + x53 + x56 + x82; + basis_lapl_z_eval[ipt + 8*npts] = x159*y; + basis_lapl_z_eval[ipt + 9*npts] = x10*x101 + x100 + x12*x155 + x122 + x134*z + x150 + x153*z + 9.0*x91; + + + + +#if 0 + // Evaluate the angular part of bfn + + + + double ang_eval_0; + double ang_eval_1; + double ang_eval_2; + double ang_eval_3; + + + ang_eval_0 = radial_eval*x0; + ang_eval_1 = x1*x3; + ang_eval_2 = x3*x4; + ang_eval_3 = x5*x7; + basis_eval[ipt + 0*npts] = ang_eval_0; + basis_eval[ipt + 1*npts] = ang_eval_1; + basis_eval[ipt + 2*npts] = ang_eval_2; + basis_eval[ipt + 3*npts] = ang_eval_3; + + ang_eval_0 = x1*x8; + ang_eval_1 = x10*x5; + ang_eval_2 = radial_eval*x11; + ang_eval_3 = x4*x7; + basis_eval[ipt + 4*npts] = ang_eval_0; + basis_eval[ipt + 5*npts] = ang_eval_1; + basis_eval[ipt + 6*npts] = ang_eval_2; + basis_eval[ipt + 7*npts] = ang_eval_3; + + ang_eval_0 = x1*x10; + ang_eval_1 = radial_eval*x12; + basis_eval[ipt + 8*npts] = ang_eval_0; + basis_eval[ipt + 9*npts] = ang_eval_1; + + + double dang_eval_x_0, dang_eval_y_0, dang_eval_z_0; + double dang_eval_x_1, dang_eval_y_1, dang_eval_z_1; + double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; + double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; + + dang_eval_x_0 = radial_eval_alpha*x13 + x14*x3; + dang_eval_y_0 = x0*x28; + dang_eval_z_0 = x0*x37; + dang_eval_x_1 = x15*y; + dang_eval_y_1 = x18 + x29; + dang_eval_z_1 = x30; + dang_eval_x_2 = x15*z; + dang_eval_y_2 = x30; + dang_eval_z_2 = x24 + x29; + dang_eval_x_3 = x16 + x18; + dang_eval_y_3 = x*x31; + dang_eval_z_3 = x26; + basis_x_eval[ipt + 0*npts] = dang_eval_x_0; + basis_y_eval[ipt + 0*npts] = dang_eval_y_0; + basis_z_eval[ipt + 0*npts] = dang_eval_z_0; + basis_x_eval[ipt + 1*npts] = dang_eval_x_1; + basis_y_eval[ipt + 1*npts] = dang_eval_y_1; + basis_z_eval[ipt + 1*npts] = dang_eval_z_1; + basis_x_eval[ipt + 2*npts] = dang_eval_x_2; + basis_y_eval[ipt + 2*npts] = dang_eval_y_2; + basis_z_eval[ipt + 2*npts] = dang_eval_z_2; + basis_x_eval[ipt + 3*npts] = dang_eval_x_3; + basis_y_eval[ipt + 3*npts] = dang_eval_y_3; + basis_z_eval[ipt + 3*npts] = dang_eval_z_3; + + dang_eval_x_0 = x19*x21; + dang_eval_y_0 = x33*x8; + dang_eval_z_0 = x38*(radial_eval + x39); + dang_eval_x_1 = x22 + x24; + dang_eval_y_1 = x27; + dang_eval_z_1 = x*x40; + dang_eval_x_2 = x11*x25; + dang_eval_y_2 = radial_eval_alpha*x34 + x14*x7; + dang_eval_z_2 = x11*x37; + dang_eval_x_3 = x26; + dang_eval_y_3 = x31*z; + dang_eval_z_3 = x16 + x36; + basis_x_eval[ipt + 4*npts] = dang_eval_x_0; + basis_y_eval[ipt + 4*npts] = dang_eval_y_0; + basis_z_eval[ipt + 4*npts] = dang_eval_z_0; + basis_x_eval[ipt + 5*npts] = dang_eval_x_1; + basis_y_eval[ipt + 5*npts] = dang_eval_y_1; + basis_z_eval[ipt + 5*npts] = dang_eval_z_1; + basis_x_eval[ipt + 6*npts] = dang_eval_x_2; + basis_y_eval[ipt + 6*npts] = dang_eval_y_2; + basis_z_eval[ipt + 6*npts] = dang_eval_z_2; + basis_x_eval[ipt + 7*npts] = dang_eval_x_3; + basis_y_eval[ipt + 7*npts] = dang_eval_y_3; + basis_z_eval[ipt + 7*npts] = dang_eval_z_3; + + dang_eval_x_0 = x27; + dang_eval_y_0 = x22 + x36; + dang_eval_z_0 = 
x40*y; + dang_eval_x_1 = x12*x25; + dang_eval_y_1 = x12*x28; + dang_eval_z_1 = radial_eval_alpha*x41 + x10*x14; + basis_x_eval[ipt + 8*npts] = dang_eval_x_0; + basis_y_eval[ipt + 8*npts] = dang_eval_y_0; + basis_z_eval[ipt + 8*npts] = dang_eval_z_0; + basis_x_eval[ipt + 9*npts] = dang_eval_x_1; + basis_y_eval[ipt + 9*npts] = dang_eval_y_1; + basis_z_eval[ipt + 9*npts] = dang_eval_z_1; + +#endif + } // Loop over points within task + } // Loop over tasks + + } // Loop over shells +} // end kernel + +} // namespace GauXC diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l3_laplacian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l3_laplacian.hpp index 4811a3fd..2ef57f33 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l3_laplacian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l3_laplacian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_cartesian_laplacian_3( +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_cartesian_laplacian_3( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; // Loop over points in task @@ -103,69 +106,165 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = x*x*x; + const auto x1 = radial_eval*y; + const auto x2 = x*x; + const auto x3 = radial_eval*z; + const auto x4 = radial_eval*x; + const auto x5 = y*y; + const auto x6 = x*z; + const auto x7 = z*z; + const auto x8 = y*y*y; + const auto x9 = z*z*z; + const auto x10 = x*x*x*x; + const auto x11 = 3.0*radial_eval; + const auto x12 = radial_eval_alpha*x0 + 2.0*x4; + const auto x13 = radial_eval*x5; + const auto x14 = x2*x5; + const auto x15 = radial_eval_alpha*x14; + const auto x16 = y*z; + const auto x17 = radial_eval_alpha*x2; + const auto x18 = radial_eval + x17; + const auto x19 = radial_eval*x7; + const auto x20 = x2*x7; + const auto x21 = radial_eval_alpha*x20; + const auto x22 = radial_eval_alpha*x; + const auto x23 = x22*x5*z; + const auto x24 = x22*x7*y; + const auto x25 = 
radial_eval_alpha*y; + const auto x26 = radial_eval*x2; + const auto x27 = radial_eval_alpha*x16*x2; + const auto x28 = radial_eval_alpha*x8 + 2.0*x1; + const auto x29 = radial_eval_alpha*x5; + const auto x30 = radial_eval + x29; + const auto x31 = y*y*y*y; + const auto x32 = x5*x7; + const auto x33 = radial_eval_alpha*x32; + const auto x34 = radial_eval_alpha*z; + const auto x35 = x*y; + const auto x36 = radial_eval_alpha*x7; + const auto x37 = radial_eval_alpha*x9 + 2.0*x3; + const auto x38 = z*z*z*z; + const auto x39 = 6.0*radial_eval_alpha; + const auto x40 = radial_eval_alpha_squared*x2; + const auto x41 = radial_eval_alpha + x40; + const auto x42 = x0*x39 + x0*x41 + 6.0*x4; + const auto x43 = 4.0*radial_eval_alpha; + const auto x44 = 2.0*radial_eval; + const auto x45 = x2*x41 + x44; + const auto x46 = x2*x43 + x45; + const auto x47 = 2.0*radial_eval_alpha; + const auto x48 = x47*x5; + const auto x49 = x41*x5; + const auto x50 = x*x16; + const auto x51 = 3.0*radial_eval_alpha; + const auto x52 = x47*x7; + const auto x53 = x41*x7; + const auto x54 = x41*x8; + const auto x55 = x41*x9; + const auto x56 = radial_eval_alpha_squared*x10 + x2*x51; + const auto x57 = 2.0*x22; + const auto x58 = x16*(radial_eval_alpha_squared*x0 + x57); + const auto x59 = 2.0*x25; + const auto x60 = radial_eval_alpha_squared*x14; + const auto x61 = x29 + x60; + const auto x62 = radial_eval_alpha_squared*x20; + const auto x63 = x36 + x62; + const auto x64 = radial_eval_alpha_squared*x31 + x5*x51; + const auto x65 = x6*(radial_eval_alpha_squared*x8 + x59); + const auto x66 = radial_eval_alpha_squared*x32; + const auto x67 = x36 + x66; + const auto x68 = 2.0*x34; + const auto x69 = x35*(radial_eval_alpha_squared*x9 + x68); + const auto x70 = radial_eval_alpha_squared*x38 + x51*x7; + const auto x71 = radial_eval_alpha_squared*x5; + const auto x72 = radial_eval_alpha + x71; + const auto x73 = x0*x72; + const auto x74 = x2*x47; + const auto x75 = x2*x72; + const auto x76 = x44 + x5*x72; + const auto x77 = x43*x5 + x76; + const auto x78 = x7*x72; + const auto x79 = 6.0*x1 + x39*x8 + x72*x8; + const auto x80 = x72*x9; + const auto x81 = radial_eval_alpha_squared*x7; + const auto x82 = radial_eval_alpha + x81; + const auto x83 = x0*x82; + const auto x84 = x2*x82; + const auto x85 = x5*x82; + const auto x86 = x44 + x7*x82; + const auto x87 = x43*x7 + x86; + const auto x88 = x8*x82; + const auto x89 = 6.0*x3 + x39*x9 + x82*x9; + const auto x90 = x2*x39 + x45 + x75 + x84; + const auto x91 = x39*x5 + x49 + x76 + x85; + const auto x92 = x39*x7 + x53 + x78 + x86; + // Evaluate basis function - basis_eval[ipt + 0*npts] = radial_eval*x*x*x; - basis_eval[ipt + 1*npts] = radial_eval*x*x*y; - basis_eval[ipt + 2*npts] = radial_eval*x*x*z; - basis_eval[ipt + 3*npts] = radial_eval*x*y*y; - basis_eval[ipt + 4*npts] = radial_eval*x*y*z; - basis_eval[ipt + 5*npts] = radial_eval*x*z*z; - basis_eval[ipt + 6*npts] = radial_eval*y*y*y; - basis_eval[ipt + 7*npts] = radial_eval*y*y*z; - basis_eval[ipt + 8*npts] = radial_eval*y*z*z; - basis_eval[ipt + 9*npts] = radial_eval*z*z*z; + basis_eval[ipt + 0*npts] = radial_eval*x0; + basis_eval[ipt + 1*npts] = x1*x2; + basis_eval[ipt + 2*npts] = x2*x3; + basis_eval[ipt + 3*npts] = x4*x5; + basis_eval[ipt + 4*npts] = x1*x6; + basis_eval[ipt + 5*npts] = x4*x7; + basis_eval[ipt + 6*npts] = radial_eval*x8; + basis_eval[ipt + 7*npts] = x3*x5; + basis_eval[ipt + 8*npts] = x1*x7; + basis_eval[ipt + 9*npts] = radial_eval*x9; // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = 
x*x*(3*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 1*npts] = x*y*(2*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 2*npts] = x*z*(2*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 3*npts] = y*y*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 4*npts] = y*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 5*npts] = z*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 6*npts] = radial_eval_alpha*x*y*y*y; - basis_x_eval[ipt + 7*npts] = radial_eval_alpha*x*y*y*z; - basis_x_eval[ipt + 8*npts] = radial_eval_alpha*x*y*z*z; - basis_x_eval[ipt + 9*npts] = radial_eval_alpha*x*z*z*z; + basis_x_eval[ipt + 0*npts] = radial_eval_alpha*x10 + x11*x2; + basis_x_eval[ipt + 1*npts] = x12*y; + basis_x_eval[ipt + 2*npts] = x12*z; + basis_x_eval[ipt + 3*npts] = x13 + x15; + basis_x_eval[ipt + 4*npts] = x16*x18; + basis_x_eval[ipt + 5*npts] = x19 + x21; + basis_x_eval[ipt + 6*npts] = x22*x8; + basis_x_eval[ipt + 7*npts] = x23; + basis_x_eval[ipt + 8*npts] = x24; + basis_x_eval[ipt + 9*npts] = x22*x9; // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = radial_eval_alpha*x*x*x*y; - basis_y_eval[ipt + 1*npts] = x*x*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 2*npts] = radial_eval_alpha*x*x*y*z; - basis_y_eval[ipt + 3*npts] = x*y*(2*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 4*npts] = x*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 5*npts] = radial_eval_alpha*x*y*z*z; - basis_y_eval[ipt + 6*npts] = y*y*(3*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 7*npts] = y*z*(2*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 8*npts] = z*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 9*npts] = radial_eval_alpha*y*z*z*z; + basis_y_eval[ipt + 0*npts] = x0*x25; + basis_y_eval[ipt + 1*npts] = x15 + x26; + basis_y_eval[ipt + 2*npts] = x27; + basis_y_eval[ipt + 3*npts] = x*x28; + basis_y_eval[ipt + 4*npts] = x30*x6; + basis_y_eval[ipt + 5*npts] = x24; + basis_y_eval[ipt + 6*npts] = radial_eval_alpha*x31 + x11*x5; + basis_y_eval[ipt + 7*npts] = x28*z; + basis_y_eval[ipt + 8*npts] = x19 + x33; + basis_y_eval[ipt + 9*npts] = x25*x9; // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = radial_eval_alpha*x*x*x*z; - basis_z_eval[ipt + 1*npts] = radial_eval_alpha*x*x*y*z; - basis_z_eval[ipt + 2*npts] = x*x*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 3*npts] = radial_eval_alpha*x*y*y*z; - basis_z_eval[ipt + 4*npts] = x*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 5*npts] = x*z*(2*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 6*npts] = radial_eval_alpha*y*y*y*z; - basis_z_eval[ipt + 7*npts] = y*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 8*npts] = y*z*(2*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 9*npts] = z*z*(3*radial_eval + radial_eval_alpha*z*z); + basis_z_eval[ipt + 0*npts] = x0*x34; + basis_z_eval[ipt + 1*npts] = x27; + basis_z_eval[ipt + 2*npts] = x21 + x26; + basis_z_eval[ipt + 3*npts] = x23; + basis_z_eval[ipt + 4*npts] = x35*(radial_eval + x36); + basis_z_eval[ipt + 5*npts] = x*x37; + basis_z_eval[ipt + 6*npts] = x34*x8; + basis_z_eval[ipt + 7*npts] = x13 + x33; + basis_z_eval[ipt + 8*npts] = x37*y; + basis_z_eval[ipt + 9*npts] = radial_eval_alpha*x38 + x11*x7; + // Evaluate Laplacian of bfn - basis_lapl_eval[ipt + 0*npts] = x*(6*radial_eval + 9*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x + radial_eval_alpha_squared*x*x*y*y + 
radial_eval_alpha_squared*x*x*z*z); - basis_lapl_eval[ipt + 1*npts] = y*(2*radial_eval + 9*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x + radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*x*x*z*z); - basis_lapl_eval[ipt + 2*npts] = z*(2*radial_eval + 9*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x + radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*x*x*z*z); - basis_lapl_eval[ipt + 3*npts] = x*(2*radial_eval + 9*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*y*y*y*y + radial_eval_alpha_squared*y*y*z*z); - basis_lapl_eval[ipt + 4*npts] = x*y*z*(9*radial_eval_alpha + radial_eval_alpha_squared*x*x + radial_eval_alpha_squared*y*y + radial_eval_alpha_squared*z*z); - basis_lapl_eval[ipt + 5*npts] = x*(2*radial_eval + 9*radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z + radial_eval_alpha_squared*y*y*z*z + radial_eval_alpha_squared*z*z*z*z); - basis_lapl_eval[ipt + 6*npts] = y*(6*radial_eval + 9*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*y*y*y*y + radial_eval_alpha_squared*y*y*z*z); - basis_lapl_eval[ipt + 7*npts] = z*(2*radial_eval + 9*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*y*y*y*y + radial_eval_alpha_squared*y*y*z*z); - basis_lapl_eval[ipt + 8*npts] = y*(2*radial_eval + 9*radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z + radial_eval_alpha_squared*y*y*z*z + radial_eval_alpha_squared*z*z*z*z); - basis_lapl_eval[ipt + 9*npts] = z*(6*radial_eval + 9*radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z + radial_eval_alpha_squared*y*y*z*z + radial_eval_alpha_squared*z*z*z*z); + basis_lapl_eval[ipt + 0*npts] = x42 + x73 + x83; + basis_lapl_eval[ipt + 1*npts] = x90*y; + basis_lapl_eval[ipt + 2*npts] = x90*z; + basis_lapl_eval[ipt + 3*npts] = x*x91; + basis_lapl_eval[ipt + 4*npts] = x50*(9.0*radial_eval_alpha + x40 + x71 + x81); + basis_lapl_eval[ipt + 5*npts] = x*x92; + basis_lapl_eval[ipt + 6*npts] = x54 + x79 + x88; + basis_lapl_eval[ipt + 7*npts] = x91*z; + basis_lapl_eval[ipt + 8*npts] = x92*y; + basis_lapl_eval[ipt + 9*npts] = x55 + x80 + x89; + @@ -181,26 +280,26 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = radial_eval*x*x*x; - ang_eval_1 = radial_eval*x*x*y; - ang_eval_2 = radial_eval*x*x*z; - ang_eval_3 = radial_eval*x*y*y; + ang_eval_0 = radial_eval*x0; + ang_eval_1 = x1*x2; + ang_eval_2 = x2*x3; + ang_eval_3 = x4*x5; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = radial_eval*x*y*z; - ang_eval_1 = radial_eval*x*z*z; - ang_eval_2 = radial_eval*y*y*y; - ang_eval_3 = radial_eval*y*y*z; + ang_eval_0 = x1*x6; + ang_eval_1 = x4*x7; + ang_eval_2 = radial_eval*x8; + ang_eval_3 = x3*x5; basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; basis_eval[ipt + 6*npts] = ang_eval_2; basis_eval[ipt + 7*npts] = ang_eval_3; - ang_eval_0 = radial_eval*y*z*z; - ang_eval_1 = radial_eval*z*z*z; + ang_eval_0 = x1*x7; + ang_eval_1 = radial_eval*x9; basis_eval[ipt + 8*npts] = ang_eval_0; basis_eval[ipt + 9*npts] = ang_eval_1; @@ -210,18 +309,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; - dang_eval_x_0 = x*x*(3*radial_eval + radial_eval_alpha*x*x); - 
dang_eval_y_0 = radial_eval_alpha*x*x*x*y; - dang_eval_z_0 = radial_eval_alpha*x*x*x*z; - dang_eval_x_1 = x*y*(2*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = x*x*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_1 = radial_eval_alpha*x*x*y*z; - dang_eval_x_2 = x*z*(2*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_2 = radial_eval_alpha*x*x*y*z; - dang_eval_z_2 = x*x*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_3 = y*y*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_3 = x*y*(2*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_3 = radial_eval_alpha*x*y*y*z; + dang_eval_x_0 = radial_eval_alpha*x10 + x11*x2; + dang_eval_y_0 = x0*x25; + dang_eval_z_0 = x0*x34; + dang_eval_x_1 = x12*y; + dang_eval_y_1 = x15 + x26; + dang_eval_z_1 = x27; + dang_eval_x_2 = x12*z; + dang_eval_y_2 = x27; + dang_eval_z_2 = x21 + x26; + dang_eval_x_3 = x13 + x15; + dang_eval_y_3 = x*x28; + dang_eval_z_3 = x23; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; @@ -235,18 +334,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = y*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = x*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = x*y*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_1 = z*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = radial_eval_alpha*x*y*z*z; - dang_eval_z_1 = x*z*(2*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = radial_eval_alpha*x*y*y*y; - dang_eval_y_2 = y*y*(3*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_2 = radial_eval_alpha*y*y*y*z; - dang_eval_x_3 = radial_eval_alpha*x*y*y*z; - dang_eval_y_3 = y*z*(2*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_3 = y*y*(radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x16*x18; + dang_eval_y_0 = x30*x6; + dang_eval_z_0 = x35*(radial_eval + x36); + dang_eval_x_1 = x19 + x21; + dang_eval_y_1 = x24; + dang_eval_z_1 = x*x37; + dang_eval_x_2 = x22*x8; + dang_eval_y_2 = radial_eval_alpha*x31 + x11*x5; + dang_eval_z_2 = x34*x8; + dang_eval_x_3 = x23; + dang_eval_y_3 = x28*z; + dang_eval_z_3 = x13 + x33; basis_x_eval[ipt + 4*npts] = dang_eval_x_0; basis_y_eval[ipt + 4*npts] = dang_eval_y_0; basis_z_eval[ipt + 4*npts] = dang_eval_z_0; @@ -260,12 +359,12 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 7*npts] = dang_eval_y_3; basis_z_eval[ipt + 7*npts] = dang_eval_z_3; - dang_eval_x_0 = radial_eval_alpha*x*y*z*z; - dang_eval_y_0 = z*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = y*z*(2*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_1 = radial_eval_alpha*x*z*z*z; - dang_eval_y_1 = radial_eval_alpha*y*z*z*z; - dang_eval_z_1 = z*z*(3*radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x24; + dang_eval_y_0 = x19 + x33; + dang_eval_z_0 = x37*y; + dang_eval_x_1 = x22*x9; + dang_eval_y_1 = x25*x9; + dang_eval_z_1 = radial_eval_alpha*x38 + x11*x7; basis_x_eval[ipt + 8*npts] = dang_eval_x_0; basis_y_eval[ipt + 8*npts] = dang_eval_y_0; basis_z_eval[ipt + 8*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4.hpp index 433ecd3a..65bb118b 100644 --- 
a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -64,7 +68,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_eval = task->bf + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -93,24 +96,34 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel } - + // Common Subexpressions + const auto x0 = radial_eval*y; + const auto x1 = x*x*x; + const auto x2 = radial_eval*z; + const auto x3 = x*x; + const auto x4 = y*y; + const auto x5 = z*z; + const auto x6 = radial_eval*x; + const auto x7 = y*y*y; + const auto x8 = z*z*z; + // Evaluate basis function - basis_eval[ipt + 0*npts] = radial_eval*x*x*x*x; - basis_eval[ipt + 1*npts] = radial_eval*x*x*x*y; - basis_eval[ipt + 2*npts] = radial_eval*x*x*x*z; - basis_eval[ipt + 3*npts] = radial_eval*x*x*y*y; - basis_eval[ipt + 4*npts] = radial_eval*x*x*y*z; - basis_eval[ipt + 5*npts] = radial_eval*x*x*z*z; - basis_eval[ipt + 6*npts] = radial_eval*x*y*y*y; - basis_eval[ipt + 7*npts] = radial_eval*x*y*y*z; - basis_eval[ipt + 8*npts] = radial_eval*x*y*z*z; - basis_eval[ipt + 9*npts] = radial_eval*x*z*z*z; - basis_eval[ipt + 10*npts] = radial_eval*y*y*y*y; - basis_eval[ipt + 11*npts] = radial_eval*y*y*y*z; - basis_eval[ipt + 12*npts] = radial_eval*y*y*z*z; - basis_eval[ipt + 13*npts] = radial_eval*y*z*z*z; - basis_eval[ipt + 14*npts] = radial_eval*z*z*z*z; + basis_eval[ipt + 0*npts] = radial_eval*(x*x*x*x); + basis_eval[ipt + 1*npts] = x0*x1; + basis_eval[ipt + 2*npts] = x1*x2; + basis_eval[ipt + 3*npts] = radial_eval*x3*x4; + basis_eval[ipt + 4*npts] = x0*x3*z; + basis_eval[ipt + 5*npts] = radial_eval*x3*x5; + basis_eval[ipt + 6*npts] = x6*x7; + basis_eval[ipt + 7*npts] = x*x2*x4; + basis_eval[ipt + 8*npts] = x*x0*x5; + basis_eval[ipt + 9*npts] = x6*x8; + basis_eval[ipt + 10*npts] = radial_eval*(y*y*y*y); + basis_eval[ipt + 11*npts] = x2*x7; + basis_eval[ipt + 12*npts] = radial_eval*x4*x5; + basis_eval[ipt + 13*npts] = x0*x8; + basis_eval[ipt + 14*npts] = radial_eval*(z*z*z*z); @@ -119,6 +132,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel + + #if 0 // Evaluate the angular part of bfn @@ -130,36 +145,36 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = radial_eval*x*x*x*x; - ang_eval_1 = radial_eval*x*x*x*y; - ang_eval_2 = radial_eval*x*x*x*z; - ang_eval_3 = radial_eval*x*x*y*y; + ang_eval_0 = radial_eval*(x*x*x*x); + ang_eval_1 = x0*x1; + ang_eval_2 = x1*x2; + ang_eval_3 = radial_eval*x3*x4; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = radial_eval*x*x*y*z; - ang_eval_1 = radial_eval*x*x*z*z; - ang_eval_2 = radial_eval*x*y*y*y; - ang_eval_3 = radial_eval*x*y*y*z; + ang_eval_0 = 
x0*x3*z; + ang_eval_1 = radial_eval*x3*x5; + ang_eval_2 = x6*x7; + ang_eval_3 = x*x2*x4; basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; basis_eval[ipt + 6*npts] = ang_eval_2; basis_eval[ipt + 7*npts] = ang_eval_3; - ang_eval_0 = radial_eval*x*y*z*z; - ang_eval_1 = radial_eval*x*z*z*z; - ang_eval_2 = radial_eval*y*y*y*y; - ang_eval_3 = radial_eval*y*y*y*z; + ang_eval_0 = x*x0*x5; + ang_eval_1 = x6*x8; + ang_eval_2 = radial_eval*(y*y*y*y); + ang_eval_3 = x2*x7; basis_eval[ipt + 8*npts] = ang_eval_0; basis_eval[ipt + 9*npts] = ang_eval_1; basis_eval[ipt + 10*npts] = ang_eval_2; basis_eval[ipt + 11*npts] = ang_eval_3; - ang_eval_0 = radial_eval*y*y*z*z; - ang_eval_1 = radial_eval*y*z*z*z; - ang_eval_2 = radial_eval*z*z*z*z; + ang_eval_0 = radial_eval*x4*x5; + ang_eval_1 = x0*x8; + ang_eval_2 = radial_eval*(z*z*z*z); basis_eval[ipt + 12*npts] = ang_eval_0; basis_eval[ipt + 13*npts] = ang_eval_1; basis_eval[ipt + 14*npts] = ang_eval_2; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_gradient.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_gradient.hpp index 104fdba8..ea90a944 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_gradient.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_gradient.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_cartesian_gradient_4( +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_cartesian_gradient_4( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -67,7 +71,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -99,77 +102,125 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; - + // Common Subexpressions + const auto x0 = x*x*x*x; + const auto x1 = radial_eval*y; + const auto x2 = x*x*x; + const auto x3 = radial_eval*z; + const auto x4 = x*x; + const auto x5 = y*y; + const auto x6 = x4*x5; + const auto x7 = z*z; + const auto x8 = x4*x7; + const auto x9 = radial_eval*x; + const auto x10 = y*y*y; + const auto x11 = z*z*z; + const auto x12 = y*y*y*y; + const auto x13 = x5*x7; + const auto x14 = z*z*z*z; + const auto x15 = 4.0*radial_eval; + const auto x16 = 3.0*radial_eval; + const auto x17 = radial_eval_alpha*x0 + x16*x4; + const auto x18 = 2.0*x9; + const auto x19 = radial_eval_alpha*x2*x5; + const auto x20 = y*z; + const auto x21 = radial_eval_alpha*x2*x7; + const auto x22 = radial_eval*x10; + const auto x23 = radial_eval_alpha*x10*x4; + const auto x24 = radial_eval*x5; + const auto x25 = radial_eval_alpha*x6; + const auto x26 = radial_eval*x7; + const auto x27 = radial_eval_alpha*x8; + const auto x28 = radial_eval*x11; + const auto x29 = radial_eval_alpha*x11*x4; + const auto x30 = radial_eval_alpha*x; + const auto x31 = x10*x30*z; + const auto x32 = x11*x30*y; + const auto x33 = radial_eval_alpha*y; + const auto x34 = radial_eval*x2; + const auto x35 = radial_eval_alpha*x2*x20; + const auto x36 = 2.0*x1; + const auto x37 = radial_eval*x4; + const auto x38 = radial_eval_alpha*x12 + x16*x5; + const auto x39 = radial_eval_alpha*x13; + const auto x40 = radial_eval_alpha*x10*x7; + const auto x41 = radial_eval_alpha*x11*x5; + const auto x42 = radial_eval_alpha*z; + const auto x43 = 2.0*x3; + const auto x44 = radial_eval_alpha*x14 + x16*x7; + // Evaluate basis function - basis_eval[ipt + 0*npts] = radial_eval*x*x*x*x; - basis_eval[ipt + 1*npts] = radial_eval*x*x*x*y; - basis_eval[ipt + 2*npts] = radial_eval*x*x*x*z; - basis_eval[ipt + 3*npts] = radial_eval*x*x*y*y; - basis_eval[ipt + 4*npts] = radial_eval*x*x*y*z; - basis_eval[ipt + 5*npts] = radial_eval*x*x*z*z; - basis_eval[ipt + 6*npts] = radial_eval*x*y*y*y; - basis_eval[ipt + 7*npts] = radial_eval*x*y*y*z; - basis_eval[ipt + 8*npts] = radial_eval*x*y*z*z; - basis_eval[ipt + 9*npts] = radial_eval*x*z*z*z; - basis_eval[ipt + 10*npts] = radial_eval*y*y*y*y; - basis_eval[ipt + 11*npts] = radial_eval*y*y*y*z; - basis_eval[ipt + 12*npts] = radial_eval*y*y*z*z; - basis_eval[ipt + 13*npts] = radial_eval*y*z*z*z; - basis_eval[ipt + 14*npts] = radial_eval*z*z*z*z; + basis_eval[ipt + 0*npts] = 
radial_eval*x0; + basis_eval[ipt + 1*npts] = x1*x2; + basis_eval[ipt + 2*npts] = x2*x3; + basis_eval[ipt + 3*npts] = radial_eval*x6; + basis_eval[ipt + 4*npts] = x1*x4*z; + basis_eval[ipt + 5*npts] = radial_eval*x8; + basis_eval[ipt + 6*npts] = x10*x9; + basis_eval[ipt + 7*npts] = x*x3*x5; + basis_eval[ipt + 8*npts] = x*x1*x7; + basis_eval[ipt + 9*npts] = x11*x9; + basis_eval[ipt + 10*npts] = radial_eval*x12; + basis_eval[ipt + 11*npts] = x10*x3; + basis_eval[ipt + 12*npts] = radial_eval*x13; + basis_eval[ipt + 13*npts] = x1*x11; + basis_eval[ipt + 14*npts] = radial_eval*x14; // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = x*x*x*(4*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 1*npts] = x*x*y*(3*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 2*npts] = x*x*z*(3*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 3*npts] = x*y*y*(2*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 4*npts] = x*y*z*(2*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 5*npts] = x*z*z*(2*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 6*npts] = y*y*y*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 7*npts] = y*y*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 8*npts] = y*z*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 9*npts] = z*z*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 10*npts] = radial_eval_alpha*x*y*y*y*y; - basis_x_eval[ipt + 11*npts] = radial_eval_alpha*x*y*y*y*z; - basis_x_eval[ipt + 12*npts] = radial_eval_alpha*x*y*y*z*z; - basis_x_eval[ipt + 13*npts] = radial_eval_alpha*x*y*z*z*z; - basis_x_eval[ipt + 14*npts] = radial_eval_alpha*x*z*z*z*z; + basis_x_eval[ipt + 0*npts] = radial_eval_alpha*(x*x*x*x*x) + x15*x2; + basis_x_eval[ipt + 1*npts] = x17*y; + basis_x_eval[ipt + 2*npts] = x17*z; + basis_x_eval[ipt + 3*npts] = x18*x5 + x19; + basis_x_eval[ipt + 4*npts] = x20*(radial_eval_alpha*x2 + x18); + basis_x_eval[ipt + 5*npts] = x18*x7 + x21; + basis_x_eval[ipt + 6*npts] = x22 + x23; + basis_x_eval[ipt + 7*npts] = z*(x24 + x25); + basis_x_eval[ipt + 8*npts] = y*(x26 + x27); + basis_x_eval[ipt + 9*npts] = x28 + x29; + basis_x_eval[ipt + 10*npts] = x12*x30; + basis_x_eval[ipt + 11*npts] = x31; + basis_x_eval[ipt + 12*npts] = x13*x30; + basis_x_eval[ipt + 13*npts] = x32; + basis_x_eval[ipt + 14*npts] = x14*x30; // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = radial_eval_alpha*x*x*x*x*y; - basis_y_eval[ipt + 1*npts] = x*x*x*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 2*npts] = radial_eval_alpha*x*x*x*y*z; - basis_y_eval[ipt + 3*npts] = x*x*y*(2*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 4*npts] = x*x*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 5*npts] = radial_eval_alpha*x*x*y*z*z; - basis_y_eval[ipt + 6*npts] = x*y*y*(3*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 7*npts] = x*y*z*(2*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 8*npts] = x*z*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 9*npts] = radial_eval_alpha*x*y*z*z*z; - basis_y_eval[ipt + 10*npts] = y*y*y*(4*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 11*npts] = y*y*z*(3*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 12*npts] = y*z*z*(2*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 13*npts] = z*z*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 14*npts] = radial_eval_alpha*y*z*z*z*z; + basis_y_eval[ipt + 0*npts] = x0*x33; + basis_y_eval[ipt 
+ 1*npts] = x19 + x34; + basis_y_eval[ipt + 2*npts] = x35; + basis_y_eval[ipt + 3*npts] = x23 + x36*x4; + basis_y_eval[ipt + 4*npts] = z*(x25 + x37); + basis_y_eval[ipt + 5*npts] = x33*x8; + basis_y_eval[ipt + 6*npts] = x*x38; + basis_y_eval[ipt + 7*npts] = x*z*(radial_eval_alpha*x10 + x36); + basis_y_eval[ipt + 8*npts] = x*(x26 + x39); + basis_y_eval[ipt + 9*npts] = x32; + basis_y_eval[ipt + 10*npts] = radial_eval_alpha*(y*y*y*y*y) + x10*x15; + basis_y_eval[ipt + 11*npts] = x38*z; + basis_y_eval[ipt + 12*npts] = x36*x7 + x40; + basis_y_eval[ipt + 13*npts] = x28 + x41; + basis_y_eval[ipt + 14*npts] = x14*x33; // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = radial_eval_alpha*x*x*x*x*z; - basis_z_eval[ipt + 1*npts] = radial_eval_alpha*x*x*x*y*z; - basis_z_eval[ipt + 2*npts] = x*x*x*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 3*npts] = radial_eval_alpha*x*x*y*y*z; - basis_z_eval[ipt + 4*npts] = x*x*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 5*npts] = x*x*z*(2*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 6*npts] = radial_eval_alpha*x*y*y*y*z; - basis_z_eval[ipt + 7*npts] = x*y*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 8*npts] = x*y*z*(2*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 9*npts] = x*z*z*(3*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 10*npts] = radial_eval_alpha*y*y*y*y*z; - basis_z_eval[ipt + 11*npts] = y*y*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 12*npts] = y*y*z*(2*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 13*npts] = y*z*z*(3*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 14*npts] = z*z*z*(4*radial_eval + radial_eval_alpha*z*z); + basis_z_eval[ipt + 0*npts] = x0*x42; + basis_z_eval[ipt + 1*npts] = x35; + basis_z_eval[ipt + 2*npts] = x21 + x34; + basis_z_eval[ipt + 3*npts] = x42*x6; + basis_z_eval[ipt + 4*npts] = y*(x27 + x37); + basis_z_eval[ipt + 5*npts] = x29 + x4*x43; + basis_z_eval[ipt + 6*npts] = x31; + basis_z_eval[ipt + 7*npts] = x*(x24 + x39); + basis_z_eval[ipt + 8*npts] = x*y*(radial_eval_alpha*x11 + x43); + basis_z_eval[ipt + 9*npts] = x*x44; + basis_z_eval[ipt + 10*npts] = x12*x42; + basis_z_eval[ipt + 11*npts] = x22 + x40; + basis_z_eval[ipt + 12*npts] = x41 + x43*x5; + basis_z_eval[ipt + 13*npts] = x44*y; + basis_z_eval[ipt + 14*npts] = radial_eval_alpha*(z*z*z*z*z) + x11*x15; + + @@ -186,36 +237,36 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = radial_eval*x*x*x*x; - ang_eval_1 = radial_eval*x*x*x*y; - ang_eval_2 = radial_eval*x*x*x*z; - ang_eval_3 = radial_eval*x*x*y*y; + ang_eval_0 = radial_eval*x0; + ang_eval_1 = x1*x2; + ang_eval_2 = x2*x3; + ang_eval_3 = radial_eval*x6; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = radial_eval*x*x*y*z; - ang_eval_1 = radial_eval*x*x*z*z; - ang_eval_2 = radial_eval*x*y*y*y; - ang_eval_3 = radial_eval*x*y*y*z; + ang_eval_0 = x1*x4*z; + ang_eval_1 = radial_eval*x8; + ang_eval_2 = x10*x9; + ang_eval_3 = x*x3*x5; basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; basis_eval[ipt + 6*npts] = ang_eval_2; basis_eval[ipt + 7*npts] = ang_eval_3; - ang_eval_0 = radial_eval*x*y*z*z; - ang_eval_1 = radial_eval*x*z*z*z; - ang_eval_2 = radial_eval*y*y*y*y; - ang_eval_3 = radial_eval*y*y*y*z; + ang_eval_0 = x*x1*x7; + ang_eval_1 = x11*x9; + ang_eval_2 = 
radial_eval*x12; + ang_eval_3 = x10*x3; basis_eval[ipt + 8*npts] = ang_eval_0; basis_eval[ipt + 9*npts] = ang_eval_1; basis_eval[ipt + 10*npts] = ang_eval_2; basis_eval[ipt + 11*npts] = ang_eval_3; - ang_eval_0 = radial_eval*y*y*z*z; - ang_eval_1 = radial_eval*y*z*z*z; - ang_eval_2 = radial_eval*z*z*z*z; + ang_eval_0 = radial_eval*x13; + ang_eval_1 = x1*x11; + ang_eval_2 = radial_eval*x14; basis_eval[ipt + 12*npts] = ang_eval_0; basis_eval[ipt + 13*npts] = ang_eval_1; basis_eval[ipt + 14*npts] = ang_eval_2; @@ -226,18 +277,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; - dang_eval_x_0 = x*x*x*(4*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = radial_eval_alpha*x*x*x*x*y; - dang_eval_z_0 = radial_eval_alpha*x*x*x*x*z; - dang_eval_x_1 = x*x*y*(3*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = x*x*x*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_1 = radial_eval_alpha*x*x*x*y*z; - dang_eval_x_2 = x*x*z*(3*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_2 = radial_eval_alpha*x*x*x*y*z; - dang_eval_z_2 = x*x*x*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_3 = x*y*y*(2*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_3 = x*x*y*(2*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_3 = radial_eval_alpha*x*x*y*y*z; + dang_eval_x_0 = radial_eval_alpha*(x*x*x*x*x) + x15*x2; + dang_eval_y_0 = x0*x33; + dang_eval_z_0 = x0*x42; + dang_eval_x_1 = x17*y; + dang_eval_y_1 = x19 + x34; + dang_eval_z_1 = x35; + dang_eval_x_2 = x17*z; + dang_eval_y_2 = x35; + dang_eval_z_2 = x21 + x34; + dang_eval_x_3 = x18*x5 + x19; + dang_eval_y_3 = x23 + x36*x4; + dang_eval_z_3 = x42*x6; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; @@ -251,18 +302,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = x*y*z*(2*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = x*x*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = x*x*y*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_1 = x*z*z*(2*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = radial_eval_alpha*x*x*y*z*z; - dang_eval_z_1 = x*x*z*(2*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = y*y*y*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_2 = x*y*y*(3*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_2 = radial_eval_alpha*x*y*y*y*z; - dang_eval_x_3 = y*y*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_3 = x*y*z*(2*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_3 = x*y*y*(radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x20*(radial_eval_alpha*x2 + x18); + dang_eval_y_0 = z*(x25 + x37); + dang_eval_z_0 = y*(x27 + x37); + dang_eval_x_1 = x18*x7 + x21; + dang_eval_y_1 = x33*x8; + dang_eval_z_1 = x29 + x4*x43; + dang_eval_x_2 = x22 + x23; + dang_eval_y_2 = x*x38; + dang_eval_z_2 = x31; + dang_eval_x_3 = z*(x24 + x25); + dang_eval_y_3 = x*z*(radial_eval_alpha*x10 + x36); + dang_eval_z_3 = x*(x24 + x39); basis_x_eval[ipt + 4*npts] = dang_eval_x_0; basis_y_eval[ipt + 4*npts] = dang_eval_y_0; basis_z_eval[ipt + 4*npts] = dang_eval_z_0; @@ -276,18 +327,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 7*npts] = dang_eval_y_3; basis_z_eval[ipt + 7*npts] = dang_eval_z_3; - 
dang_eval_x_0 = y*z*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = x*z*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = x*y*z*(2*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_1 = z*z*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = radial_eval_alpha*x*y*z*z*z; - dang_eval_z_1 = x*z*z*(3*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = radial_eval_alpha*x*y*y*y*y; - dang_eval_y_2 = y*y*y*(4*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_2 = radial_eval_alpha*y*y*y*y*z; - dang_eval_x_3 = radial_eval_alpha*x*y*y*y*z; - dang_eval_y_3 = y*y*z*(3*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_3 = y*y*y*(radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = y*(x26 + x27); + dang_eval_y_0 = x*(x26 + x39); + dang_eval_z_0 = x*y*(radial_eval_alpha*x11 + x43); + dang_eval_x_1 = x28 + x29; + dang_eval_y_1 = x32; + dang_eval_z_1 = x*x44; + dang_eval_x_2 = x12*x30; + dang_eval_y_2 = radial_eval_alpha*(y*y*y*y*y) + x10*x15; + dang_eval_z_2 = x12*x42; + dang_eval_x_3 = x31; + dang_eval_y_3 = x38*z; + dang_eval_z_3 = x22 + x40; basis_x_eval[ipt + 8*npts] = dang_eval_x_0; basis_y_eval[ipt + 8*npts] = dang_eval_y_0; basis_z_eval[ipt + 8*npts] = dang_eval_z_0; @@ -301,15 +352,15 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 11*npts] = dang_eval_y_3; basis_z_eval[ipt + 11*npts] = dang_eval_z_3; - dang_eval_x_0 = radial_eval_alpha*x*y*y*z*z; - dang_eval_y_0 = y*z*z*(2*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = y*y*z*(2*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_1 = radial_eval_alpha*x*y*z*z*z; - dang_eval_y_1 = z*z*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_1 = y*z*z*(3*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = radial_eval_alpha*x*z*z*z*z; - dang_eval_y_2 = radial_eval_alpha*y*z*z*z*z; - dang_eval_z_2 = z*z*z*(4*radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x13*x30; + dang_eval_y_0 = x36*x7 + x40; + dang_eval_z_0 = x41 + x43*x5; + dang_eval_x_1 = x32; + dang_eval_y_1 = x28 + x41; + dang_eval_z_1 = x44*y; + dang_eval_x_2 = x14*x30; + dang_eval_y_2 = x14*x33; + dang_eval_z_2 = radial_eval_alpha*(z*z*z*z*z) + x11*x15; basis_x_eval[ipt + 12*npts] = dang_eval_x_0; basis_y_eval[ipt + 12*npts] = dang_eval_y_0; basis_z_eval[ipt + 12*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_hessian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_hessian.hpp index 1a9958bd..99c58bbc 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_hessian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_hessian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_cartesian_hessian_4( +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_cartesian_hessian_4( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; @@ -108,179 +111,340 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = x*x*x*x; + const auto x1 = radial_eval*y; + const auto x2 = x*x*x; + const auto x3 = radial_eval*z; + const auto x4 = x*x; + const auto x5 = y*y; + const auto x6 = x4*x5; + const auto x7 = x1*z; + const auto x8 = z*z; + const auto x9 = x4*x8; + const auto x10 = radial_eval*x; + const auto x11 = y*y*y; + const auto x12 = x*x3; + const auto x13 = x*x1; + const auto x14 = z*z*z; + const auto x15 = y*y*y*y; + const auto x16 = x5*x8; + const auto x17 = z*z*z*z; + const auto x18 = x*x*x*x*x; + const auto x19 = 4.0*radial_eval; + const auto x20 = 3.0*radial_eval; + const auto x21 = radial_eval_alpha*x0 + x20*x4; + const auto x22 = 2.0*x10; + const auto x23 = x2*x5; + const auto x24 = radial_eval_alpha*x23; + const auto x25 = y*z; + const auto x26 = radial_eval_alpha*x2; + const auto x27 = x22 + x26; + const auto x28 = x2*x8; + const auto x29 = radial_eval_alpha*x28; + const auto x30 = radial_eval*x11; + const auto x31 = x11*x4; + const auto x32 = radial_eval_alpha*x31; + const auto x33 = radial_eval*x5; + const auto x34 = radial_eval_alpha*x6; + const auto x35 = x33 + x34; + const auto x36 = radial_eval*x8; + const auto x37 = radial_eval_alpha*x9; + const auto x38 = x36 + x37; + const auto x39 = radial_eval*x14; + const auto x40 = x14*x4; + const auto x41 = radial_eval_alpha*x40; + const auto x42 = radial_eval_alpha*x; + const auto x43 = x11*x42*z; + const auto x44 = x14*x42*y; + const auto x45 = radial_eval_alpha*y; + const auto x46 = radial_eval*x2; + const auto x47 = radial_eval_alpha*x2*x25; + const auto x48 = 2.0*x1; + const auto x49 = radial_eval*x4; + const auto x50 = x34 + x49; + const auto x51 = radial_eval_alpha*x15 + x20*x5; + const auto x52 = x*z; + const auto x53 = radial_eval_alpha*x11; + const auto x54 = x48 + x53; + const auto x55 = radial_eval_alpha*x16; + const auto x56 = y*y*y*y*y; + const auto x57 = x11*x8; + const auto x58 = radial_eval_alpha*x57; + const auto x59 = x14*x5; + const auto x60 = radial_eval_alpha*x59; + const auto x61 = radial_eval_alpha*z; + const auto x62 = 2.0*x3; + const auto x63 = x*y; + const auto x64 = radial_eval_alpha*x14; + const auto x65 = x62 + x64; + const auto x66 = radial_eval_alpha*x17 + x20*x8; + const auto x67 = 
z*z*z*z*z; + const auto x68 = 12.0*radial_eval; + const auto x69 = 8.0*radial_eval_alpha; + const auto x70 = radial_eval_alpha + radial_eval_alpha_squared*x4; + const auto x71 = x0*x69 + x0*x70 + x4*x68; + const auto x72 = 6.0*radial_eval_alpha; + const auto x73 = 6.0*x10 + x2*x70; + const auto x74 = x2*x72 + x73; + const auto x75 = 4.0*radial_eval_alpha; + const auto x76 = x6*x75; + const auto x77 = 2.0*radial_eval; + const auto x78 = x5*x77; + const auto x79 = x4*x5*x70 + x78; + const auto x80 = x4*x70 + x77; + const auto x81 = x75*x9; + const auto x82 = x77*x8; + const auto x83 = x4*x70*x8 + x82; + const auto x84 = 2.0*radial_eval_alpha; + const auto x85 = x11*x84; + const auto x86 = x11*x70; + const auto x87 = x5*x84; + const auto x88 = x5*x70; + const auto x89 = x8*x84; + const auto x90 = x70*x8; + const auto x91 = x14*x84; + const auto x92 = x14*x70; + const auto x93 = x15*x70; + const auto x94 = x5*x70*x8; + const auto x95 = x17*x70; + const auto x96 = radial_eval_alpha_squared*x18 + x2*x75; + const auto x97 = 3.0*radial_eval_alpha; + const auto x98 = x6*x97; + const auto x99 = x25*(radial_eval_alpha_squared*x0 + x4*x97); + const auto x100 = 2.0*x42; + const auto x101 = 2.0*x45; + const auto x102 = radial_eval_alpha_squared*x23; + const auto x103 = x100*x5 + x102; + const auto x104 = radial_eval_alpha_squared*x28; + const auto x105 = x100*x8 + x104; + const auto x106 = radial_eval_alpha_squared*x31; + const auto x107 = x101*x4 + x106; + const auto x108 = radial_eval_alpha_squared*x4*x5*x8; + const auto x109 = x108 + x55; + const auto x110 = radial_eval_alpha_squared*x40; + const auto x111 = radial_eval_alpha_squared*x56 + x11*x75; + const auto x112 = x52*(radial_eval_alpha_squared*x15 + x5*x97); + const auto x113 = radial_eval_alpha_squared*x57; + const auto x114 = x101*x8 + x113; + const auto x115 = radial_eval_alpha_squared*x59; + const auto x116 = x9*x97; + const auto x117 = 2.0*x61; + const auto x118 = x110 + x117*x4; + const auto x119 = x115 + x117*x5; + const auto x120 = x63*(radial_eval_alpha_squared*x17 + x8*x97); + const auto x121 = radial_eval_alpha_squared*x67 + x14*x75; + const auto x122 = radial_eval_alpha + radial_eval_alpha_squared*x5; + const auto x123 = x0*x122; + const auto x124 = x2*x84; + const auto x125 = x122*x2; + const auto x126 = x4*x77; + const auto x127 = x122*x4*x5 + x126; + const auto x128 = x4*x84; + const auto x129 = x122*x4; + const auto x130 = x122*x4*x8; + const auto x131 = 6.0*x1 + x11*x122; + const auto x132 = x11*x72 + x131; + const auto x133 = x122*x5 + x77; + const auto x134 = x122*x8; + const auto x135 = x122*x14; + const auto x136 = x122*x15 + x15*x69 + x5*x68; + const auto x137 = x16*x75; + const auto x138 = x122*x5*x8 + x82; + const auto x139 = x122*x17; + const auto x140 = x16*x97; + const auto x141 = radial_eval_alpha + radial_eval_alpha_squared*x8; + const auto x142 = x0*x141; + const auto x143 = x141*x2; + const auto x144 = x141*x4*x5; + const auto x145 = x141*x4; + const auto x146 = x126 + x141*x4*x8; + const auto x147 = x11*x141; + const auto x148 = x141*x5; + const auto x149 = x141*x8 + x77; + const auto x150 = x14*x141 + 6.0*x3; + const auto x151 = x14*x72 + x150; + const auto x152 = x141*x15; + const auto x153 = x141*x5*x8 + x78; + const auto x154 = x141*x17 + x17*x69 + x68*x8; + const auto x155 = x125 + x143 + x2*x69 + x73; + const auto x156 = x11*x69 + x131 + x147 + x86; + const auto x157 = x135 + x14*x69 + x150 + x92; + // Evaluate basis function - basis_eval[ipt + 0*npts] = radial_eval*x*x*x*x; - basis_eval[ipt + 1*npts] = 
radial_eval*x*x*x*y; - basis_eval[ipt + 2*npts] = radial_eval*x*x*x*z; - basis_eval[ipt + 3*npts] = radial_eval*x*x*y*y; - basis_eval[ipt + 4*npts] = radial_eval*x*x*y*z; - basis_eval[ipt + 5*npts] = radial_eval*x*x*z*z; - basis_eval[ipt + 6*npts] = radial_eval*x*y*y*y; - basis_eval[ipt + 7*npts] = radial_eval*x*y*y*z; - basis_eval[ipt + 8*npts] = radial_eval*x*y*z*z; - basis_eval[ipt + 9*npts] = radial_eval*x*z*z*z; - basis_eval[ipt + 10*npts] = radial_eval*y*y*y*y; - basis_eval[ipt + 11*npts] = radial_eval*y*y*y*z; - basis_eval[ipt + 12*npts] = radial_eval*y*y*z*z; - basis_eval[ipt + 13*npts] = radial_eval*y*z*z*z; - basis_eval[ipt + 14*npts] = radial_eval*z*z*z*z; + basis_eval[ipt + 0*npts] = radial_eval*x0; + basis_eval[ipt + 1*npts] = x1*x2; + basis_eval[ipt + 2*npts] = x2*x3; + basis_eval[ipt + 3*npts] = radial_eval*x6; + basis_eval[ipt + 4*npts] = x4*x7; + basis_eval[ipt + 5*npts] = radial_eval*x9; + basis_eval[ipt + 6*npts] = x10*x11; + basis_eval[ipt + 7*npts] = x12*x5; + basis_eval[ipt + 8*npts] = x13*x8; + basis_eval[ipt + 9*npts] = x10*x14; + basis_eval[ipt + 10*npts] = radial_eval*x15; + basis_eval[ipt + 11*npts] = x11*x3; + basis_eval[ipt + 12*npts] = radial_eval*x16; + basis_eval[ipt + 13*npts] = x1*x14; + basis_eval[ipt + 14*npts] = radial_eval*x17; // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = x*x*x*(4*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 1*npts] = x*x*y*(3*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 2*npts] = x*x*z*(3*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 3*npts] = x*y*y*(2*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 4*npts] = x*y*z*(2*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 5*npts] = x*z*z*(2*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 6*npts] = y*y*y*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 7*npts] = y*y*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 8*npts] = y*z*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 9*npts] = z*z*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 10*npts] = radial_eval_alpha*x*y*y*y*y; - basis_x_eval[ipt + 11*npts] = radial_eval_alpha*x*y*y*y*z; - basis_x_eval[ipt + 12*npts] = radial_eval_alpha*x*y*y*z*z; - basis_x_eval[ipt + 13*npts] = radial_eval_alpha*x*y*z*z*z; - basis_x_eval[ipt + 14*npts] = radial_eval_alpha*x*z*z*z*z; + basis_x_eval[ipt + 0*npts] = radial_eval_alpha*x18 + x19*x2; + basis_x_eval[ipt + 1*npts] = x21*y; + basis_x_eval[ipt + 2*npts] = x21*z; + basis_x_eval[ipt + 3*npts] = x22*x5 + x24; + basis_x_eval[ipt + 4*npts] = x25*x27; + basis_x_eval[ipt + 5*npts] = x22*x8 + x29; + basis_x_eval[ipt + 6*npts] = x30 + x32; + basis_x_eval[ipt + 7*npts] = x35*z; + basis_x_eval[ipt + 8*npts] = x38*y; + basis_x_eval[ipt + 9*npts] = x39 + x41; + basis_x_eval[ipt + 10*npts] = x15*x42; + basis_x_eval[ipt + 11*npts] = x43; + basis_x_eval[ipt + 12*npts] = x16*x42; + basis_x_eval[ipt + 13*npts] = x44; + basis_x_eval[ipt + 14*npts] = x17*x42; // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = radial_eval_alpha*x*x*x*x*y; - basis_y_eval[ipt + 1*npts] = x*x*x*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 2*npts] = radial_eval_alpha*x*x*x*y*z; - basis_y_eval[ipt + 3*npts] = x*x*y*(2*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 4*npts] = x*x*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 5*npts] = radial_eval_alpha*x*x*y*z*z; - basis_y_eval[ipt + 6*npts] = x*y*y*(3*radial_eval + 
radial_eval_alpha*y*y); - basis_y_eval[ipt + 7*npts] = x*y*z*(2*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 8*npts] = x*z*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 9*npts] = radial_eval_alpha*x*y*z*z*z; - basis_y_eval[ipt + 10*npts] = y*y*y*(4*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 11*npts] = y*y*z*(3*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 12*npts] = y*z*z*(2*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 13*npts] = z*z*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 14*npts] = radial_eval_alpha*y*z*z*z*z; + basis_y_eval[ipt + 0*npts] = x0*x45; + basis_y_eval[ipt + 1*npts] = x24 + x46; + basis_y_eval[ipt + 2*npts] = x47; + basis_y_eval[ipt + 3*npts] = x32 + x4*x48; + basis_y_eval[ipt + 4*npts] = x50*z; + basis_y_eval[ipt + 5*npts] = x45*x9; + basis_y_eval[ipt + 6*npts] = x*x51; + basis_y_eval[ipt + 7*npts] = x52*x54; + basis_y_eval[ipt + 8*npts] = x*(x36 + x55); + basis_y_eval[ipt + 9*npts] = x44; + basis_y_eval[ipt + 10*npts] = radial_eval_alpha*x56 + x11*x19; + basis_y_eval[ipt + 11*npts] = x51*z; + basis_y_eval[ipt + 12*npts] = x48*x8 + x58; + basis_y_eval[ipt + 13*npts] = x39 + x60; + basis_y_eval[ipt + 14*npts] = x17*x45; // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = radial_eval_alpha*x*x*x*x*z; - basis_z_eval[ipt + 1*npts] = radial_eval_alpha*x*x*x*y*z; - basis_z_eval[ipt + 2*npts] = x*x*x*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 3*npts] = radial_eval_alpha*x*x*y*y*z; - basis_z_eval[ipt + 4*npts] = x*x*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 5*npts] = x*x*z*(2*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 6*npts] = radial_eval_alpha*x*y*y*y*z; - basis_z_eval[ipt + 7*npts] = x*y*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 8*npts] = x*y*z*(2*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 9*npts] = x*z*z*(3*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 10*npts] = radial_eval_alpha*y*y*y*y*z; - basis_z_eval[ipt + 11*npts] = y*y*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 12*npts] = y*y*z*(2*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 13*npts] = y*z*z*(3*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 14*npts] = z*z*z*(4*radial_eval + radial_eval_alpha*z*z); + basis_z_eval[ipt + 0*npts] = x0*x61; + basis_z_eval[ipt + 1*npts] = x47; + basis_z_eval[ipt + 2*npts] = x29 + x46; + basis_z_eval[ipt + 3*npts] = x6*x61; + basis_z_eval[ipt + 4*npts] = y*(x37 + x49); + basis_z_eval[ipt + 5*npts] = x4*x62 + x41; + basis_z_eval[ipt + 6*npts] = x43; + basis_z_eval[ipt + 7*npts] = x*(x33 + x55); + basis_z_eval[ipt + 8*npts] = x63*x65; + basis_z_eval[ipt + 9*npts] = x*x66; + basis_z_eval[ipt + 10*npts] = x15*x61; + basis_z_eval[ipt + 11*npts] = x30 + x58; + basis_z_eval[ipt + 12*npts] = x5*x62 + x60; + basis_z_eval[ipt + 13*npts] = x66*y; + basis_z_eval[ipt + 14*npts] = radial_eval_alpha*x67 + x14*x19; // Evaluate second derivative of bfn wrt xx - basis_xx_eval[ipt + 0*npts] = x*x*(12*radial_eval + 9*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x); - basis_xx_eval[ipt + 1*npts] = x*y*(6*radial_eval + 7*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x); - basis_xx_eval[ipt + 2*npts] = x*z*(6*radial_eval + 7*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x); - basis_xx_eval[ipt + 3*npts] = y*y*(2*radial_eval + 5*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x); - basis_xx_eval[ipt + 4*npts] = 
y*z*(2*radial_eval + 5*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x); - basis_xx_eval[ipt + 5*npts] = z*z*(2*radial_eval + 5*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x); - basis_xx_eval[ipt + 6*npts] = x*y*y*y*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 7*npts] = x*y*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 8*npts] = x*y*z*z*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 9*npts] = x*z*z*z*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 10*npts] = y*y*y*y*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 11*npts] = y*y*y*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 12*npts] = y*y*z*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 13*npts] = y*z*z*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 14*npts] = z*z*z*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xx_eval[ipt + 0*npts] = x71; + basis_xx_eval[ipt + 1*npts] = x74*y; + basis_xx_eval[ipt + 2*npts] = x74*z; + basis_xx_eval[ipt + 3*npts] = x76 + x79; + basis_xx_eval[ipt + 4*npts] = x25*(x4*x75 + x80); + basis_xx_eval[ipt + 5*npts] = x81 + x83; + basis_xx_eval[ipt + 6*npts] = x*(x85 + x86); + basis_xx_eval[ipt + 7*npts] = x52*(x87 + x88); + basis_xx_eval[ipt + 8*npts] = x63*(x89 + x90); + basis_xx_eval[ipt + 9*npts] = x*(x91 + x92); + basis_xx_eval[ipt + 10*npts] = x93; + basis_xx_eval[ipt + 11*npts] = x86*z; + basis_xx_eval[ipt + 12*npts] = x94; + basis_xx_eval[ipt + 13*npts] = x92*y; + basis_xx_eval[ipt + 14*npts] = x95; // Evaluate second derivative of bfn wrt xy - basis_xy_eval[ipt + 0*npts] = x*x*x*y*(4*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xy_eval[ipt + 1*npts] = x*x*(3*radial_eval + radial_eval_alpha*x*x + 3*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y); - basis_xy_eval[ipt + 2*npts] = x*x*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xy_eval[ipt + 3*npts] = x*y*(4*radial_eval + 2*radial_eval_alpha*x*x + 2*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y); - basis_xy_eval[ipt + 4*npts] = x*z*(2*radial_eval + radial_eval_alpha*x*x + 2*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y); - basis_xy_eval[ipt + 5*npts] = x*y*z*z*(2*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xy_eval[ipt + 6*npts] = y*y*(3*radial_eval + 3*radial_eval_alpha*x*x + radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y); - basis_xy_eval[ipt + 7*npts] = y*z*(2*radial_eval + 2*radial_eval_alpha*x*x + radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y); - basis_xy_eval[ipt + 8*npts] = z*z*(radial_eval + radial_eval_alpha*x*x + radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y); - basis_xy_eval[ipt + 9*npts] = y*z*z*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xy_eval[ipt + 10*npts] = x*y*y*y*(4*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_xy_eval[ipt + 11*npts] = x*y*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_xy_eval[ipt + 12*npts] = x*y*z*z*(2*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_xy_eval[ipt + 13*npts] = x*z*z*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_xy_eval[ipt + 14*npts] = radial_eval_alpha_squared*x*y*z*z*z*z; + basis_xy_eval[ipt + 0*npts] = x96*y; + basis_xy_eval[ipt + 1*npts] = radial_eval_alpha_squared*x0*x5 + x21 + x98; + basis_xy_eval[ipt + 2*npts] = x99; + 
basis_xy_eval[ipt + 3*npts] = radial_eval_alpha_squared*x11*x2 + x100*x11 + x101*x2 + 4.0*x13; + basis_xy_eval[ipt + 4*npts] = z*(x103 + x27); + basis_xy_eval[ipt + 5*npts] = x105*y; + basis_xy_eval[ipt + 6*npts] = radial_eval_alpha_squared*x15*x4 + x51 + x98; + basis_xy_eval[ipt + 7*npts] = z*(x107 + x54); + basis_xy_eval[ipt + 8*npts] = x109 + x38; + basis_xy_eval[ipt + 9*npts] = y*(x110 + x64); + basis_xy_eval[ipt + 10*npts] = x*x111; + basis_xy_eval[ipt + 11*npts] = x112; + basis_xy_eval[ipt + 12*npts] = x*x114; + basis_xy_eval[ipt + 13*npts] = x*(x115 + x64); + basis_xy_eval[ipt + 14*npts] = radial_eval_alpha_squared*x17*x63; // Evaluate second derivative of bfn wrt xz - basis_xz_eval[ipt + 0*npts] = x*x*x*z*(4*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xz_eval[ipt + 1*npts] = x*x*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xz_eval[ipt + 2*npts] = x*x*(3*radial_eval + radial_eval_alpha*x*x + 3*radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z); - basis_xz_eval[ipt + 3*npts] = x*y*y*z*(2*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xz_eval[ipt + 4*npts] = x*y*(2*radial_eval + radial_eval_alpha*x*x + 2*radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z); - basis_xz_eval[ipt + 5*npts] = x*z*(4*radial_eval + 2*radial_eval_alpha*x*x + 2*radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z); - basis_xz_eval[ipt + 6*npts] = y*y*y*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xz_eval[ipt + 7*npts] = y*y*(radial_eval + radial_eval_alpha*x*x + radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z); - basis_xz_eval[ipt + 8*npts] = y*z*(2*radial_eval + 2*radial_eval_alpha*x*x + radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z); - basis_xz_eval[ipt + 9*npts] = z*z*(3*radial_eval + 3*radial_eval_alpha*x*x + radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z); - basis_xz_eval[ipt + 10*npts] = radial_eval_alpha_squared*x*y*y*y*y*z; - basis_xz_eval[ipt + 11*npts] = x*y*y*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_xz_eval[ipt + 12*npts] = x*y*y*z*(2*radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_xz_eval[ipt + 13*npts] = x*y*z*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_xz_eval[ipt + 14*npts] = x*z*z*z*(4*radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_xz_eval[ipt + 0*npts] = x96*z; + basis_xz_eval[ipt + 1*npts] = x99; + basis_xz_eval[ipt + 2*npts] = radial_eval_alpha_squared*x0*x8 + x116 + x21; + basis_xz_eval[ipt + 3*npts] = x103*z; + basis_xz_eval[ipt + 4*npts] = y*(x105 + x27); + basis_xz_eval[ipt + 5*npts] = radial_eval_alpha_squared*x14*x2 + x100*x14 + x117*x2 + 4.0*x12; + basis_xz_eval[ipt + 6*npts] = z*(x106 + x53); + basis_xz_eval[ipt + 7*npts] = x109 + x35; + basis_xz_eval[ipt + 8*npts] = y*(x118 + x65); + basis_xz_eval[ipt + 9*npts] = radial_eval_alpha_squared*x17*x4 + x116 + x66; + basis_xz_eval[ipt + 10*npts] = radial_eval_alpha_squared*x15*x52; + basis_xz_eval[ipt + 11*npts] = x*(x113 + x53); + basis_xz_eval[ipt + 12*npts] = x*x119; + basis_xz_eval[ipt + 13*npts] = x120; + basis_xz_eval[ipt + 14*npts] = x*x121; // Evaluate second derivative of bfn wrt yy - basis_yy_eval[ipt + 0*npts] = x*x*x*x*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 1*npts] = x*x*x*y*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 2*npts] = x*x*x*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 3*npts] = x*x*(2*radial_eval + 5*radial_eval_alpha*y*y + 
radial_eval_alpha_squared*y*y*y*y); - basis_yy_eval[ipt + 4*npts] = x*x*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 5*npts] = x*x*z*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 6*npts] = x*y*(6*radial_eval + 7*radial_eval_alpha*y*y + radial_eval_alpha_squared*y*y*y*y); - basis_yy_eval[ipt + 7*npts] = x*z*(2*radial_eval + 5*radial_eval_alpha*y*y + radial_eval_alpha_squared*y*y*y*y); - basis_yy_eval[ipt + 8*npts] = x*y*z*z*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 9*npts] = x*z*z*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 10*npts] = y*y*(12*radial_eval + 9*radial_eval_alpha*y*y + radial_eval_alpha_squared*y*y*y*y); - basis_yy_eval[ipt + 11*npts] = y*z*(6*radial_eval + 7*radial_eval_alpha*y*y + radial_eval_alpha_squared*y*y*y*y); - basis_yy_eval[ipt + 12*npts] = z*z*(2*radial_eval + 5*radial_eval_alpha*y*y + radial_eval_alpha_squared*y*y*y*y); - basis_yy_eval[ipt + 13*npts] = y*z*z*z*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 14*npts] = z*z*z*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_yy_eval[ipt + 0*npts] = x123; + basis_yy_eval[ipt + 1*npts] = y*(x124 + x125); + basis_yy_eval[ipt + 2*npts] = x125*z; + basis_yy_eval[ipt + 3*npts] = x127 + x76; + basis_yy_eval[ipt + 4*npts] = x25*(x128 + x129); + basis_yy_eval[ipt + 5*npts] = x130; + basis_yy_eval[ipt + 6*npts] = x*x132; + basis_yy_eval[ipt + 7*npts] = x52*(x133 + x5*x75); + basis_yy_eval[ipt + 8*npts] = x63*(x134 + x89); + basis_yy_eval[ipt + 9*npts] = x*x135; + basis_yy_eval[ipt + 10*npts] = x136; + basis_yy_eval[ipt + 11*npts] = x132*z; + basis_yy_eval[ipt + 12*npts] = x137 + x138; + basis_yy_eval[ipt + 13*npts] = y*(x135 + x91); + basis_yy_eval[ipt + 14*npts] = x139; // Evaluate second derivative of bfn wrt yz - basis_yz_eval[ipt + 0*npts] = radial_eval_alpha_squared*x*x*x*x*y*z; - basis_yz_eval[ipt + 1*npts] = x*x*x*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yz_eval[ipt + 2*npts] = x*x*x*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_yz_eval[ipt + 3*npts] = x*x*y*z*(2*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yz_eval[ipt + 4*npts] = x*x*(radial_eval + radial_eval_alpha*y*y + radial_eval_alpha*z*z + radial_eval_alpha_squared*y*y*z*z); - basis_yz_eval[ipt + 5*npts] = x*x*y*z*(2*radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_yz_eval[ipt + 6*npts] = x*y*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yz_eval[ipt + 7*npts] = x*y*(2*radial_eval + radial_eval_alpha*y*y + 2*radial_eval_alpha*z*z + radial_eval_alpha_squared*y*y*z*z); - basis_yz_eval[ipt + 8*npts] = x*z*(2*radial_eval + 2*radial_eval_alpha*y*y + radial_eval_alpha*z*z + radial_eval_alpha_squared*y*y*z*z); - basis_yz_eval[ipt + 9*npts] = x*y*z*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_yz_eval[ipt + 10*npts] = y*y*y*z*(4*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yz_eval[ipt + 11*npts] = y*y*(3*radial_eval + radial_eval_alpha*y*y + 3*radial_eval_alpha*z*z + radial_eval_alpha_squared*y*y*z*z); - basis_yz_eval[ipt + 12*npts] = y*z*(4*radial_eval + 2*radial_eval_alpha*y*y + 2*radial_eval_alpha*z*z + radial_eval_alpha_squared*y*y*z*z); - basis_yz_eval[ipt + 13*npts] = z*z*(3*radial_eval + 3*radial_eval_alpha*y*y + radial_eval_alpha*z*z + radial_eval_alpha_squared*y*y*z*z); - basis_yz_eval[ipt + 14*npts] = y*z*z*z*(4*radial_eval_alpha + radial_eval_alpha_squared*z*z); + 
basis_yz_eval[ipt + 0*npts] = radial_eval_alpha_squared*x0*x25; + basis_yz_eval[ipt + 1*npts] = z*(x102 + x26); + basis_yz_eval[ipt + 2*npts] = y*(x104 + x26); + basis_yz_eval[ipt + 3*npts] = x107*z; + basis_yz_eval[ipt + 4*npts] = x108 + x37 + x50; + basis_yz_eval[ipt + 5*npts] = x118*y; + basis_yz_eval[ipt + 6*npts] = x112; + basis_yz_eval[ipt + 7*npts] = x*(x114 + x54); + basis_yz_eval[ipt + 8*npts] = x*(x119 + x65); + basis_yz_eval[ipt + 9*npts] = x120; + basis_yz_eval[ipt + 10*npts] = x111*z; + basis_yz_eval[ipt + 11*npts] = radial_eval_alpha_squared*x15*x8 + x140 + x51; + basis_yz_eval[ipt + 12*npts] = radial_eval_alpha_squared*x11*x14 + x101*x14 + x11*x117 + 4.0*x7; + basis_yz_eval[ipt + 13*npts] = radial_eval_alpha_squared*x17*x5 + x140 + x66; + basis_yz_eval[ipt + 14*npts] = x121*y; // Evaluate second derivative of bfn wrt zz - basis_zz_eval[ipt + 0*npts] = x*x*x*x*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 1*npts] = x*x*x*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 2*npts] = x*x*x*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 3*npts] = x*x*y*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 4*npts] = x*x*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 5*npts] = x*x*(2*radial_eval + 5*radial_eval_alpha*z*z + radial_eval_alpha_squared*z*z*z*z); - basis_zz_eval[ipt + 6*npts] = x*y*y*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 7*npts] = x*y*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 8*npts] = x*y*(2*radial_eval + 5*radial_eval_alpha*z*z + radial_eval_alpha_squared*z*z*z*z); - basis_zz_eval[ipt + 9*npts] = x*z*(6*radial_eval + 7*radial_eval_alpha*z*z + radial_eval_alpha_squared*z*z*z*z); - basis_zz_eval[ipt + 10*npts] = y*y*y*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 11*npts] = y*y*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 12*npts] = y*y*(2*radial_eval + 5*radial_eval_alpha*z*z + radial_eval_alpha_squared*z*z*z*z); - basis_zz_eval[ipt + 13*npts] = y*z*(6*radial_eval + 7*radial_eval_alpha*z*z + radial_eval_alpha_squared*z*z*z*z); - basis_zz_eval[ipt + 14*npts] = z*z*(12*radial_eval + 9*radial_eval_alpha*z*z + radial_eval_alpha_squared*z*z*z*z); + basis_zz_eval[ipt + 0*npts] = x142; + basis_zz_eval[ipt + 1*npts] = x143*y; + basis_zz_eval[ipt + 2*npts] = z*(x124 + x143); + basis_zz_eval[ipt + 3*npts] = x144; + basis_zz_eval[ipt + 4*npts] = x25*(x128 + x145); + basis_zz_eval[ipt + 5*npts] = x146 + x81; + basis_zz_eval[ipt + 6*npts] = x*x147; + basis_zz_eval[ipt + 7*npts] = x52*(x148 + x87); + basis_zz_eval[ipt + 8*npts] = x63*(x149 + x75*x8); + basis_zz_eval[ipt + 9*npts] = x*x151; + basis_zz_eval[ipt + 10*npts] = x152; + basis_zz_eval[ipt + 11*npts] = z*(x147 + x85); + basis_zz_eval[ipt + 12*npts] = x137 + x153; + basis_zz_eval[ipt + 13*npts] = x151*y; + basis_zz_eval[ipt + 14*npts] = x154; + + @@ -296,36 +460,36 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = radial_eval*x*x*x*x; - ang_eval_1 = radial_eval*x*x*x*y; - ang_eval_2 = radial_eval*x*x*x*z; - ang_eval_3 = radial_eval*x*x*y*y; + ang_eval_0 = radial_eval*x0; + ang_eval_1 = x1*x2; + ang_eval_2 = x2*x3; + ang_eval_3 = radial_eval*x6; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 
3*npts] = ang_eval_3; - ang_eval_0 = radial_eval*x*x*y*z; - ang_eval_1 = radial_eval*x*x*z*z; - ang_eval_2 = radial_eval*x*y*y*y; - ang_eval_3 = radial_eval*x*y*y*z; + ang_eval_0 = x4*x7; + ang_eval_1 = radial_eval*x9; + ang_eval_2 = x10*x11; + ang_eval_3 = x12*x5; basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; basis_eval[ipt + 6*npts] = ang_eval_2; basis_eval[ipt + 7*npts] = ang_eval_3; - ang_eval_0 = radial_eval*x*y*z*z; - ang_eval_1 = radial_eval*x*z*z*z; - ang_eval_2 = radial_eval*y*y*y*y; - ang_eval_3 = radial_eval*y*y*y*z; + ang_eval_0 = x13*x8; + ang_eval_1 = x10*x14; + ang_eval_2 = radial_eval*x15; + ang_eval_3 = x11*x3; basis_eval[ipt + 8*npts] = ang_eval_0; basis_eval[ipt + 9*npts] = ang_eval_1; basis_eval[ipt + 10*npts] = ang_eval_2; basis_eval[ipt + 11*npts] = ang_eval_3; - ang_eval_0 = radial_eval*y*y*z*z; - ang_eval_1 = radial_eval*y*z*z*z; - ang_eval_2 = radial_eval*z*z*z*z; + ang_eval_0 = radial_eval*x16; + ang_eval_1 = x1*x14; + ang_eval_2 = radial_eval*x17; basis_eval[ipt + 12*npts] = ang_eval_0; basis_eval[ipt + 13*npts] = ang_eval_1; basis_eval[ipt + 14*npts] = ang_eval_2; @@ -336,18 +500,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; - dang_eval_x_0 = x*x*x*(4*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = radial_eval_alpha*x*x*x*x*y; - dang_eval_z_0 = radial_eval_alpha*x*x*x*x*z; - dang_eval_x_1 = x*x*y*(3*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = x*x*x*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_1 = radial_eval_alpha*x*x*x*y*z; - dang_eval_x_2 = x*x*z*(3*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_2 = radial_eval_alpha*x*x*x*y*z; - dang_eval_z_2 = x*x*x*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_3 = x*y*y*(2*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_3 = x*x*y*(2*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_3 = radial_eval_alpha*x*x*y*y*z; + dang_eval_x_0 = radial_eval_alpha*x18 + x19*x2; + dang_eval_y_0 = x0*x45; + dang_eval_z_0 = x0*x61; + dang_eval_x_1 = x21*y; + dang_eval_y_1 = x24 + x46; + dang_eval_z_1 = x47; + dang_eval_x_2 = x21*z; + dang_eval_y_2 = x47; + dang_eval_z_2 = x29 + x46; + dang_eval_x_3 = x22*x5 + x24; + dang_eval_y_3 = x32 + x4*x48; + dang_eval_z_3 = x6*x61; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; @@ -361,18 +525,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = x*y*z*(2*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = x*x*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = x*x*y*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_1 = x*z*z*(2*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = radial_eval_alpha*x*x*y*z*z; - dang_eval_z_1 = x*x*z*(2*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = y*y*y*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_2 = x*y*y*(3*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_2 = radial_eval_alpha*x*y*y*y*z; - dang_eval_x_3 = y*y*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_3 = x*y*z*(2*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_3 = x*y*y*(radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x25*x27; + dang_eval_y_0 = x50*z; + dang_eval_z_0 = y*(x37 + x49); + dang_eval_x_1 = x22*x8 + x29; 
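Reference note (not part of the patch): the explicit polynomials removed in these collocation hunks and the factored common-subexpression temporaries (x0, x1, ...) that replace them evaluate the same derivative identities for a Cartesian Gaussian basis function. Writing phi_{lmn} = x^l y^m z^n R(r) and assuming a contracted radial factor R = sum_i c_i exp(-alpha_i r^2) -- so that radial_eval = R, radial_eval_alpha = R_a = -2 sum_i c_i alpha_i exp(-alpha_i r^2), and radial_eval_alpha_squared = R_aa = 4 sum_i c_i alpha_i^2 exp(-alpha_i r^2), consistent with the radial_eval_alpha *= -2 and radial_eval_alpha_squared *= 4 scalings in these kernels -- the tabulated quantities are

  \partial_x \phi_{lmn} = \bigl( l\,x^{l-1} R + x^{l+1} R_\alpha \bigr)\, y^m z^n

  \partial_x^2 \phi_{lmn} = \bigl( l(l-1)\,x^{l-2} R + (2l+1)\,x^{l} R_\alpha + x^{l+2} R_{\alpha\alpha} \bigr)\, y^m z^n

  \nabla^2 \phi_{lmn} = \bigl[ l(l-1)\,x^{l-2} y^m z^n + m(m-1)\,x^l y^{m-2} z^n + n(n-1)\,x^l y^m z^{n-2} \bigr] R + \bigl( 2(l+m+n)+3 \bigr)\, x^l y^m z^n R_\alpha + r^2\, x^l y^m z^n R_{\alpha\alpha}

with the y and z derivatives obtained by permuting (l, x), (m, y), (n, z). For example, the removed line basis_xx_eval[ipt + 0*npts] = x*x*(12*radial_eval + 9*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x) is the (l,m,n) = (4,0,0) case of the second identity, and the new x71 = x0*x69 + x0*x70 + x4*x68 expands to the same 12 x^2 R + 9 x^4 R_a + x^6 R_aa. The symbols l, m, n, R_a, R_aa are notation introduced only for this note.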
+ dang_eval_y_1 = x45*x9; + dang_eval_z_1 = x4*x62 + x41; + dang_eval_x_2 = x30 + x32; + dang_eval_y_2 = x*x51; + dang_eval_z_2 = x43; + dang_eval_x_3 = x35*z; + dang_eval_y_3 = x52*x54; + dang_eval_z_3 = x*(x33 + x55); basis_x_eval[ipt + 4*npts] = dang_eval_x_0; basis_y_eval[ipt + 4*npts] = dang_eval_y_0; basis_z_eval[ipt + 4*npts] = dang_eval_z_0; @@ -386,18 +550,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 7*npts] = dang_eval_y_3; basis_z_eval[ipt + 7*npts] = dang_eval_z_3; - dang_eval_x_0 = y*z*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = x*z*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = x*y*z*(2*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_1 = z*z*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = radial_eval_alpha*x*y*z*z*z; - dang_eval_z_1 = x*z*z*(3*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = radial_eval_alpha*x*y*y*y*y; - dang_eval_y_2 = y*y*y*(4*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_2 = radial_eval_alpha*y*y*y*y*z; - dang_eval_x_3 = radial_eval_alpha*x*y*y*y*z; - dang_eval_y_3 = y*y*z*(3*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_3 = y*y*y*(radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x38*y; + dang_eval_y_0 = x*(x36 + x55); + dang_eval_z_0 = x63*x65; + dang_eval_x_1 = x39 + x41; + dang_eval_y_1 = x44; + dang_eval_z_1 = x*x66; + dang_eval_x_2 = x15*x42; + dang_eval_y_2 = radial_eval_alpha*x56 + x11*x19; + dang_eval_z_2 = x15*x61; + dang_eval_x_3 = x43; + dang_eval_y_3 = x51*z; + dang_eval_z_3 = x30 + x58; basis_x_eval[ipt + 8*npts] = dang_eval_x_0; basis_y_eval[ipt + 8*npts] = dang_eval_y_0; basis_z_eval[ipt + 8*npts] = dang_eval_z_0; @@ -411,15 +575,15 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 11*npts] = dang_eval_y_3; basis_z_eval[ipt + 11*npts] = dang_eval_z_3; - dang_eval_x_0 = radial_eval_alpha*x*y*y*z*z; - dang_eval_y_0 = y*z*z*(2*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = y*y*z*(2*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_1 = radial_eval_alpha*x*y*z*z*z; - dang_eval_y_1 = z*z*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_1 = y*z*z*(3*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = radial_eval_alpha*x*z*z*z*z; - dang_eval_y_2 = radial_eval_alpha*y*z*z*z*z; - dang_eval_z_2 = z*z*z*(4*radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x16*x42; + dang_eval_y_0 = x48*x8 + x58; + dang_eval_z_0 = x5*x62 + x60; + dang_eval_x_1 = x44; + dang_eval_y_1 = x39 + x60; + dang_eval_z_1 = x66*y; + dang_eval_x_2 = x17*x42; + dang_eval_y_2 = x17*x45; + dang_eval_z_2 = radial_eval_alpha*x67 + x14*x19; basis_x_eval[ipt + 12*npts] = dang_eval_x_0; basis_y_eval[ipt + 12*npts] = dang_eval_y_0; basis_z_eval[ipt + 12*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_lapgrad.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_lapgrad.hpp new file mode 100644 index 00000000..50bb788d --- /dev/null +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_lapgrad.hpp @@ -0,0 +1,789 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). 
+ * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#pragma once +#include "collocation_device_constants.hpp" +#include "device/xc_device_task.hpp" +#include "device_specific/cuda_device_constants.hpp" +#include "device/common/shell_to_task.hpp" +#include + +namespace GauXC { + + +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_cartesian_lapgrad_4( + uint32_t nshell, + ShellToTaskDevice* __restrict__ shell_to_task, + XCDeviceTask* __restrict__ device_tasks +) { + + + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; + double* my_alpha = alpha[threadIdx.x/32]; + double* my_coeff = coeff[threadIdx.x/32]; + + for( auto ish = blockIdx.z; ish < nshell; ish += gridDim.z ) { + const uint32_t ntasks = shell_to_task[ish].ntask; + const auto shell = shell_to_task[ish].shell_device; + const auto task_idx = shell_to_task[ish].task_idx_device; + const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device; + + + // Load Shell Data into registers / SM + const uint32_t nprim = shell->nprim(); + const double3 O = *reinterpret_cast<const double3*>(shell->O_data()); + + const int global_warp_id = (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size; + const int nwarp_global = max((blockDim.x*gridDim.x) / cuda::warp_size,1); + + // Read in coeffs/exps into SM on first warp + { + auto* coeff_gm = shell->coeff_data(); + auto* alpha_gm = shell->alpha_data(); + static_assert( detail::shell_nprim_max == cuda::warp_size ); + const int warp_rank = threadIdx.x % cuda::warp_size; + my_alpha[warp_rank] = alpha_gm[warp_rank]; + my_coeff[warp_rank] = coeff_gm[warp_rank]; + } + + // Loop over tasks assigned to shells + // Place each task on a different warp + schedule across blocks + for( int itask = global_warp_id; itask < ntasks; itask += nwarp_global ) { + + const auto* task = device_tasks + task_idx[itask]; + const auto* __restrict__ points_x = task->points_x; + const auto* __restrict__ points_y = task->points_y; + const auto* __restrict__ points_z = task->points_z; + const uint32_t npts = task->npts; + const size_t shoff = task_shell_offs[itask] * npts; + + auto* __restrict__ basis_eval = task->bf + shoff; + auto* __restrict__ basis_x_eval = task->dbfx + shoff; + auto* __restrict__ basis_y_eval = task->dbfy + shoff; + auto* __restrict__ basis_z_eval = task->dbfz + shoff; + auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; + auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; + auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; + auto* __restrict__ basis_yy_eval = task->d2bfyy + shoff; + auto* __restrict__ basis_yz_eval = task->d2bfyz + shoff; + auto* __restrict__ basis_zz_eval = task->d2bfzz + shoff; + auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; + auto* __restrict__ basis_lapl_x_eval = task->d3bflapl_x + shoff; + auto* __restrict__ basis_lapl_y_eval = task->d3bflapl_y + shoff; + auto* __restrict__ basis_lapl_z_eval = task->d3bflapl_z + shoff; + + // Loop over points in task + // Assign each point to separate thread within the warp + #pragma unroll 1 + for( int ipt = threadIdx.x % cuda::warp_size; ipt < npts; ipt += cuda::warp_size ) { + //const double3 point = points[ipt]; + double3 point; + point.x = points_x[ipt]; + point.y = points_y[ipt]; + point.z = points_z[ipt]; + + + const auto x = point.x - O.x; + const auto y = point.y - O.y; + const auto z = point.z - O.z; + const auto rsq = x*x + y*y + z*z; + + // Evaluate radial
part of bfn + double radial_eval = 0.; + double radial_eval_alpha = 0.; + double radial_eval_alpha_squared = 0.; + double radial_eval_alpha_cubed = 0.; + + #pragma unroll 1 + for( uint32_t i = 0; i < nprim; ++i ) { + const auto a = my_alpha[i]; + const auto e = my_coeff[i] * std::exp( - a * rsq ); + + radial_eval += e; + radial_eval_alpha += a * e; + radial_eval_alpha_squared += a * a * e; + radial_eval_alpha_cubed += a * a * a * e; + } + + radial_eval_alpha *= -2; + radial_eval_alpha_squared *= 4; + radial_eval_alpha_cubed *= -8; + + // Common Subexpressions + const auto x0 = x*x*x*x; + const auto x1 = radial_eval*y; + const auto x2 = x*x*x; + const auto x3 = radial_eval*z; + const auto x4 = x*x; + const auto x5 = x4; + const auto x6 = y*y; + const auto x7 = x6; + const auto x8 = x5*x7; + const auto x9 = x1*z; + const auto x10 = z*z; + const auto x11 = x10; + const auto x12 = x11*x5; + const auto x13 = radial_eval*x; + const auto x14 = y*y*y; + const auto x15 = x*x3; + const auto x16 = x*x1; + const auto x17 = z*z*z; + const auto x18 = y*y*y*y; + const auto x19 = x11*x7; + const auto x20 = z*z*z*z; + const auto x21 = x*x*x*x*x; + const auto x22 = 4.0*radial_eval; + const auto x23 = 3.0*radial_eval; + const auto x24 = radial_eval_alpha*x0 + x23*x5; + const auto x25 = 2.0*x13; + const auto x26 = x2*x7; + const auto x27 = radial_eval_alpha*x26; + const auto x28 = y*z; + const auto x29 = radial_eval_alpha*x2; + const auto x30 = x25 + x29; + const auto x31 = x11*x2; + const auto x32 = radial_eval_alpha*x31; + const auto x33 = radial_eval*x14; + const auto x34 = x14*x5; + const auto x35 = radial_eval_alpha*x34; + const auto x36 = radial_eval*x7; + const auto x37 = radial_eval_alpha*x8; + const auto x38 = x36 + x37; + const auto x39 = radial_eval*x11; + const auto x40 = radial_eval_alpha*x12; + const auto x41 = x39 + x40; + const auto x42 = radial_eval*x17; + const auto x43 = x17*x5; + const auto x44 = radial_eval_alpha*x43; + const auto x45 = radial_eval_alpha*x; + const auto x46 = x14*x45*z; + const auto x47 = x17*x45*y; + const auto x48 = radial_eval_alpha*y; + const auto x49 = radial_eval*x2; + const auto x50 = radial_eval_alpha*x2*x28; + const auto x51 = 2.0*x1; + const auto x52 = radial_eval*x5; + const auto x53 = x37 + x52; + const auto x54 = radial_eval_alpha*x18 + x23*x7; + const auto x55 = x*z; + const auto x56 = radial_eval_alpha*x14; + const auto x57 = x51 + x56; + const auto x58 = radial_eval_alpha*x19; + const auto x59 = y*y*y*y*y; + const auto x60 = x11*x14; + const auto x61 = radial_eval_alpha*x60; + const auto x62 = x17*x7; + const auto x63 = radial_eval_alpha*x62; + const auto x64 = radial_eval_alpha*z; + const auto x65 = 2.0*x3; + const auto x66 = x*y; + const auto x67 = radial_eval_alpha*x17; + const auto x68 = x65 + x67; + const auto x69 = radial_eval_alpha*x20 + x11*x23; + const auto x70 = z*z*z*z*z; + const auto x71 = 12.0*radial_eval; + const auto x72 = 8.0*radial_eval_alpha; + const auto x73 = radial_eval_alpha + radial_eval_alpha_squared*x5; + const auto x74 = x0*x72 + x0*x73 + x5*x71; + const auto x75 = 6.0*radial_eval_alpha; + const auto x76 = x2*x73; + const auto x77 = 6.0*x13 + x76; + const auto x78 = x2*x75 + x77; + const auto x79 = 4.0*radial_eval_alpha; + const auto x80 = x79*x8; + const auto x81 = 2.0*radial_eval; + const auto x82 = x7*x81; + const auto x83 = x5*x7*x73 + x82; + const auto x84 = x5*x73; + const auto x85 = x81 + x84; + const auto x86 = x12*x79; + const auto x87 = x11*x81; + const auto x88 = x11*x5*x73 + x87; + const auto x89 = 
2.0*radial_eval_alpha; + const auto x90 = x14*x89; + const auto x91 = x14*x73; + const auto x92 = x7*x89; + const auto x93 = x7*x73; + const auto x94 = x11*x89; + const auto x95 = x11*x73; + const auto x96 = x17*x89; + const auto x97 = x17*x73; + const auto x98 = x18*x73; + const auto x99 = x11*x7*x73; + const auto x100 = x20*x73; + const auto x101 = radial_eval_alpha_squared*x21 + x2*x79; + const auto x102 = 3.0*radial_eval_alpha; + const auto x103 = x102*x8; + const auto x104 = x28*(radial_eval_alpha_squared*x0 + x102*x5); + const auto x105 = 2.0*x45; + const auto x106 = 2.0*x48; + const auto x107 = x105*x7; + const auto x108 = radial_eval_alpha_squared*x26; + const auto x109 = x107 + x108; + const auto x110 = x105*x11; + const auto x111 = radial_eval_alpha_squared*x31; + const auto x112 = x110 + x111; + const auto x113 = x106*x5; + const auto x114 = radial_eval_alpha_squared*x34; + const auto x115 = x113 + x114; + const auto x116 = radial_eval_alpha_squared*x11*x5*x7; + const auto x117 = x116 + x58; + const auto x118 = radial_eval_alpha_squared*x43; + const auto x119 = radial_eval_alpha_squared*x59 + x14*x79; + const auto x120 = x55*(radial_eval_alpha_squared*x18 + x102*x7); + const auto x121 = x106*x11; + const auto x122 = radial_eval_alpha_squared*x60; + const auto x123 = x121 + x122; + const auto x124 = radial_eval_alpha_squared*x62; + const auto x125 = x102*x12; + const auto x126 = 2.0*x64; + const auto x127 = x126*x5; + const auto x128 = x118 + x127; + const auto x129 = x126*x7; + const auto x130 = x124 + x129; + const auto x131 = x66*(radial_eval_alpha_squared*x20 + x102*x11); + const auto x132 = radial_eval_alpha_squared*x70 + x17*x79; + const auto x133 = radial_eval_alpha + radial_eval_alpha_squared*x7; + const auto x134 = x0*x133; + const auto x135 = x2*x89; + const auto x136 = x133*x2; + const auto x137 = x5*x81; + const auto x138 = x133*x5*x7 + x137; + const auto x139 = x5*x89; + const auto x140 = x133*x5; + const auto x141 = x11*x133*x5; + const auto x142 = x133*x14; + const auto x143 = 6.0*x1 + x142; + const auto x144 = x14*x75 + x143; + const auto x145 = x133*x7; + const auto x146 = x145 + x81; + const auto x147 = x11*x133; + const auto x148 = x133*x17; + const auto x149 = x133*x18 + x18*x72 + x7*x71; + const auto x150 = x19*x79; + const auto x151 = x11*x133*x7 + x87; + const auto x152 = x133*x20; + const auto x153 = x102*x19; + const auto x154 = radial_eval_alpha + radial_eval_alpha_squared*x11; + const auto x155 = x0*x154; + const auto x156 = x154*x2; + const auto x157 = x154*x5*x7; + const auto x158 = x154*x5; + const auto x159 = x11*x154*x5 + x137; + const auto x160 = x14*x154; + const auto x161 = x154*x7; + const auto x162 = x11*x154; + const auto x163 = x162 + x81; + const auto x164 = x154*x17; + const auto x165 = x164 + 6.0*x3; + const auto x166 = x165 + x17*x75; + const auto x167 = x154*x18; + const auto x168 = x11*x154*x7 + x82; + const auto x169 = x11*x71 + x154*x20 + x20*x72; + const auto x170 = x136 + x156 + x2*x72 + x77; + const auto x171 = x158 + x85; + const auto x172 = x14*x72 + x143 + x160 + x91; + const auto x173 = x146 + x161; + const auto x174 = x147 + x163; + const auto x175 = x148 + x165 + x17*x72 + x97; + const auto x176 = 36.0*radial_eval_alpha; + const auto x177 = radial_eval_alpha_cubed*x7 + radial_eval_alpha_squared; + const auto x178 = x0*x177; + const auto x179 = radial_eval_alpha_cubed*x11 + radial_eval_alpha_squared; + const auto x180 = x0*x179; + const auto x181 = radial_eval_alpha_squared*x; + const auto x182 = radial_eval_alpha_cubed*x2 + 
3.0*x181; + const auto x183 = 6.0*radial_eval; + const auto x184 = 24.0*radial_eval_alpha; + const auto x185 = 2.0*radial_eval_alpha_squared; + const auto x186 = 3.0*x140; + const auto x187 = 3.0*x158; + const auto x188 = x177*x2; + const auto x189 = x179*x2; + const auto x190 = x*x188 + x*x189 + x0*x185 + x182*x2 + x183 + x184*x5 + x186 + x187 + 9.0*x84; + const auto x191 = 2.0*x; + const auto x192 = 4.0*radial_eval_alpha_squared; + const auto x193 = 6.0*x; + const auto x194 = 14.0*x45; + const auto x195 = x177*x5*x7; + const auto x196 = x179*x5*x7; + const auto x197 = 4.0*x13 + x135; + const auto x198 = x177*x5; + const auto x199 = x179*x5; + const auto x200 = x11*x177*x5; + const auto x201 = x11*x179*x5; + const auto x202 = x14*x182; + const auto x203 = x14*x177; + const auto x204 = x14*x179; + const auto x205 = 6.0*x48; + const auto x206 = 6.0*radial_eval_alpha_squared; + const auto x207 = 3.0*x93; + const auto x208 = x7*x75; + const auto x209 = x177*x7; + const auto x210 = x179*x7; + const auto x211 = x206*x8; + const auto x212 = 3.0*x95; + const auto x213 = x11*x75; + const auto x214 = x11*x177; + const auto x215 = x11*x179; + const auto x216 = x12*x206; + const auto x217 = x17*x182; + const auto x218 = x17*x177; + const auto x219 = x17*x179; + const auto x220 = 6.0*x64; + const auto x221 = 12.0*x45; + const auto x222 = 8.0*x181; + const auto x223 = x177*x18; + const auto x224 = x179*x18; + const auto x225 = 6.0*y; + const auto x226 = x225*x45; + const auto x227 = x11*x177*x7; + const auto x228 = x11*x179*x7; + const auto x229 = 6.0*z; + const auto x230 = x229*x45; + const auto x231 = x177*x20; + const auto x232 = x179*x20; + const auto x233 = 12.0*x48; + const auto x234 = radial_eval_alpha_squared*y; + const auto x235 = 8.0*x234; + const auto x236 = radial_eval_alpha_cubed*x5 + radial_eval_alpha_squared; + const auto x237 = x0*x236; + const auto x238 = radial_eval_alpha_cubed*x14 + 3.0*x234; + const auto x239 = x2*x238; + const auto x240 = x2*x236; + const auto x241 = 6.0*x45; + const auto x242 = 2.0*y; + const auto x243 = 14.0*x48; + const auto x244 = x236*x5*x7; + const auto x245 = 4.0*x1 + x90; + const auto x246 = x5*x75; + const auto x247 = x236*x5; + const auto x248 = x11*x236*x5; + const auto x249 = 3.0*x161; + const auto x250 = x14*x236; + const auto x251 = x14*x238 + 9.0*x145 + x18*x185 + x183 + x184*x7 + x204*y + x207 + x249 + x250*y; + const auto x252 = x236*x7; + const auto x253 = 3.0*x147; + const auto x254 = x11*x236; + const auto x255 = x19*x206; + const auto x256 = x28*x75; + const auto x257 = x17*x236; + const auto x258 = x17*x238; + const auto x259 = x18*x236; + const auto x260 = x11*x236*x7; + const auto x261 = x20*x236; + const auto x262 = 12.0*x64; + const auto x263 = radial_eval_alpha_squared*z; + const auto x264 = 8.0*x263; + const auto x265 = radial_eval_alpha_cubed*x17 + 3.0*x263; + const auto x266 = x2*x265; + const auto x267 = 2.0*z; + const auto x268 = 14.0*x64; + const auto x269 = 4.0*x3 + x96; + const auto x270 = x14*x265; + const auto x271 = x11*x184 + 9.0*x162 + x17*x265 + x183 + x185*x20 + x212 + x218*z + x253 + x257*z; + + + // Evaluate basis function + basis_eval[ipt + 0*npts] = radial_eval*x0; + basis_eval[ipt + 1*npts] = x1*x2; + basis_eval[ipt + 2*npts] = x2*x3; + basis_eval[ipt + 3*npts] = radial_eval*x8; + basis_eval[ipt + 4*npts] = x5*x9; + basis_eval[ipt + 5*npts] = radial_eval*x12; + basis_eval[ipt + 6*npts] = x13*x14; + basis_eval[ipt + 7*npts] = x15*x7; + basis_eval[ipt + 8*npts] = x11*x16; + basis_eval[ipt + 9*npts] = x13*x17; + 
basis_eval[ipt + 10*npts] = radial_eval*x18; + basis_eval[ipt + 11*npts] = x14*x3; + basis_eval[ipt + 12*npts] = radial_eval*x19; + basis_eval[ipt + 13*npts] = x1*x17; + basis_eval[ipt + 14*npts] = radial_eval*x20; + + + + // Evaluate first derivative of bfn wrt x + basis_x_eval[ipt + 0*npts] = radial_eval_alpha*x21 + x2*x22; + basis_x_eval[ipt + 1*npts] = x24*y; + basis_x_eval[ipt + 2*npts] = x24*z; + basis_x_eval[ipt + 3*npts] = x25*x7 + x27; + basis_x_eval[ipt + 4*npts] = x28*x30; + basis_x_eval[ipt + 5*npts] = x11*x25 + x32; + basis_x_eval[ipt + 6*npts] = x33 + x35; + basis_x_eval[ipt + 7*npts] = x38*z; + basis_x_eval[ipt + 8*npts] = x41*y; + basis_x_eval[ipt + 9*npts] = x42 + x44; + basis_x_eval[ipt + 10*npts] = x18*x45; + basis_x_eval[ipt + 11*npts] = x46; + basis_x_eval[ipt + 12*npts] = x19*x45; + basis_x_eval[ipt + 13*npts] = x47; + basis_x_eval[ipt + 14*npts] = x20*x45; + + // Evaluate first derivative of bfn wrt y + basis_y_eval[ipt + 0*npts] = x0*x48; + basis_y_eval[ipt + 1*npts] = x27 + x49; + basis_y_eval[ipt + 2*npts] = x50; + basis_y_eval[ipt + 3*npts] = x35 + x5*x51; + basis_y_eval[ipt + 4*npts] = x53*z; + basis_y_eval[ipt + 5*npts] = x12*x48; + basis_y_eval[ipt + 6*npts] = x*x54; + basis_y_eval[ipt + 7*npts] = x55*x57; + basis_y_eval[ipt + 8*npts] = x*(x39 + x58); + basis_y_eval[ipt + 9*npts] = x47; + basis_y_eval[ipt + 10*npts] = radial_eval_alpha*x59 + x14*x22; + basis_y_eval[ipt + 11*npts] = x54*z; + basis_y_eval[ipt + 12*npts] = x11*x51 + x61; + basis_y_eval[ipt + 13*npts] = x42 + x63; + basis_y_eval[ipt + 14*npts] = x20*x48; + + // Evaluate first derivative of bfn wrt z + basis_z_eval[ipt + 0*npts] = x0*x64; + basis_z_eval[ipt + 1*npts] = x50; + basis_z_eval[ipt + 2*npts] = x32 + x49; + basis_z_eval[ipt + 3*npts] = x64*x8; + basis_z_eval[ipt + 4*npts] = y*(x40 + x52); + basis_z_eval[ipt + 5*npts] = x44 + x5*x65; + basis_z_eval[ipt + 6*npts] = x46; + basis_z_eval[ipt + 7*npts] = x*(x36 + x58); + basis_z_eval[ipt + 8*npts] = x66*x68; + basis_z_eval[ipt + 9*npts] = x*x69; + basis_z_eval[ipt + 10*npts] = x18*x64; + basis_z_eval[ipt + 11*npts] = x33 + x61; + basis_z_eval[ipt + 12*npts] = x63 + x65*x7; + basis_z_eval[ipt + 13*npts] = x69*y; + basis_z_eval[ipt + 14*npts] = radial_eval_alpha*x70 + x17*x22; + + // Evaluate second derivative of bfn wrt xx + basis_xx_eval[ipt + 0*npts] = x74; + basis_xx_eval[ipt + 1*npts] = x78*y; + basis_xx_eval[ipt + 2*npts] = x78*z; + basis_xx_eval[ipt + 3*npts] = x80 + x83; + basis_xx_eval[ipt + 4*npts] = x28*(x5*x79 + x85); + basis_xx_eval[ipt + 5*npts] = x86 + x88; + basis_xx_eval[ipt + 6*npts] = x*(x90 + x91); + basis_xx_eval[ipt + 7*npts] = x55*(x92 + x93); + basis_xx_eval[ipt + 8*npts] = x66*(x94 + x95); + basis_xx_eval[ipt + 9*npts] = x*(x96 + x97); + basis_xx_eval[ipt + 10*npts] = x98; + basis_xx_eval[ipt + 11*npts] = x91*z; + basis_xx_eval[ipt + 12*npts] = x99; + basis_xx_eval[ipt + 13*npts] = x97*y; + basis_xx_eval[ipt + 14*npts] = x100; + + // Evaluate second derivative of bfn wrt xy + basis_xy_eval[ipt + 0*npts] = x101*y; + basis_xy_eval[ipt + 1*npts] = radial_eval_alpha_squared*x0*x7 + x103 + x24; + basis_xy_eval[ipt + 2*npts] = x104; + basis_xy_eval[ipt + 3*npts] = radial_eval_alpha_squared*x14*x2 + x105*x14 + x106*x2 + 4.0*x16; + basis_xy_eval[ipt + 4*npts] = z*(x109 + x30); + basis_xy_eval[ipt + 5*npts] = x112*y; + basis_xy_eval[ipt + 6*npts] = radial_eval_alpha_squared*x18*x5 + x103 + x54; + basis_xy_eval[ipt + 7*npts] = z*(x115 + x57); + basis_xy_eval[ipt + 8*npts] = x117 + x41; + basis_xy_eval[ipt + 9*npts] = y*(x118 + 
x67); + basis_xy_eval[ipt + 10*npts] = x*x119; + basis_xy_eval[ipt + 11*npts] = x120; + basis_xy_eval[ipt + 12*npts] = x*x123; + basis_xy_eval[ipt + 13*npts] = x*(x124 + x67); + basis_xy_eval[ipt + 14*npts] = radial_eval_alpha_squared*x20*x66; + + // Evaluate second derivative of bfn wrt xz + basis_xz_eval[ipt + 0*npts] = x101*z; + basis_xz_eval[ipt + 1*npts] = x104; + basis_xz_eval[ipt + 2*npts] = radial_eval_alpha_squared*x0*x11 + x125 + x24; + basis_xz_eval[ipt + 3*npts] = x109*z; + basis_xz_eval[ipt + 4*npts] = y*(x112 + x30); + basis_xz_eval[ipt + 5*npts] = radial_eval_alpha_squared*x17*x2 + x105*x17 + x126*x2 + 4.0*x15; + basis_xz_eval[ipt + 6*npts] = z*(x114 + x56); + basis_xz_eval[ipt + 7*npts] = x117 + x38; + basis_xz_eval[ipt + 8*npts] = y*(x128 + x68); + basis_xz_eval[ipt + 9*npts] = radial_eval_alpha_squared*x20*x5 + x125 + x69; + basis_xz_eval[ipt + 10*npts] = radial_eval_alpha_squared*x18*x55; + basis_xz_eval[ipt + 11*npts] = x*(x122 + x56); + basis_xz_eval[ipt + 12*npts] = x*x130; + basis_xz_eval[ipt + 13*npts] = x131; + basis_xz_eval[ipt + 14*npts] = x*x132; + + // Evaluate second derivative of bfn wrt yy + basis_yy_eval[ipt + 0*npts] = x134; + basis_yy_eval[ipt + 1*npts] = y*(x135 + x136); + basis_yy_eval[ipt + 2*npts] = x136*z; + basis_yy_eval[ipt + 3*npts] = x138 + x80; + basis_yy_eval[ipt + 4*npts] = x28*(x139 + x140); + basis_yy_eval[ipt + 5*npts] = x141; + basis_yy_eval[ipt + 6*npts] = x*x144; + basis_yy_eval[ipt + 7*npts] = x55*(x146 + x7*x79); + basis_yy_eval[ipt + 8*npts] = x66*(x147 + x94); + basis_yy_eval[ipt + 9*npts] = x*x148; + basis_yy_eval[ipt + 10*npts] = x149; + basis_yy_eval[ipt + 11*npts] = x144*z; + basis_yy_eval[ipt + 12*npts] = x150 + x151; + basis_yy_eval[ipt + 13*npts] = y*(x148 + x96); + basis_yy_eval[ipt + 14*npts] = x152; + + // Evaluate second derivative of bfn wrt yz + basis_yz_eval[ipt + 0*npts] = radial_eval_alpha_squared*x0*x28; + basis_yz_eval[ipt + 1*npts] = z*(x108 + x29); + basis_yz_eval[ipt + 2*npts] = y*(x111 + x29); + basis_yz_eval[ipt + 3*npts] = x115*z; + basis_yz_eval[ipt + 4*npts] = x116 + x40 + x53; + basis_yz_eval[ipt + 5*npts] = x128*y; + basis_yz_eval[ipt + 6*npts] = x120; + basis_yz_eval[ipt + 7*npts] = x*(x123 + x57); + basis_yz_eval[ipt + 8*npts] = x*(x130 + x68); + basis_yz_eval[ipt + 9*npts] = x131; + basis_yz_eval[ipt + 10*npts] = x119*z; + basis_yz_eval[ipt + 11*npts] = radial_eval_alpha_squared*x11*x18 + x153 + x54; + basis_yz_eval[ipt + 12*npts] = radial_eval_alpha_squared*x14*x17 + x106*x17 + x126*x14 + 4.0*x9; + basis_yz_eval[ipt + 13*npts] = radial_eval_alpha_squared*x20*x7 + x153 + x69; + basis_yz_eval[ipt + 14*npts] = x132*y; + + // Evaluate second derivative of bfn wrt zz + basis_zz_eval[ipt + 0*npts] = x155; + basis_zz_eval[ipt + 1*npts] = x156*y; + basis_zz_eval[ipt + 2*npts] = z*(x135 + x156); + basis_zz_eval[ipt + 3*npts] = x157; + basis_zz_eval[ipt + 4*npts] = x28*(x139 + x158); + basis_zz_eval[ipt + 5*npts] = x159 + x86; + basis_zz_eval[ipt + 6*npts] = x*x160; + basis_zz_eval[ipt + 7*npts] = x55*(x161 + x92); + basis_zz_eval[ipt + 8*npts] = x66*(x11*x79 + x163); + basis_zz_eval[ipt + 9*npts] = x*x166; + basis_zz_eval[ipt + 10*npts] = x167; + basis_zz_eval[ipt + 11*npts] = z*(x160 + x90); + basis_zz_eval[ipt + 12*npts] = x150 + x168; + basis_zz_eval[ipt + 13*npts] = x166*y; + basis_zz_eval[ipt + 14*npts] = x169; + + // Evaluate Laplacian of bfn + basis_lapl_eval[ipt + 0*npts] = x134 + x155 + x74; + basis_lapl_eval[ipt + 1*npts] = x170*y; + basis_lapl_eval[ipt + 2*npts] = x170*z; + basis_lapl_eval[ipt + 
3*npts] = x138 + x157 + x72*x8 + x83; + basis_lapl_eval[ipt + 4*npts] = x28*(x140 + x171 + x5*x72); + basis_lapl_eval[ipt + 5*npts] = x12*x72 + x141 + x159 + x88; + basis_lapl_eval[ipt + 6*npts] = x*x172; + basis_lapl_eval[ipt + 7*npts] = x55*(x173 + x7*x72 + x93); + basis_lapl_eval[ipt + 8*npts] = x66*(x11*x72 + x174 + x95); + basis_lapl_eval[ipt + 9*npts] = x*x175; + basis_lapl_eval[ipt + 10*npts] = x149 + x167 + x98; + basis_lapl_eval[ipt + 11*npts] = x172*z; + basis_lapl_eval[ipt + 12*npts] = x151 + x168 + x19*x72 + x99; + basis_lapl_eval[ipt + 13*npts] = x175*y; + basis_lapl_eval[ipt + 14*npts] = x100 + x152 + x169; + + // Evaluate Laplacian gradient of bfn (dx) + basis_lapl_x_eval[ipt + 0*npts] = x*x178 + x*x180 + x0*x182 + 24.0*x13 + 4.0*x136 + 4.0*x156 + x176*x2 + 12.0*x76; + basis_lapl_x_eval[ipt + 1*npts] = x190*y; + basis_lapl_x_eval[ipt + 2*npts] = x190*z; + basis_lapl_x_eval[ipt + 3*npts] = x*x195 + x*x196 + x145*x191 + x161*x191 + x182*x5*x7 + x192*x26 + x193*x93 + x194*x7 + x197; + basis_lapl_x_eval[ipt + 4*npts] = x28*(x*x198 + x*x199 + x133*x191 + x154*x191 + x182*x5 + x192*x2 + x193*x73 + x194); + basis_lapl_x_eval[ipt + 5*npts] = x*x200 + x*x201 + x11*x182*x5 + x11*x194 + x147*x191 + x162*x191 + x192*x31 + x193*x95 + x197; + basis_lapl_x_eval[ipt + 6*npts] = x*x202 + x144 + x160 + x203*x4 + x204*x4 + x205*x5 + x206*x34 + 3.0*x91; + basis_lapl_x_eval[ipt + 7*npts] = z*(x*x182*x7 + x139 + x173 + x207 + x208 + x209*x4 + x210*x4 + x211); + basis_lapl_x_eval[ipt + 8*npts] = y*(x*x11*x182 + x139 + x174 + x212 + x213 + x214*x4 + x215*x4 + x216); + basis_lapl_x_eval[ipt + 9*npts] = x*x217 + x148 + x166 + x206*x43 + x218*x4 + x219*x4 + x220*x5 + 3.0*x97; + basis_lapl_x_eval[ipt + 10*npts] = x*x223 + x*x224 + x18*x182 + x18*x222 + x221*x7; + basis_lapl_x_eval[ipt + 11*npts] = z*(x*x203 + x*x204 + x14*x222 + x202 + x226); + basis_lapl_x_eval[ipt + 12*npts] = x*x227 + x*x228 + x107 + x11*x182*x7 + x110 + x19*x222; + basis_lapl_x_eval[ipt + 13*npts] = y*(x*x218 + x*x219 + x17*x222 + x217 + x230); + basis_lapl_x_eval[ipt + 14*npts] = x*x231 + x*x232 + x11*x221 + x182*x20 + x20*x222; + // Evaluate Laplacian gradient of bfn (dy) + basis_lapl_y_eval[ipt + 0*npts] = x0*x235 + x0*x238 + x180*y + x233*x5 + x237*y; + basis_lapl_y_eval[ipt + 1*npts] = 3.0*x136 + x156 + x189*x6 + x206*x26 + x239*y + x240*x6 + x241*x7 + x78; + basis_lapl_y_eval[ipt + 2*npts] = z*(x189*y + x2*x235 + x226 + x239 + x240*y); + basis_lapl_y_eval[ipt + 3*npts] = x140*x225 + x158*x242 + x192*x34 + x196*y + x238*x5*x7 + x242*x84 + x243*x5 + x244*y + x245; + basis_lapl_y_eval[ipt + 4*npts] = z*(x171 + x186 + x199*x6 + x211 + x238*x5*y + x246 + x247*x6 + x92); + basis_lapl_y_eval[ipt + 5*npts] = x11*x238*x5 + x113 + x12*x235 + x121 + x201*y + x248*y; + basis_lapl_y_eval[ipt + 6*npts] = x*x251; + basis_lapl_y_eval[ipt + 7*npts] = x55*(x133*x225 + x14*x192 + x154*x242 + x210*y + x238*x7 + x242*x73 + x243 + x252*y); + basis_lapl_y_eval[ipt + 8*npts] = x*(x11*x238*y + x163 + x213 + x215*x6 + x253 + x254*x6 + x255 + x92 + x95); + basis_lapl_y_eval[ipt + 9*npts] = x*(x17*x235 + x219*y + x256 + x257*y + x258); + basis_lapl_y_eval[ipt + 10*npts] = 24.0*x1 + x14*x176 + 12.0*x142 + 4.0*x160 + x18*x238 + x224*y + x259*y + 4.0*x91; + basis_lapl_y_eval[ipt + 11*npts] = x251*z; + basis_lapl_y_eval[ipt + 12*npts] = x11*x238*x7 + x11*x243 + x147*x225 + x162*x242 + x192*x60 + x228*y + x242*x95 + x245 + x260*y; + basis_lapl_y_eval[ipt + 13*npts] = 3.0*x148 + x166 + x206*x62 + x219*x6 + x220*x7 + x257*x6 + x258*y + x97; + 
basis_lapl_y_eval[ipt + 14*npts] = x11*x233 + x20*x235 + x20*x238 + x232*y + x261*y; + // Evaluate Laplacian gradient of bfn (dz) + basis_lapl_z_eval[ipt + 0*npts] = x0*x264 + x0*x265 + x178*z + x237*z + x262*x5; + basis_lapl_z_eval[ipt + 1*npts] = y*(x188*z + x2*x264 + x230 + x240*z + x266); + basis_lapl_z_eval[ipt + 2*npts] = x10*x188 + x10*x240 + x11*x241 + x136 + 3.0*x156 + x206*x31 + x266*z + x78; + basis_lapl_z_eval[ipt + 3*npts] = x127 + x129 + x195*z + x244*z + x264*x8 + x265*x5*x7; + basis_lapl_z_eval[ipt + 4*npts] = y*(x10*x198 + x10*x247 + x140 + x187 + x216 + x246 + x265*x5*z + x85 + x94); + basis_lapl_z_eval[ipt + 5*npts] = x11*x265*x5 + x140*x267 + x158*x229 + x192*x43 + x200*z + x248*z + x267*x84 + x268*x5 + x269; + basis_lapl_z_eval[ipt + 6*npts] = x*(x14*x264 + x203*z + x250*z + x256 + x270); + basis_lapl_z_eval[ipt + 7*npts] = x*(x10*x209 + x10*x252 + x146 + x208 + x249 + x255 + x265*x7*z + x93 + x94); + basis_lapl_z_eval[ipt + 8*npts] = x66*(x11*x265 + x133*x267 + x154*x229 + x17*x192 + x214*z + x254*z + x267*x73 + x268); + basis_lapl_z_eval[ipt + 9*npts] = x*x271; + basis_lapl_z_eval[ipt + 10*npts] = x18*x264 + x18*x265 + x223*z + x259*z + x262*x7; + basis_lapl_z_eval[ipt + 11*npts] = x10*x203 + x10*x250 + x11*x205 + x144 + 3.0*x160 + x206*x60 + x270*z + x91; + basis_lapl_z_eval[ipt + 12*npts] = x11*x265*x7 + x145*x267 + x161*x229 + x192*x62 + x227*z + x260*z + x267*x93 + x268*x7 + x269; + basis_lapl_z_eval[ipt + 13*npts] = x271*y; + basis_lapl_z_eval[ipt + 14*npts] = 4.0*x148 + 12.0*x164 + x17*x176 + x20*x265 + x231*z + x261*z + 24.0*x3 + 4.0*x97; + + + + +#if 0 + // Evaluate the angular part of bfn + + + + double ang_eval_0; + double ang_eval_1; + double ang_eval_2; + double ang_eval_3; + + + ang_eval_0 = radial_eval*x0; + ang_eval_1 = x1*x2; + ang_eval_2 = x2*x3; + ang_eval_3 = radial_eval*x8; + basis_eval[ipt + 0*npts] = ang_eval_0; + basis_eval[ipt + 1*npts] = ang_eval_1; + basis_eval[ipt + 2*npts] = ang_eval_2; + basis_eval[ipt + 3*npts] = ang_eval_3; + + ang_eval_0 = x5*x9; + ang_eval_1 = radial_eval*x12; + ang_eval_2 = x13*x14; + ang_eval_3 = x15*x7; + basis_eval[ipt + 4*npts] = ang_eval_0; + basis_eval[ipt + 5*npts] = ang_eval_1; + basis_eval[ipt + 6*npts] = ang_eval_2; + basis_eval[ipt + 7*npts] = ang_eval_3; + + ang_eval_0 = x11*x16; + ang_eval_1 = x13*x17; + ang_eval_2 = radial_eval*x18; + ang_eval_3 = x14*x3; + basis_eval[ipt + 8*npts] = ang_eval_0; + basis_eval[ipt + 9*npts] = ang_eval_1; + basis_eval[ipt + 10*npts] = ang_eval_2; + basis_eval[ipt + 11*npts] = ang_eval_3; + + ang_eval_0 = radial_eval*x19; + ang_eval_1 = x1*x17; + ang_eval_2 = radial_eval*x20; + basis_eval[ipt + 12*npts] = ang_eval_0; + basis_eval[ipt + 13*npts] = ang_eval_1; + basis_eval[ipt + 14*npts] = ang_eval_2; + + + double dang_eval_x_0, dang_eval_y_0, dang_eval_z_0; + double dang_eval_x_1, dang_eval_y_1, dang_eval_z_1; + double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; + double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; + + dang_eval_x_0 = radial_eval_alpha*x21 + x2*x22; + dang_eval_y_0 = x0*x48; + dang_eval_z_0 = x0*x64; + dang_eval_x_1 = x24*y; + dang_eval_y_1 = x27 + x49; + dang_eval_z_1 = x50; + dang_eval_x_2 = x24*z; + dang_eval_y_2 = x50; + dang_eval_z_2 = x32 + x49; + dang_eval_x_3 = x25*x7 + x27; + dang_eval_y_3 = x35 + x5*x51; + dang_eval_z_3 = x64*x8; + basis_x_eval[ipt + 0*npts] = dang_eval_x_0; + basis_y_eval[ipt + 0*npts] = dang_eval_y_0; + basis_z_eval[ipt + 0*npts] = dang_eval_z_0; + basis_x_eval[ipt + 1*npts] = dang_eval_x_1; + basis_y_eval[ipt + 1*npts] = 
dang_eval_y_1; + basis_z_eval[ipt + 1*npts] = dang_eval_z_1; + basis_x_eval[ipt + 2*npts] = dang_eval_x_2; + basis_y_eval[ipt + 2*npts] = dang_eval_y_2; + basis_z_eval[ipt + 2*npts] = dang_eval_z_2; + basis_x_eval[ipt + 3*npts] = dang_eval_x_3; + basis_y_eval[ipt + 3*npts] = dang_eval_y_3; + basis_z_eval[ipt + 3*npts] = dang_eval_z_3; + + dang_eval_x_0 = x28*x30; + dang_eval_y_0 = x53*z; + dang_eval_z_0 = y*(x40 + x52); + dang_eval_x_1 = x11*x25 + x32; + dang_eval_y_1 = x12*x48; + dang_eval_z_1 = x44 + x5*x65; + dang_eval_x_2 = x33 + x35; + dang_eval_y_2 = x*x54; + dang_eval_z_2 = x46; + dang_eval_x_3 = x38*z; + dang_eval_y_3 = x55*x57; + dang_eval_z_3 = x*(x36 + x58); + basis_x_eval[ipt + 4*npts] = dang_eval_x_0; + basis_y_eval[ipt + 4*npts] = dang_eval_y_0; + basis_z_eval[ipt + 4*npts] = dang_eval_z_0; + basis_x_eval[ipt + 5*npts] = dang_eval_x_1; + basis_y_eval[ipt + 5*npts] = dang_eval_y_1; + basis_z_eval[ipt + 5*npts] = dang_eval_z_1; + basis_x_eval[ipt + 6*npts] = dang_eval_x_2; + basis_y_eval[ipt + 6*npts] = dang_eval_y_2; + basis_z_eval[ipt + 6*npts] = dang_eval_z_2; + basis_x_eval[ipt + 7*npts] = dang_eval_x_3; + basis_y_eval[ipt + 7*npts] = dang_eval_y_3; + basis_z_eval[ipt + 7*npts] = dang_eval_z_3; + + dang_eval_x_0 = x41*y; + dang_eval_y_0 = x*(x39 + x58); + dang_eval_z_0 = x66*x68; + dang_eval_x_1 = x42 + x44; + dang_eval_y_1 = x47; + dang_eval_z_1 = x*x69; + dang_eval_x_2 = x18*x45; + dang_eval_y_2 = radial_eval_alpha*x59 + x14*x22; + dang_eval_z_2 = x18*x64; + dang_eval_x_3 = x46; + dang_eval_y_3 = x54*z; + dang_eval_z_3 = x33 + x61; + basis_x_eval[ipt + 8*npts] = dang_eval_x_0; + basis_y_eval[ipt + 8*npts] = dang_eval_y_0; + basis_z_eval[ipt + 8*npts] = dang_eval_z_0; + basis_x_eval[ipt + 9*npts] = dang_eval_x_1; + basis_y_eval[ipt + 9*npts] = dang_eval_y_1; + basis_z_eval[ipt + 9*npts] = dang_eval_z_1; + basis_x_eval[ipt + 10*npts] = dang_eval_x_2; + basis_y_eval[ipt + 10*npts] = dang_eval_y_2; + basis_z_eval[ipt + 10*npts] = dang_eval_z_2; + basis_x_eval[ipt + 11*npts] = dang_eval_x_3; + basis_y_eval[ipt + 11*npts] = dang_eval_y_3; + basis_z_eval[ipt + 11*npts] = dang_eval_z_3; + + dang_eval_x_0 = x19*x45; + dang_eval_y_0 = x11*x51 + x61; + dang_eval_z_0 = x63 + x65*x7; + dang_eval_x_1 = x47; + dang_eval_y_1 = x42 + x63; + dang_eval_z_1 = x69*y; + dang_eval_x_2 = x20*x45; + dang_eval_y_2 = x20*x48; + dang_eval_z_2 = radial_eval_alpha*x70 + x17*x22; + basis_x_eval[ipt + 12*npts] = dang_eval_x_0; + basis_y_eval[ipt + 12*npts] = dang_eval_y_0; + basis_z_eval[ipt + 12*npts] = dang_eval_z_0; + basis_x_eval[ipt + 13*npts] = dang_eval_x_1; + basis_y_eval[ipt + 13*npts] = dang_eval_y_1; + basis_z_eval[ipt + 13*npts] = dang_eval_z_1; + basis_x_eval[ipt + 14*npts] = dang_eval_x_2; + basis_y_eval[ipt + 14*npts] = dang_eval_y_2; + basis_z_eval[ipt + 14*npts] = dang_eval_z_2; + +#endif + } // Loop over points within task + } // Loop over tasks + + } // Loop over shells +} // end kernel + +} // namespace GauXC diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_laplacian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_laplacian.hpp index cbd77a2e..52f08f34 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_laplacian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_cartesian_l4_laplacian.hpp @@ -1,7 
+1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_cartesian_laplacian_4( +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_cartesian_laplacian_4( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; // Loop over points in task @@ -103,94 +106,255 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = x*x*x*x; + const auto x1 = radial_eval*y; + const auto x2 = x*x*x; + const auto x3 = radial_eval*z; + const auto x4 = x*x; + const auto x5 = y*y; + const auto x6 = x4*x5; + const auto x7 = x1*z; + const auto x8 = z*z; + const auto x9 = x4*x8; + const auto x10 = radial_eval*x; + const auto x11 = y*y*y; + const auto x12 = x*x3; + const auto x13 = x*x1; + const auto x14 = z*z*z; + const auto x15 = y*y*y*y; + const auto x16 = x5*x8; + const auto x17 = z*z*z*z; + const auto x18 = x*x*x*x*x; + const auto x19 = 4.0*radial_eval; + const auto x20 = 3.0*radial_eval; + const auto x21 = radial_eval_alpha*x0 + x20*x4; + const auto x22 = 2.0*x10; + const auto x23 = x2*x5; + const auto x24 = radial_eval_alpha*x23; + const auto x25 = y*z; + const auto x26 = radial_eval_alpha*x2; + const auto x27 = x22 + x26; + const auto x28 = x2*x8; + const auto x29 = radial_eval_alpha*x28; + const auto x30 = radial_eval*x11; + const auto x31 = x11*x4; + const auto x32 = radial_eval_alpha*x31; + const auto x33 = radial_eval*x5; + const auto x34 = radial_eval_alpha*x6; + const auto x35 = x33 + x34; + const auto x36 = radial_eval*x8; + const auto x37 = radial_eval_alpha*x9; + const auto x38 = x36 + x37; + const auto x39 = radial_eval*x14; + const auto x40 = x14*x4; + const auto x41 = radial_eval_alpha*x40; + const auto x42 = radial_eval_alpha*x; + const auto x43 = x11*x42*z; + const auto x44 = x14*x42*y; + const auto x45 = radial_eval_alpha*y; + const auto x46 = radial_eval*x2; + const auto x47 = radial_eval_alpha*x2*x25; + const auto x48 = 2.0*x1; + const auto x49 = radial_eval*x4; + const auto x50 = x34 + x49; + const auto x51 = radial_eval_alpha*x15 + x20*x5; + const auto x52 = x*z; + const auto x53 = radial_eval_alpha*x11; + const auto x54 = x48 + x53; + const auto x55 = radial_eval_alpha*x16; + const auto x56 = y*y*y*y*y; + const auto x57 = x11*x8; + const auto x58 = radial_eval_alpha*x57; + const auto 
x59 = x14*x5; + const auto x60 = radial_eval_alpha*x59; + const auto x61 = radial_eval_alpha*z; + const auto x62 = 2.0*x3; + const auto x63 = x*y; + const auto x64 = radial_eval_alpha*x14; + const auto x65 = x62 + x64; + const auto x66 = radial_eval_alpha*x17 + x20*x8; + const auto x67 = z*z*z*z*z; + const auto x68 = 12.0*radial_eval; + const auto x69 = 8.0*radial_eval_alpha; + const auto x70 = radial_eval_alpha + radial_eval_alpha_squared*x4; + const auto x71 = x0*x69 + x0*x70 + x4*x68; + const auto x72 = 6.0*radial_eval_alpha; + const auto x73 = 6.0*x10 + x2*x70; + const auto x74 = x2*x72 + x73; + const auto x75 = 4.0*radial_eval_alpha; + const auto x76 = x6*x75; + const auto x77 = 2.0*radial_eval; + const auto x78 = x5*x77; + const auto x79 = x4*x5*x70 + x78; + const auto x80 = x4*x70 + x77; + const auto x81 = x75*x9; + const auto x82 = x77*x8; + const auto x83 = x4*x70*x8 + x82; + const auto x84 = 2.0*radial_eval_alpha; + const auto x85 = x11*x84; + const auto x86 = x11*x70; + const auto x87 = x5*x84; + const auto x88 = x5*x70; + const auto x89 = x8*x84; + const auto x90 = x70*x8; + const auto x91 = x14*x84; + const auto x92 = x14*x70; + const auto x93 = x15*x70; + const auto x94 = x5*x70*x8; + const auto x95 = x17*x70; + const auto x96 = radial_eval_alpha_squared*x18 + x2*x75; + const auto x97 = 3.0*radial_eval_alpha; + const auto x98 = x6*x97; + const auto x99 = x25*(radial_eval_alpha_squared*x0 + x4*x97); + const auto x100 = 2.0*x42; + const auto x101 = 2.0*x45; + const auto x102 = radial_eval_alpha_squared*x23; + const auto x103 = x100*x5 + x102; + const auto x104 = radial_eval_alpha_squared*x28; + const auto x105 = x100*x8 + x104; + const auto x106 = radial_eval_alpha_squared*x31; + const auto x107 = x101*x4 + x106; + const auto x108 = radial_eval_alpha_squared*x4*x5*x8; + const auto x109 = x108 + x55; + const auto x110 = radial_eval_alpha_squared*x40; + const auto x111 = radial_eval_alpha_squared*x56 + x11*x75; + const auto x112 = x52*(radial_eval_alpha_squared*x15 + x5*x97); + const auto x113 = radial_eval_alpha_squared*x57; + const auto x114 = x101*x8 + x113; + const auto x115 = radial_eval_alpha_squared*x59; + const auto x116 = x9*x97; + const auto x117 = 2.0*x61; + const auto x118 = x110 + x117*x4; + const auto x119 = x115 + x117*x5; + const auto x120 = x63*(radial_eval_alpha_squared*x17 + x8*x97); + const auto x121 = radial_eval_alpha_squared*x67 + x14*x75; + const auto x122 = radial_eval_alpha + radial_eval_alpha_squared*x5; + const auto x123 = x0*x122; + const auto x124 = x2*x84; + const auto x125 = x122*x2; + const auto x126 = x4*x77; + const auto x127 = x122*x4*x5 + x126; + const auto x128 = x4*x84; + const auto x129 = x122*x4; + const auto x130 = x122*x4*x8; + const auto x131 = 6.0*x1 + x11*x122; + const auto x132 = x11*x72 + x131; + const auto x133 = x122*x5 + x77; + const auto x134 = x122*x8; + const auto x135 = x122*x14; + const auto x136 = x122*x15 + x15*x69 + x5*x68; + const auto x137 = x16*x75; + const auto x138 = x122*x5*x8 + x82; + const auto x139 = x122*x17; + const auto x140 = x16*x97; + const auto x141 = radial_eval_alpha + radial_eval_alpha_squared*x8; + const auto x142 = x0*x141; + const auto x143 = x141*x2; + const auto x144 = x141*x4*x5; + const auto x145 = x141*x4; + const auto x146 = x126 + x141*x4*x8; + const auto x147 = x11*x141; + const auto x148 = x141*x5; + const auto x149 = x141*x8 + x77; + const auto x150 = x14*x141 + 6.0*x3; + const auto x151 = x14*x72 + x150; + const auto x152 = x141*x15; + const auto x153 = x141*x5*x8 + x78; + const auto x154 
= x141*x17 + x17*x69 + x68*x8; + const auto x155 = x125 + x143 + x2*x69 + x73; + const auto x156 = x11*x69 + x131 + x147 + x86; + const auto x157 = x135 + x14*x69 + x150 + x92; + // Evaluate basis function - basis_eval[ipt + 0*npts] = radial_eval*x*x*x*x; - basis_eval[ipt + 1*npts] = radial_eval*x*x*x*y; - basis_eval[ipt + 2*npts] = radial_eval*x*x*x*z; - basis_eval[ipt + 3*npts] = radial_eval*x*x*y*y; - basis_eval[ipt + 4*npts] = radial_eval*x*x*y*z; - basis_eval[ipt + 5*npts] = radial_eval*x*x*z*z; - basis_eval[ipt + 6*npts] = radial_eval*x*y*y*y; - basis_eval[ipt + 7*npts] = radial_eval*x*y*y*z; - basis_eval[ipt + 8*npts] = radial_eval*x*y*z*z; - basis_eval[ipt + 9*npts] = radial_eval*x*z*z*z; - basis_eval[ipt + 10*npts] = radial_eval*y*y*y*y; - basis_eval[ipt + 11*npts] = radial_eval*y*y*y*z; - basis_eval[ipt + 12*npts] = radial_eval*y*y*z*z; - basis_eval[ipt + 13*npts] = radial_eval*y*z*z*z; - basis_eval[ipt + 14*npts] = radial_eval*z*z*z*z; + basis_eval[ipt + 0*npts] = radial_eval*x0; + basis_eval[ipt + 1*npts] = x1*x2; + basis_eval[ipt + 2*npts] = x2*x3; + basis_eval[ipt + 3*npts] = radial_eval*x6; + basis_eval[ipt + 4*npts] = x4*x7; + basis_eval[ipt + 5*npts] = radial_eval*x9; + basis_eval[ipt + 6*npts] = x10*x11; + basis_eval[ipt + 7*npts] = x12*x5; + basis_eval[ipt + 8*npts] = x13*x8; + basis_eval[ipt + 9*npts] = x10*x14; + basis_eval[ipt + 10*npts] = radial_eval*x15; + basis_eval[ipt + 11*npts] = x11*x3; + basis_eval[ipt + 12*npts] = radial_eval*x16; + basis_eval[ipt + 13*npts] = x1*x14; + basis_eval[ipt + 14*npts] = radial_eval*x17; // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = x*x*x*(4*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 1*npts] = x*x*y*(3*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 2*npts] = x*x*z*(3*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 3*npts] = x*y*y*(2*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 4*npts] = x*y*z*(2*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 5*npts] = x*z*z*(2*radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 6*npts] = y*y*y*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 7*npts] = y*y*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 8*npts] = y*z*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 9*npts] = z*z*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 10*npts] = radial_eval_alpha*x*y*y*y*y; - basis_x_eval[ipt + 11*npts] = radial_eval_alpha*x*y*y*y*z; - basis_x_eval[ipt + 12*npts] = radial_eval_alpha*x*y*y*z*z; - basis_x_eval[ipt + 13*npts] = radial_eval_alpha*x*y*z*z*z; - basis_x_eval[ipt + 14*npts] = radial_eval_alpha*x*z*z*z*z; + basis_x_eval[ipt + 0*npts] = radial_eval_alpha*x18 + x19*x2; + basis_x_eval[ipt + 1*npts] = x21*y; + basis_x_eval[ipt + 2*npts] = x21*z; + basis_x_eval[ipt + 3*npts] = x22*x5 + x24; + basis_x_eval[ipt + 4*npts] = x25*x27; + basis_x_eval[ipt + 5*npts] = x22*x8 + x29; + basis_x_eval[ipt + 6*npts] = x30 + x32; + basis_x_eval[ipt + 7*npts] = x35*z; + basis_x_eval[ipt + 8*npts] = x38*y; + basis_x_eval[ipt + 9*npts] = x39 + x41; + basis_x_eval[ipt + 10*npts] = x15*x42; + basis_x_eval[ipt + 11*npts] = x43; + basis_x_eval[ipt + 12*npts] = x16*x42; + basis_x_eval[ipt + 13*npts] = x44; + basis_x_eval[ipt + 14*npts] = x17*x42; // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = radial_eval_alpha*x*x*x*x*y; - basis_y_eval[ipt + 1*npts] = x*x*x*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 2*npts] = radial_eval_alpha*x*x*x*y*z; 
- basis_y_eval[ipt + 3*npts] = x*x*y*(2*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 4*npts] = x*x*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 5*npts] = radial_eval_alpha*x*x*y*z*z; - basis_y_eval[ipt + 6*npts] = x*y*y*(3*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 7*npts] = x*y*z*(2*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 8*npts] = x*z*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 9*npts] = radial_eval_alpha*x*y*z*z*z; - basis_y_eval[ipt + 10*npts] = y*y*y*(4*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 11*npts] = y*y*z*(3*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 12*npts] = y*z*z*(2*radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 13*npts] = z*z*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 14*npts] = radial_eval_alpha*y*z*z*z*z; + basis_y_eval[ipt + 0*npts] = x0*x45; + basis_y_eval[ipt + 1*npts] = x24 + x46; + basis_y_eval[ipt + 2*npts] = x47; + basis_y_eval[ipt + 3*npts] = x32 + x4*x48; + basis_y_eval[ipt + 4*npts] = x50*z; + basis_y_eval[ipt + 5*npts] = x45*x9; + basis_y_eval[ipt + 6*npts] = x*x51; + basis_y_eval[ipt + 7*npts] = x52*x54; + basis_y_eval[ipt + 8*npts] = x*(x36 + x55); + basis_y_eval[ipt + 9*npts] = x44; + basis_y_eval[ipt + 10*npts] = radial_eval_alpha*x56 + x11*x19; + basis_y_eval[ipt + 11*npts] = x51*z; + basis_y_eval[ipt + 12*npts] = x48*x8 + x58; + basis_y_eval[ipt + 13*npts] = x39 + x60; + basis_y_eval[ipt + 14*npts] = x17*x45; // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = radial_eval_alpha*x*x*x*x*z; - basis_z_eval[ipt + 1*npts] = radial_eval_alpha*x*x*x*y*z; - basis_z_eval[ipt + 2*npts] = x*x*x*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 3*npts] = radial_eval_alpha*x*x*y*y*z; - basis_z_eval[ipt + 4*npts] = x*x*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 5*npts] = x*x*z*(2*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 6*npts] = radial_eval_alpha*x*y*y*y*z; - basis_z_eval[ipt + 7*npts] = x*y*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 8*npts] = x*y*z*(2*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 9*npts] = x*z*z*(3*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 10*npts] = radial_eval_alpha*y*y*y*y*z; - basis_z_eval[ipt + 11*npts] = y*y*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 12*npts] = y*y*z*(2*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 13*npts] = y*z*z*(3*radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 14*npts] = z*z*z*(4*radial_eval + radial_eval_alpha*z*z); + basis_z_eval[ipt + 0*npts] = x0*x61; + basis_z_eval[ipt + 1*npts] = x47; + basis_z_eval[ipt + 2*npts] = x29 + x46; + basis_z_eval[ipt + 3*npts] = x6*x61; + basis_z_eval[ipt + 4*npts] = y*(x37 + x49); + basis_z_eval[ipt + 5*npts] = x4*x62 + x41; + basis_z_eval[ipt + 6*npts] = x43; + basis_z_eval[ipt + 7*npts] = x*(x33 + x55); + basis_z_eval[ipt + 8*npts] = x63*x65; + basis_z_eval[ipt + 9*npts] = x*x66; + basis_z_eval[ipt + 10*npts] = x15*x61; + basis_z_eval[ipt + 11*npts] = x30 + x58; + basis_z_eval[ipt + 12*npts] = x5*x62 + x60; + basis_z_eval[ipt + 13*npts] = x66*y; + basis_z_eval[ipt + 14*npts] = radial_eval_alpha*x67 + x14*x19; + // Evaluate Laplacian of bfn - basis_lapl_eval[ipt + 0*npts] = x*x*(12*radial_eval + 11*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x + radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*x*x*z*z); - basis_lapl_eval[ipt + 1*npts] = x*y*(6*radial_eval + 
11*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x + radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*x*x*z*z); - basis_lapl_eval[ipt + 2*npts] = x*z*(6*radial_eval + 11*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x + radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*x*x*z*z); - basis_lapl_eval[ipt + 3*npts] = x*x*y*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z) + x*x*(2*radial_eval + 5*radial_eval_alpha*y*y + radial_eval_alpha_squared*y*y*y*y) + y*y*(2*radial_eval + 5*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x); - basis_lapl_eval[ipt + 4*npts] = y*z*(2*radial_eval + 11*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x + radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*x*x*z*z); - basis_lapl_eval[ipt + 5*npts] = x*x*z*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y) + x*x*(2*radial_eval + 5*radial_eval_alpha*z*z + radial_eval_alpha_squared*z*z*z*z) + z*z*(2*radial_eval + 5*radial_eval_alpha*x*x + radial_eval_alpha_squared*x*x*x*x); - basis_lapl_eval[ipt + 6*npts] = x*y*(6*radial_eval + 11*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*y*y*y*y + radial_eval_alpha_squared*y*y*z*z); - basis_lapl_eval[ipt + 7*npts] = x*z*(2*radial_eval + 11*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*y*y*y*y + radial_eval_alpha_squared*y*y*z*z); - basis_lapl_eval[ipt + 8*npts] = x*y*(2*radial_eval + 11*radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z + radial_eval_alpha_squared*y*y*z*z + radial_eval_alpha_squared*z*z*z*z); - basis_lapl_eval[ipt + 9*npts] = x*z*(6*radial_eval + 11*radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z + radial_eval_alpha_squared*y*y*z*z + radial_eval_alpha_squared*z*z*z*z); - basis_lapl_eval[ipt + 10*npts] = y*y*(12*radial_eval + 11*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*y*y*y*y + radial_eval_alpha_squared*y*y*z*z); - basis_lapl_eval[ipt + 11*npts] = y*z*(6*radial_eval + 11*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*y*y*y*y + radial_eval_alpha_squared*y*y*z*z); - basis_lapl_eval[ipt + 12*npts] = y*y*z*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x) + y*y*(2*radial_eval + 5*radial_eval_alpha*z*z + radial_eval_alpha_squared*z*z*z*z) + z*z*(2*radial_eval + 5*radial_eval_alpha*y*y + radial_eval_alpha_squared*y*y*y*y); - basis_lapl_eval[ipt + 13*npts] = y*z*(6*radial_eval + 11*radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z + radial_eval_alpha_squared*y*y*z*z + radial_eval_alpha_squared*z*z*z*z); - basis_lapl_eval[ipt + 14*npts] = z*z*(12*radial_eval + 11*radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z + radial_eval_alpha_squared*y*y*z*z + radial_eval_alpha_squared*z*z*z*z); + basis_lapl_eval[ipt + 0*npts] = x123 + x142 + x71; + basis_lapl_eval[ipt + 1*npts] = x155*y; + basis_lapl_eval[ipt + 2*npts] = x155*z; + basis_lapl_eval[ipt + 3*npts] = x127 + x144 + x6*x69 + x79; + basis_lapl_eval[ipt + 4*npts] = x25*(x129 + x145 + x4*x69 + x80); + basis_lapl_eval[ipt + 5*npts] = x130 + x146 + x69*x9 + x83; + basis_lapl_eval[ipt + 6*npts] = x*x156; + basis_lapl_eval[ipt + 7*npts] = x52*(x133 + x148 + x5*x69 + x88); + basis_lapl_eval[ipt + 8*npts] = x63*(x134 + x149 + x69*x8 + x90); + basis_lapl_eval[ipt + 9*npts] = x*x157; + basis_lapl_eval[ipt + 10*npts] = x136 + x152 + x93; + basis_lapl_eval[ipt + 11*npts] = x156*z; + basis_lapl_eval[ipt + 12*npts] = x138 + x153 + x16*x69 + x94; + 
basis_lapl_eval[ipt + 13*npts] = x157*y; + basis_lapl_eval[ipt + 14*npts] = x139 + x154 + x95; + @@ -206,36 +370,36 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = radial_eval*x*x*x*x; - ang_eval_1 = radial_eval*x*x*x*y; - ang_eval_2 = radial_eval*x*x*x*z; - ang_eval_3 = radial_eval*x*x*y*y; + ang_eval_0 = radial_eval*x0; + ang_eval_1 = x1*x2; + ang_eval_2 = x2*x3; + ang_eval_3 = radial_eval*x6; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = radial_eval*x*x*y*z; - ang_eval_1 = radial_eval*x*x*z*z; - ang_eval_2 = radial_eval*x*y*y*y; - ang_eval_3 = radial_eval*x*y*y*z; + ang_eval_0 = x4*x7; + ang_eval_1 = radial_eval*x9; + ang_eval_2 = x10*x11; + ang_eval_3 = x12*x5; basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; basis_eval[ipt + 6*npts] = ang_eval_2; basis_eval[ipt + 7*npts] = ang_eval_3; - ang_eval_0 = radial_eval*x*y*z*z; - ang_eval_1 = radial_eval*x*z*z*z; - ang_eval_2 = radial_eval*y*y*y*y; - ang_eval_3 = radial_eval*y*y*y*z; + ang_eval_0 = x13*x8; + ang_eval_1 = x10*x14; + ang_eval_2 = radial_eval*x15; + ang_eval_3 = x11*x3; basis_eval[ipt + 8*npts] = ang_eval_0; basis_eval[ipt + 9*npts] = ang_eval_1; basis_eval[ipt + 10*npts] = ang_eval_2; basis_eval[ipt + 11*npts] = ang_eval_3; - ang_eval_0 = radial_eval*y*y*z*z; - ang_eval_1 = radial_eval*y*z*z*z; - ang_eval_2 = radial_eval*z*z*z*z; + ang_eval_0 = radial_eval*x16; + ang_eval_1 = x1*x14; + ang_eval_2 = radial_eval*x17; basis_eval[ipt + 12*npts] = ang_eval_0; basis_eval[ipt + 13*npts] = ang_eval_1; basis_eval[ipt + 14*npts] = ang_eval_2; @@ -246,18 +410,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; - dang_eval_x_0 = x*x*x*(4*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = radial_eval_alpha*x*x*x*x*y; - dang_eval_z_0 = radial_eval_alpha*x*x*x*x*z; - dang_eval_x_1 = x*x*y*(3*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = x*x*x*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_1 = radial_eval_alpha*x*x*x*y*z; - dang_eval_x_2 = x*x*z*(3*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_2 = radial_eval_alpha*x*x*x*y*z; - dang_eval_z_2 = x*x*x*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_3 = x*y*y*(2*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_3 = x*x*y*(2*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_3 = radial_eval_alpha*x*x*y*y*z; + dang_eval_x_0 = radial_eval_alpha*x18 + x19*x2; + dang_eval_y_0 = x0*x45; + dang_eval_z_0 = x0*x61; + dang_eval_x_1 = x21*y; + dang_eval_y_1 = x24 + x46; + dang_eval_z_1 = x47; + dang_eval_x_2 = x21*z; + dang_eval_y_2 = x47; + dang_eval_z_2 = x29 + x46; + dang_eval_x_3 = x22*x5 + x24; + dang_eval_y_3 = x32 + x4*x48; + dang_eval_z_3 = x6*x61; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; @@ -271,18 +435,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = x*y*z*(2*radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = x*x*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = x*x*y*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_1 = x*z*z*(2*radial_eval + 
radial_eval_alpha*x*x); - dang_eval_y_1 = radial_eval_alpha*x*x*y*z*z; - dang_eval_z_1 = x*x*z*(2*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = y*y*y*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_2 = x*y*y*(3*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_2 = radial_eval_alpha*x*y*y*y*z; - dang_eval_x_3 = y*y*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_3 = x*y*z*(2*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_3 = x*y*y*(radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x25*x27; + dang_eval_y_0 = x50*z; + dang_eval_z_0 = y*(x37 + x49); + dang_eval_x_1 = x22*x8 + x29; + dang_eval_y_1 = x45*x9; + dang_eval_z_1 = x4*x62 + x41; + dang_eval_x_2 = x30 + x32; + dang_eval_y_2 = x*x51; + dang_eval_z_2 = x43; + dang_eval_x_3 = x35*z; + dang_eval_y_3 = x52*x54; + dang_eval_z_3 = x*(x33 + x55); basis_x_eval[ipt + 4*npts] = dang_eval_x_0; basis_y_eval[ipt + 4*npts] = dang_eval_y_0; basis_z_eval[ipt + 4*npts] = dang_eval_z_0; @@ -296,18 +460,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 7*npts] = dang_eval_y_3; basis_z_eval[ipt + 7*npts] = dang_eval_z_3; - dang_eval_x_0 = y*z*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = x*z*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = x*y*z*(2*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_1 = z*z*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = radial_eval_alpha*x*y*z*z*z; - dang_eval_z_1 = x*z*z*(3*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = radial_eval_alpha*x*y*y*y*y; - dang_eval_y_2 = y*y*y*(4*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_2 = radial_eval_alpha*y*y*y*y*z; - dang_eval_x_3 = radial_eval_alpha*x*y*y*y*z; - dang_eval_y_3 = y*y*z*(3*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_3 = y*y*y*(radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x38*y; + dang_eval_y_0 = x*(x36 + x55); + dang_eval_z_0 = x63*x65; + dang_eval_x_1 = x39 + x41; + dang_eval_y_1 = x44; + dang_eval_z_1 = x*x66; + dang_eval_x_2 = x15*x42; + dang_eval_y_2 = radial_eval_alpha*x56 + x11*x19; + dang_eval_z_2 = x15*x61; + dang_eval_x_3 = x43; + dang_eval_y_3 = x51*z; + dang_eval_z_3 = x30 + x58; basis_x_eval[ipt + 8*npts] = dang_eval_x_0; basis_y_eval[ipt + 8*npts] = dang_eval_y_0; basis_z_eval[ipt + 8*npts] = dang_eval_z_0; @@ -321,15 +485,15 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 11*npts] = dang_eval_y_3; basis_z_eval[ipt + 11*npts] = dang_eval_z_3; - dang_eval_x_0 = radial_eval_alpha*x*y*y*z*z; - dang_eval_y_0 = y*z*z*(2*radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = y*y*z*(2*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_1 = radial_eval_alpha*x*y*z*z*z; - dang_eval_y_1 = z*z*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_1 = y*z*z*(3*radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = radial_eval_alpha*x*z*z*z*z; - dang_eval_y_2 = radial_eval_alpha*y*z*z*z*z; - dang_eval_z_2 = z*z*z*(4*radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x16*x42; + dang_eval_y_0 = x48*x8 + x58; + dang_eval_z_0 = x5*x62 + x60; + dang_eval_x_1 = x44; + dang_eval_y_1 = x39 + x60; + dang_eval_z_1 = x66*y; + dang_eval_x_2 = x17*x42; + dang_eval_y_2 = x17*x45; + dang_eval_z_2 = radial_eval_alpha*x67 + x14*x19; basis_x_eval[ipt + 12*npts] = dang_eval_x_0; basis_y_eval[ipt + 12*npts] = dang_eval_y_0; basis_z_eval[ipt + 12*npts] = dang_eval_z_0; diff --git 
a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l0.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l0.hpp index 0788c8ce..2dd909dc 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l0.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l0.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -64,7 +68,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_eval = task->bf + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -93,7 +96,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel } - + // Common Subexpressions + // Evaluate basis function basis_eval[ipt + 0*npts] = radial_eval; @@ -105,6 +109,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel + + #if 0 // Evaluate the angular part of bfn diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l0_gradient.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l0_gradient.hpp index 4dd7dac5..2b74a4f0 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l0_gradient.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l0_gradient.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -67,7 +71,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -99,7 +102,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; - + // Common Subexpressions + // Evaluate basis function basis_eval[ipt + 0*npts] = radial_eval; @@ -119,6 +123,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel + + #if 0 // Evaluate the angular part of bfn diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l0_hessian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l0_hessian.hpp index 6dff65ca..bb156174 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l0_hessian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l0_hessian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_spherical_hessian_0( +__global__ __launch_bounds__(256,2) void collocation_device_shell_to_task_kernel_spherical_hessian_0( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[8][detail::shell_nprim_max + 1]; + __shared__ double coeff[8][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; @@ -108,7 +111,12 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = radial_eval_alpha_squared*(x*x); + const auto x1 = radial_eval_alpha_squared*x; + const auto x2 = radial_eval_alpha_squared*(y*y); + const auto x3 = radial_eval_alpha_squared*(z*z); + // Evaluate basis function basis_eval[ipt + 0*npts] = radial_eval; @@ -125,22 +133,24 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_z_eval[ipt + 0*npts] = radial_eval_alpha*z; // Evaluate second derivative of bfn wrt xx - basis_xx_eval[ipt + 0*npts] = radial_eval_alpha + radial_eval_alpha_squared*x*x; + 
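The common-subexpression form introduced here is algebraically identical to the removed expressions: for an s-type contracted Gaussian R(r) = sum_i c_i exp(-alpha_i r^2), the diagonal Hessian entry is d^2R/dx^2 = sum_i c_i (-2 alpha_i + 4 alpha_i^2 x^2) exp(-alpha_i r^2), which is why the accumulated radial sums are scaled by -2 (radial_eval_alpha) and 4 (radial_eval_alpha_squared) before x0 = radial_eval_alpha_squared*(x*x) is formed. A minimal host-side reference sketch, not part of the generated kernel (the names c, a and nprim are illustrative):

#include <cmath>

// Reference d^2/dx^2 of an s-type contracted Gaussian; equals the kernel's
// radial_eval_alpha + radial_eval_alpha_squared*x*x at the same point.
double s_shell_d2dx2(const double* c, const double* a, int nprim,
                     double x, double y, double z) {
  const double rsq = x*x + y*y + z*z;
  double d2 = 0.0;
  for (int i = 0; i < nprim; ++i) {
    const double e = c[i] * std::exp(-a[i] * rsq);
    d2 += (-2.0*a[i] + 4.0*a[i]*a[i]*x*x) * e;  // (-2a + 4 a^2 x^2) * e
  }
  return d2;
}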
basis_xx_eval[ipt + 0*npts] = radial_eval_alpha + x0; // Evaluate second derivative of bfn wrt xy - basis_xy_eval[ipt + 0*npts] = radial_eval_alpha_squared*x*y; + basis_xy_eval[ipt + 0*npts] = x1*y; // Evaluate second derivative of bfn wrt xz - basis_xz_eval[ipt + 0*npts] = radial_eval_alpha_squared*x*z; + basis_xz_eval[ipt + 0*npts] = x1*z; // Evaluate second derivative of bfn wrt yy - basis_yy_eval[ipt + 0*npts] = radial_eval_alpha + radial_eval_alpha_squared*y*y; + basis_yy_eval[ipt + 0*npts] = radial_eval_alpha + x2; // Evaluate second derivative of bfn wrt yz basis_yz_eval[ipt + 0*npts] = radial_eval_alpha_squared*y*z; // Evaluate second derivative of bfn wrt zz - basis_zz_eval[ipt + 0*npts] = radial_eval_alpha + radial_eval_alpha_squared*z*z; + basis_zz_eval[ipt + 0*npts] = radial_eval_alpha + x3; + + diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l0_lapgrad.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l0_lapgrad.hpp new file mode 100644 index 00000000..a6f5542c --- /dev/null +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l0_lapgrad.hpp @@ -0,0 +1,208 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#pragma once +#include "collocation_device_constants.hpp" +#include "device/xc_device_task.hpp" +#include "device_specific/cuda_device_constants.hpp" +#include "device/common/shell_to_task.hpp" +#include + +namespace GauXC { + + +__global__ __launch_bounds__(256,2) void collocation_device_shell_to_task_kernel_spherical_lapgrad_0( + uint32_t nshell, + ShellToTaskDevice* __restrict__ shell_to_task, + XCDeviceTask* __restrict__ device_tasks +) { + + + __shared__ double alpha[8][detail::shell_nprim_max + 1]; + __shared__ double coeff[8][detail::shell_nprim_max + 1]; + double* my_alpha = alpha[threadIdx.x/32]; + double* my_coeff = coeff[threadIdx.x/32]; + + for( auto ish = blockIdx.z; ish < nshell; ish += gridDim.z ) { + const uint32_t ntasks = shell_to_task[ish].ntask; + const auto shell = shell_to_task[ish].shell_device; + const auto task_idx = shell_to_task[ish].task_idx_device; + const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device; + + + // Load Shell Data into registers / SM + const uint32_t nprim = shell->nprim(); + const double3 O = *reinterpret_cast(shell->O_data()); + + const int global_warp_id = (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size; + const int nwarp_global = max((blockDim.x*gridDim.x) / cuda::warp_size,1); + + // Read in coeffs/exps into SM on first warp + { + auto* coeff_gm = shell->coeff_data(); + auto* alpha_gm = shell->alpha_data(); + static_assert( detail::shell_nprim_max == cuda::warp_size ); + const int warp_rank = threadIdx.x % cuda::warp_size; + my_alpha[warp_rank] = alpha_gm[warp_rank]; + my_coeff[warp_rank] = coeff_gm[warp_rank]; + } + + // Loop over tasks assigned to shells + // Place each task on a different warp + schedule across blocks + for( int itask = global_warp_id; itask < ntasks; itask += nwarp_global ) { + + const auto* task = device_tasks + task_idx[itask]; + const auto* __restrict__ points_x = task->points_x; + const auto* 
__restrict__ points_y = task->points_y; + const auto* __restrict__ points_z = task->points_z; + const uint32_t npts = task->npts; + const size_t shoff = task_shell_offs[itask] * npts; + + auto* __restrict__ basis_eval = task->bf + shoff; + auto* __restrict__ basis_x_eval = task->dbfx + shoff; + auto* __restrict__ basis_y_eval = task->dbfy + shoff; + auto* __restrict__ basis_z_eval = task->dbfz + shoff; + auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; + auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; + auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; + auto* __restrict__ basis_yy_eval = task->d2bfyy + shoff; + auto* __restrict__ basis_yz_eval = task->d2bfyz + shoff; + auto* __restrict__ basis_zz_eval = task->d2bfzz + shoff; + auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; + auto* __restrict__ basis_lapl_x_eval = task->d3bflapl_x + shoff; + auto* __restrict__ basis_lapl_y_eval = task->d3bflapl_y + shoff; + auto* __restrict__ basis_lapl_z_eval = task->d3bflapl_z + shoff; + + // Loop over points in task + // Assign each point to separate thread within the warp + #pragma unroll 1 + for( int ipt = threadIdx.x % cuda::warp_size; ipt < npts; ipt += cuda::warp_size ) { + //const double3 point = points[ipt]; + double3 point; + point.x = points_x[ipt]; + point.y = points_y[ipt]; + point.z = points_z[ipt]; + + + const auto x = point.x - O.x; + const auto y = point.y - O.y; + const auto z = point.z - O.z; + const auto rsq = x*x + y*y + z*z; + + // Evaluate radial part of bfn + double radial_eval = 0.; + double radial_eval_alpha = 0.; + double radial_eval_alpha_squared = 0.; + double radial_eval_alpha_cubed = 0.; + + #pragma unroll 1 + for( uint32_t i = 0; i < nprim; ++i ) { + const auto a = my_alpha[i]; + const auto e = my_coeff[i] * std::exp( - a * rsq ); + + radial_eval += e; + radial_eval_alpha += a * e; + radial_eval_alpha_squared += a * a * e; + radial_eval_alpha_cubed += a * a * a * e; + } + + radial_eval_alpha *= -2; + radial_eval_alpha_squared *= 4; + radial_eval_alpha_cubed *= -8; + + // Common Subexpressions + const auto x0 = x*x; + const auto x1 = radial_eval_alpha_squared*x0; + const auto x2 = radial_eval_alpha_squared*x; + const auto x3 = y*y; + const auto x4 = radial_eval_alpha_squared*x3; + const auto x5 = radial_eval_alpha_squared*y; + const auto x6 = z*z; + const auto x7 = radial_eval_alpha_squared*x6; + const auto x8 = radial_eval_alpha_cubed*x; + const auto x9 = radial_eval_alpha_cubed*y; + const auto x10 = radial_eval_alpha_cubed*z; + + + // Evaluate basis function + basis_eval[ipt + 0*npts] = radial_eval; + + + + // Evaluate first derivative of bfn wrt x + basis_x_eval[ipt + 0*npts] = radial_eval_alpha*x; + + // Evaluate first derivative of bfn wrt y + basis_y_eval[ipt + 0*npts] = radial_eval_alpha*y; + + // Evaluate first derivative of bfn wrt z + basis_z_eval[ipt + 0*npts] = radial_eval_alpha*z; + + // Evaluate second derivative of bfn wrt xx + basis_xx_eval[ipt + 0*npts] = radial_eval_alpha + x1; + + // Evaluate second derivative of bfn wrt xy + basis_xy_eval[ipt + 0*npts] = x2*y; + + // Evaluate second derivative of bfn wrt xz + basis_xz_eval[ipt + 0*npts] = x2*z; + + // Evaluate second derivative of bfn wrt yy + basis_yy_eval[ipt + 0*npts] = radial_eval_alpha + x4; + + // Evaluate second derivative of bfn wrt yz + basis_yz_eval[ipt + 0*npts] = x5*z; + + // Evaluate second derivative of bfn wrt zz + basis_zz_eval[ipt + 0*npts] = radial_eval_alpha + x7; + + // Evaluate Laplacian of bfn + basis_lapl_eval[ipt + 0*npts] = 
3.0*radial_eval_alpha + x1 + x4 + x7; + + // Evaluate Laplacian gradient of bfn (dx) + basis_lapl_x_eval[ipt + 0*npts] = radial_eval_alpha_cubed*(x*x*x) + 5.0*x2 + x3*x8 + x6*x8; + // Evaluate Laplacian gradient of bfn (dy) + basis_lapl_y_eval[ipt + 0*npts] = radial_eval_alpha_cubed*(y*y*y) + x0*x9 + 5.0*x5 + x6*x9; + // Evaluate Laplacian gradient of bfn (dz) + basis_lapl_z_eval[ipt + 0*npts] = radial_eval_alpha_cubed*(z*z*z) + 5.0*radial_eval_alpha_squared*z + x0*x10 + x10*x3; + + + + +#if 0 + // Evaluate the angular part of bfn + + + + double ang_eval_0; + + + ang_eval_0 = radial_eval; + basis_eval[ipt + 0*npts] = ang_eval_0; + + + double dang_eval_x_0, dang_eval_y_0, dang_eval_z_0; + + dang_eval_x_0 = radial_eval_alpha*x; + dang_eval_y_0 = radial_eval_alpha*y; + dang_eval_z_0 = radial_eval_alpha*z; + basis_x_eval[ipt + 0*npts] = dang_eval_x_0; + basis_y_eval[ipt + 0*npts] = dang_eval_y_0; + basis_z_eval[ipt + 0*npts] = dang_eval_z_0; + +#endif + } // Loop over points within task + } // Loop over tasks + + } // Loop over shells +} // end kernel + +} // namespace GauXC diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l0_laplacian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l0_laplacian.hpp index de0353b5..fad0a511 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l0_laplacian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l0_laplacian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_spherical_laplacian_0( +__global__ __launch_bounds__(256,2) void collocation_device_shell_to_task_kernel_spherical_laplacian_0( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[8][detail::shell_nprim_max + 1]; + __shared__ double coeff[8][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; // Loop over points in task @@ -103,7 +106,12 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = radial_eval_alpha_squared*(x*x); + const auto x1 = radial_eval_alpha_squared*x; + const auto x2 = radial_eval_alpha_squared*(y*y); + const auto x3 = radial_eval_alpha_squared*(z*z); + // Evaluate basis function basis_eval[ipt + 0*npts] = radial_eval; @@ -119,8 +127,10 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel // Evaluate first derivative of bfn wrt z basis_z_eval[ipt + 0*npts] = radial_eval_alpha*z; + // Evaluate Laplacian of bfn - basis_lapl_eval[ipt + 0*npts] = 3*radial_eval_alpha + radial_eval_alpha_squared*x*x + radial_eval_alpha_squared*y*y + radial_eval_alpha_squared*z*z; + basis_lapl_eval[ipt + 0*npts] = 3.0*radial_eval_alpha + x0 + x2 + x3; + diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l1.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l1.hpp index 709c0298..9e867997 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l1.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l1.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -64,7 +68,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_eval = task->bf + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -93,7 +96,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel } - + // Common Subexpressions + // Evaluate basis function basis_eval[ipt + 0*npts] = radial_eval*y; @@ -107,6 +111,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel + + #if 0 // Evaluate the angular part of bfn diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l1_gradient.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l1_gradient.hpp index 0fe5eb9d..bed3c691 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l1_gradient.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l1_gradient.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_spherical_gradient_1( +__global__ __launch_bounds__(256,2) void collocation_device_shell_to_task_kernel_spherical_gradient_1( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[8][detail::shell_nprim_max + 1]; + __shared__ double coeff[8][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -67,7 +71,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -99,7 +102,12 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; - + // Common Subexpressions + const auto x0 = radial_eval_alpha*x; + const auto x1 = x0*y; + const auto x2 = x0*z; + const auto x3 = radial_eval_alpha*y*z; + // Evaluate basis function basis_eval[ipt + 0*npts] = radial_eval*y; @@ -109,19 +117,21 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = radial_eval_alpha*x*y; - basis_x_eval[ipt + 1*npts] = radial_eval_alpha*x*z; - basis_x_eval[ipt + 2*npts] = radial_eval + radial_eval_alpha*x*x; + basis_x_eval[ipt + 0*npts] = x1; + basis_x_eval[ipt + 1*npts] = x2; + basis_x_eval[ipt + 2*npts] = radial_eval + radial_eval_alpha*(x*x); // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = radial_eval + radial_eval_alpha*y*y; - basis_y_eval[ipt 
+ 1*npts] = radial_eval_alpha*y*z; - basis_y_eval[ipt + 2*npts] = radial_eval_alpha*x*y; + basis_y_eval[ipt + 0*npts] = radial_eval + radial_eval_alpha*(y*y); + basis_y_eval[ipt + 1*npts] = x3; + basis_y_eval[ipt + 2*npts] = x1; // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = radial_eval_alpha*y*z; - basis_z_eval[ipt + 1*npts] = radial_eval + radial_eval_alpha*z*z; - basis_z_eval[ipt + 2*npts] = radial_eval_alpha*x*z; + basis_z_eval[ipt + 0*npts] = x3; + basis_z_eval[ipt + 1*npts] = radial_eval + radial_eval_alpha*(z*z); + basis_z_eval[ipt + 2*npts] = x2; + + @@ -149,15 +159,15 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_1, dang_eval_y_1, dang_eval_z_1; double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; - dang_eval_x_0 = radial_eval_alpha*x*y; - dang_eval_y_0 = radial_eval + radial_eval_alpha*y*y; - dang_eval_z_0 = radial_eval_alpha*y*z; - dang_eval_x_1 = radial_eval_alpha*x*z; - dang_eval_y_1 = radial_eval_alpha*y*z; - dang_eval_z_1 = radial_eval + radial_eval_alpha*z*z; - dang_eval_x_2 = radial_eval + radial_eval_alpha*x*x; - dang_eval_y_2 = radial_eval_alpha*x*y; - dang_eval_z_2 = radial_eval_alpha*x*z; + dang_eval_x_0 = x1; + dang_eval_y_0 = radial_eval + radial_eval_alpha*(y*y); + dang_eval_z_0 = x3; + dang_eval_x_1 = x2; + dang_eval_y_1 = x3; + dang_eval_z_1 = radial_eval + radial_eval_alpha*(z*z); + dang_eval_x_2 = radial_eval + radial_eval_alpha*(x*x); + dang_eval_y_2 = x1; + dang_eval_z_2 = x2; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l1_hessian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l1_hessian.hpp index e70d24f8..273f5df5 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l1_hessian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l1_hessian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_spherical_hessian_1( +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_spherical_hessian_1( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; @@ -108,7 +111,31 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = radial_eval_alpha*x; + const auto x1 = x0*y; + const auto x2 = x0*z; + const auto x3 = x*x; + const auto x4 = y*y; + const auto x5 = y*z; + const auto x6 = radial_eval_alpha*x5; + const auto x7 = z*z; + const auto x8 = radial_eval_alpha_squared*x3; + const auto x9 = radial_eval_alpha + x8; + const auto x10 = x9*y; + const auto x11 = x9*z; + const auto x12 = 3.0*radial_eval_alpha; + const auto x13 = radial_eval_alpha_squared*x4; + const auto x14 = radial_eval_alpha + x13; + const auto x15 = x*x14; + const auto x16 = radial_eval_alpha_squared*x*x5; + const auto x17 = radial_eval_alpha_squared*x7; + const auto x18 = radial_eval_alpha + x17; + const auto x19 = x*x18; + const auto x20 = x14*z; + const auto x21 = x18*y; + const auto x22 = 5.0*radial_eval_alpha + x13 + x17 + x8; + // Evaluate basis function basis_eval[ipt + 0*npts] = radial_eval*y; @@ -118,49 +145,51 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = radial_eval_alpha*x*y; - basis_x_eval[ipt + 1*npts] = radial_eval_alpha*x*z; - basis_x_eval[ipt + 2*npts] = radial_eval + radial_eval_alpha*x*x; + basis_x_eval[ipt + 0*npts] = x1; + basis_x_eval[ipt + 1*npts] = x2; + basis_x_eval[ipt + 2*npts] = radial_eval + radial_eval_alpha*x3; // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = radial_eval + radial_eval_alpha*y*y; - basis_y_eval[ipt + 1*npts] = radial_eval_alpha*y*z; - basis_y_eval[ipt + 2*npts] = radial_eval_alpha*x*y; + basis_y_eval[ipt + 0*npts] = radial_eval + radial_eval_alpha*x4; + basis_y_eval[ipt + 1*npts] = x6; + basis_y_eval[ipt + 2*npts] = x1; // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = radial_eval_alpha*y*z; - basis_z_eval[ipt + 1*npts] = radial_eval + radial_eval_alpha*z*z; - basis_z_eval[ipt + 2*npts] = radial_eval_alpha*x*z; + basis_z_eval[ipt + 0*npts] = x6; + basis_z_eval[ipt + 1*npts] = radial_eval + radial_eval_alpha*x7; + basis_z_eval[ipt + 2*npts] = x2; // Evaluate second derivative of bfn wrt xx - basis_xx_eval[ipt + 0*npts] = y*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 1*npts] = z*(radial_eval_alpha 
+ radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 2*npts] = x*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xx_eval[ipt + 0*npts] = x10; + basis_xx_eval[ipt + 1*npts] = x11; + basis_xx_eval[ipt + 2*npts] = x*(x12 + x8); // Evaluate second derivative of bfn wrt xy - basis_xy_eval[ipt + 0*npts] = x*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_xy_eval[ipt + 1*npts] = radial_eval_alpha_squared*x*y*z; - basis_xy_eval[ipt + 2*npts] = y*(radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xy_eval[ipt + 0*npts] = x15; + basis_xy_eval[ipt + 1*npts] = x16; + basis_xy_eval[ipt + 2*npts] = x10; // Evaluate second derivative of bfn wrt xz - basis_xz_eval[ipt + 0*npts] = radial_eval_alpha_squared*x*y*z; - basis_xz_eval[ipt + 1*npts] = x*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_xz_eval[ipt + 2*npts] = z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); + basis_xz_eval[ipt + 0*npts] = x16; + basis_xz_eval[ipt + 1*npts] = x19; + basis_xz_eval[ipt + 2*npts] = x11; // Evaluate second derivative of bfn wrt yy - basis_yy_eval[ipt + 0*npts] = y*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 1*npts] = z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 2*npts] = x*(radial_eval_alpha + radial_eval_alpha_squared*y*y); + basis_yy_eval[ipt + 0*npts] = y*(x12 + x13); + basis_yy_eval[ipt + 1*npts] = x20; + basis_yy_eval[ipt + 2*npts] = x15; // Evaluate second derivative of bfn wrt yz - basis_yz_eval[ipt + 0*npts] = z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yz_eval[ipt + 1*npts] = y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_yz_eval[ipt + 2*npts] = radial_eval_alpha_squared*x*y*z; + basis_yz_eval[ipt + 0*npts] = x20; + basis_yz_eval[ipt + 1*npts] = x21; + basis_yz_eval[ipt + 2*npts] = x16; // Evaluate second derivative of bfn wrt zz - basis_zz_eval[ipt + 0*npts] = y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 1*npts] = z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 2*npts] = x*(radial_eval_alpha + radial_eval_alpha_squared*z*z); + basis_zz_eval[ipt + 0*npts] = x21; + basis_zz_eval[ipt + 1*npts] = z*(x12 + x17); + basis_zz_eval[ipt + 2*npts] = x19; + + @@ -187,15 +216,15 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_1, dang_eval_y_1, dang_eval_z_1; double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; - dang_eval_x_0 = radial_eval_alpha*x*y; - dang_eval_y_0 = radial_eval + radial_eval_alpha*y*y; - dang_eval_z_0 = radial_eval_alpha*y*z; - dang_eval_x_1 = radial_eval_alpha*x*z; - dang_eval_y_1 = radial_eval_alpha*y*z; - dang_eval_z_1 = radial_eval + radial_eval_alpha*z*z; - dang_eval_x_2 = radial_eval + radial_eval_alpha*x*x; - dang_eval_y_2 = radial_eval_alpha*x*y; - dang_eval_z_2 = radial_eval_alpha*x*z; + dang_eval_x_0 = x1; + dang_eval_y_0 = radial_eval + radial_eval_alpha*x4; + dang_eval_z_0 = x6; + dang_eval_x_1 = x2; + dang_eval_y_1 = x6; + dang_eval_z_1 = radial_eval + radial_eval_alpha*x7; + dang_eval_x_2 = radial_eval + radial_eval_alpha*x3; + dang_eval_y_2 = x1; + dang_eval_z_2 = x2; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l1_lapgrad.hpp 
b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l1_lapgrad.hpp new file mode 100644 index 00000000..e0983fed --- /dev/null +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l1_lapgrad.hpp @@ -0,0 +1,285 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#pragma once +#include "collocation_device_constants.hpp" +#include "device/xc_device_task.hpp" +#include "device_specific/cuda_device_constants.hpp" +#include "device/common/shell_to_task.hpp" +#include + +namespace GauXC { + + +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_spherical_lapgrad_1( + uint32_t nshell, + ShellToTaskDevice* __restrict__ shell_to_task, + XCDeviceTask* __restrict__ device_tasks +) { + + + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; + double* my_alpha = alpha[threadIdx.x/32]; + double* my_coeff = coeff[threadIdx.x/32]; + + for( auto ish = blockIdx.z; ish < nshell; ish += gridDim.z ) { + const uint32_t ntasks = shell_to_task[ish].ntask; + const auto shell = shell_to_task[ish].shell_device; + const auto task_idx = shell_to_task[ish].task_idx_device; + const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device; + + + // Load Shell Data into registers / SM + const uint32_t nprim = shell->nprim(); + const double3 O = *reinterpret_cast(shell->O_data()); + + const int global_warp_id = (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size; + const int nwarp_global = max((blockDim.x*gridDim.x) / cuda::warp_size,1); + + // Read in coeffs/exps into SM on first warp + { + auto* coeff_gm = shell->coeff_data(); + auto* alpha_gm = shell->alpha_data(); + static_assert( detail::shell_nprim_max == cuda::warp_size ); + const int warp_rank = threadIdx.x % cuda::warp_size; + my_alpha[warp_rank] = alpha_gm[warp_rank]; + my_coeff[warp_rank] = coeff_gm[warp_rank]; + } + + // Loop over tasks assigned to shells + // Place each task on a different warp + schedule across blocks + for( int itask = global_warp_id; itask < ntasks; itask += nwarp_global ) { + + const auto* task = device_tasks + task_idx[itask]; + const auto* __restrict__ points_x = task->points_x; + const auto* __restrict__ points_y = task->points_y; + const auto* __restrict__ points_z = task->points_z; + const uint32_t npts = task->npts; + const size_t shoff = task_shell_offs[itask] * npts; + + auto* __restrict__ basis_eval = task->bf + shoff; + auto* __restrict__ basis_x_eval = task->dbfx + shoff; + auto* __restrict__ basis_y_eval = task->dbfy + shoff; + auto* __restrict__ basis_z_eval = task->dbfz + shoff; + auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; + auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; + auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; + auto* __restrict__ basis_yy_eval = task->d2bfyy + shoff; + auto* __restrict__ basis_yz_eval = task->d2bfyz + shoff; + auto* __restrict__ basis_zz_eval = task->d2bfzz + shoff; + auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; + auto* __restrict__ basis_lapl_x_eval = task->d3bflapl_x + shoff; + auto* __restrict__ basis_lapl_y_eval = task->d3bflapl_y + shoff; + 
auto* __restrict__ basis_lapl_z_eval = task->d3bflapl_z + shoff; + + // Loop over points in task + // Assign each point to separate thread within the warp + #pragma unroll 1 + for( int ipt = threadIdx.x % cuda::warp_size; ipt < npts; ipt += cuda::warp_size ) { + //const double3 point = points[ipt]; + double3 point; + point.x = points_x[ipt]; + point.y = points_y[ipt]; + point.z = points_z[ipt]; + + + const auto x = point.x - O.x; + const auto y = point.y - O.y; + const auto z = point.z - O.z; + const auto rsq = x*x + y*y + z*z; + + // Evaluate radial part of bfn + double radial_eval = 0.; + double radial_eval_alpha = 0.; + double radial_eval_alpha_squared = 0.; + double radial_eval_alpha_cubed = 0.; + + #pragma unroll 1 + for( uint32_t i = 0; i < nprim; ++i ) { + const auto a = my_alpha[i]; + const auto e = my_coeff[i] * std::exp( - a * rsq ); + + radial_eval += e; + radial_eval_alpha += a * e; + radial_eval_alpha_squared += a * a * e; + radial_eval_alpha_cubed += a * a * a * e; + } + + radial_eval_alpha *= -2; + radial_eval_alpha_squared *= 4; + radial_eval_alpha_cubed *= -8; + + // Common Subexpressions + const auto x0 = radial_eval_alpha*x; + const auto x1 = x0*y; + const auto x2 = x0*z; + const auto x3 = x*x; + const auto x4 = x3; + const auto x5 = y*y; + const auto x6 = x5; + const auto x7 = y*z; + const auto x8 = radial_eval_alpha*x7; + const auto x9 = z*z; + const auto x10 = x9; + const auto x11 = radial_eval_alpha_squared*x4; + const auto x12 = radial_eval_alpha + x11; + const auto x13 = x12*y; + const auto x14 = x12*z; + const auto x15 = 3.0*radial_eval_alpha; + const auto x16 = radial_eval_alpha_squared*x6; + const auto x17 = radial_eval_alpha + x16; + const auto x18 = x*x17; + const auto x19 = radial_eval_alpha_squared*x*x7; + const auto x20 = radial_eval_alpha_squared*x10; + const auto x21 = radial_eval_alpha + x20; + const auto x22 = x*x21; + const auto x23 = x17*z; + const auto x24 = x21*y; + const auto x25 = 5.0*radial_eval_alpha; + const auto x26 = x16 + x20 + x25; + const auto x27 = x11 + x26; + const auto x28 = 5.0*radial_eval_alpha_squared; + const auto x29 = radial_eval_alpha_cubed*(x*x*x); + const auto x30 = radial_eval_alpha_cubed*x6 + radial_eval_alpha_squared; + const auto x31 = radial_eval_alpha_cubed*x10 + radial_eval_alpha_squared; + const auto x32 = x*x28 + x*x30 + x*x31 + x29; + const auto x33 = 3.0*radial_eval_alpha_squared; + const auto x34 = radial_eval_alpha_cubed*(y*y*y); + const auto x35 = radial_eval_alpha_cubed*x4 + radial_eval_alpha_squared; + const auto x36 = x11 + x25; + const auto x37 = x28*y + x31*y + x34 + x35*y; + const auto x38 = radial_eval_alpha_cubed*(z*z*z); + const auto x39 = x28*z + x30*z + x35*z + x38; + + + // Evaluate basis function + basis_eval[ipt + 0*npts] = radial_eval*y; + basis_eval[ipt + 1*npts] = radial_eval*z; + basis_eval[ipt + 2*npts] = radial_eval*x; + + + + // Evaluate first derivative of bfn wrt x + basis_x_eval[ipt + 0*npts] = x1; + basis_x_eval[ipt + 1*npts] = x2; + basis_x_eval[ipt + 2*npts] = radial_eval + radial_eval_alpha*x4; + + // Evaluate first derivative of bfn wrt y + basis_y_eval[ipt + 0*npts] = radial_eval + radial_eval_alpha*x6; + basis_y_eval[ipt + 1*npts] = x8; + basis_y_eval[ipt + 2*npts] = x1; + + // Evaluate first derivative of bfn wrt z + basis_z_eval[ipt + 0*npts] = x8; + basis_z_eval[ipt + 1*npts] = radial_eval + radial_eval_alpha*x10; + basis_z_eval[ipt + 2*npts] = x2; + + // Evaluate second derivative of bfn wrt xx + basis_xx_eval[ipt + 0*npts] = x13; + basis_xx_eval[ipt + 1*npts] = x14; + 
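The subexpressions x26 through x39 defined above feed the Laplacian and Laplacian-gradient stores below. For the radial factor alone, d/dx of the Laplacian of R(r) = sum_i c_i exp(-alpha_i r^2) is sum_i c_i (20 alpha_i^2 x - 8 alpha_i^3 x r^2) exp(-alpha_i r^2), i.e. 5*radial_eval_alpha_squared*x + radial_eval_alpha_cubed*x*rsq with the -8 scaling applied to radial_eval_alpha_cubed above; the l = 1 kernels then add the product-rule terms coming from the x, y and z angular prefactors. A minimal host-side sketch of the radial part only, not part of the generated code (c, a and nprim are illustrative names):

#include <cmath>

// d/dx of the Laplacian of the radial factor R(r); this is the s-shell
// basis_lapl_x value, and the p-shell values add product-rule terms to it.
double s_shell_dlapl_dx(const double* c, const double* a, int nprim,
                        double x, double y, double z) {
  const double rsq = x*x + y*y + z*z;
  double v = 0.0;
  for (int i = 0; i < nprim; ++i) {
    const double e = c[i] * std::exp(-a[i] * rsq);
    v += (20.0*a[i]*a[i]*x - 8.0*a[i]*a[i]*a[i]*x*rsq) * e;  // (20 a^2 - 8 a^3 r^2) x e
  }
  return v;
}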
basis_xx_eval[ipt + 2*npts] = x*(x11 + x15); + + // Evaluate second derivative of bfn wrt xy + basis_xy_eval[ipt + 0*npts] = x18; + basis_xy_eval[ipt + 1*npts] = x19; + basis_xy_eval[ipt + 2*npts] = x13; + + // Evaluate second derivative of bfn wrt xz + basis_xz_eval[ipt + 0*npts] = x19; + basis_xz_eval[ipt + 1*npts] = x22; + basis_xz_eval[ipt + 2*npts] = x14; + + // Evaluate second derivative of bfn wrt yy + basis_yy_eval[ipt + 0*npts] = y*(x15 + x16); + basis_yy_eval[ipt + 1*npts] = x23; + basis_yy_eval[ipt + 2*npts] = x18; + + // Evaluate second derivative of bfn wrt yz + basis_yz_eval[ipt + 0*npts] = x23; + basis_yz_eval[ipt + 1*npts] = x24; + basis_yz_eval[ipt + 2*npts] = x19; + + // Evaluate second derivative of bfn wrt zz + basis_zz_eval[ipt + 0*npts] = x24; + basis_zz_eval[ipt + 1*npts] = z*(x15 + x20); + basis_zz_eval[ipt + 2*npts] = x22; + + // Evaluate Laplacian of bfn + basis_lapl_eval[ipt + 0*npts] = x27*y; + basis_lapl_eval[ipt + 1*npts] = x27*z; + basis_lapl_eval[ipt + 2*npts] = x*x27; + + // Evaluate Laplacian gradient of bfn (dx) + basis_lapl_x_eval[ipt + 0*npts] = x32*y; + basis_lapl_x_eval[ipt + 1*npts] = x32*z; + basis_lapl_x_eval[ipt + 2*npts] = x*(x*x33 + x29) + x26 + x3*x30 + x3*x31 + x33*x4; + // Evaluate Laplacian gradient of bfn (dy) + basis_lapl_y_eval[ipt + 0*npts] = x20 + x31*x5 + x33*x6 + x35*x5 + x36 + y*(x33*y + x34); + basis_lapl_y_eval[ipt + 1*npts] = x37*z; + basis_lapl_y_eval[ipt + 2*npts] = x*x37; + // Evaluate Laplacian gradient of bfn (dz) + basis_lapl_z_eval[ipt + 0*npts] = x39*y; + basis_lapl_z_eval[ipt + 1*npts] = x10*x33 + x16 + x30*x9 + x35*x9 + x36 + z*(x33*z + x38); + basis_lapl_z_eval[ipt + 2*npts] = x*x39; + + + + +#if 0 + // Evaluate the angular part of bfn + + + + double ang_eval_0; + double ang_eval_1; + double ang_eval_2; + + + ang_eval_0 = radial_eval*y; + ang_eval_1 = radial_eval*z; + ang_eval_2 = radial_eval*x; + basis_eval[ipt + 0*npts] = ang_eval_0; + basis_eval[ipt + 1*npts] = ang_eval_1; + basis_eval[ipt + 2*npts] = ang_eval_2; + + + double dang_eval_x_0, dang_eval_y_0, dang_eval_z_0; + double dang_eval_x_1, dang_eval_y_1, dang_eval_z_1; + double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; + + dang_eval_x_0 = x1; + dang_eval_y_0 = radial_eval + radial_eval_alpha*x6; + dang_eval_z_0 = x8; + dang_eval_x_1 = x2; + dang_eval_y_1 = x8; + dang_eval_z_1 = radial_eval + radial_eval_alpha*x10; + dang_eval_x_2 = radial_eval + radial_eval_alpha*x4; + dang_eval_y_2 = x1; + dang_eval_z_2 = x2; + basis_x_eval[ipt + 0*npts] = dang_eval_x_0; + basis_y_eval[ipt + 0*npts] = dang_eval_y_0; + basis_z_eval[ipt + 0*npts] = dang_eval_z_0; + basis_x_eval[ipt + 1*npts] = dang_eval_x_1; + basis_y_eval[ipt + 1*npts] = dang_eval_y_1; + basis_z_eval[ipt + 1*npts] = dang_eval_z_1; + basis_x_eval[ipt + 2*npts] = dang_eval_x_2; + basis_y_eval[ipt + 2*npts] = dang_eval_y_2; + basis_z_eval[ipt + 2*npts] = dang_eval_z_2; + +#endif + } // Loop over points within task + } // Loop over tasks + + } // Loop over shells +} // end kernel + +} // namespace GauXC diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l1_laplacian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l1_laplacian.hpp index 32575b6e..2da0a731 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l1_laplacian.hpp +++ 
b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l1_laplacian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_spherical_laplacian_1( +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_spherical_laplacian_1( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; // Loop over points in task @@ -103,7 +106,31 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = radial_eval_alpha*x; + const auto x1 = x0*y; + const auto x2 = x0*z; + const auto x3 = x*x; + const auto x4 = y*y; + const auto x5 = y*z; + const auto x6 = radial_eval_alpha*x5; + const auto x7 = z*z; + const auto x8 = radial_eval_alpha_squared*x3; + const auto x9 = radial_eval_alpha + x8; + const auto x10 = x9*y; + const auto x11 = x9*z; + const auto x12 = 3.0*radial_eval_alpha; + const auto x13 = radial_eval_alpha_squared*x4; + const auto x14 = radial_eval_alpha + x13; + const auto x15 = x*x14; + const auto x16 = radial_eval_alpha_squared*x*x5; + const auto x17 = radial_eval_alpha_squared*x7; + const auto x18 = radial_eval_alpha + x17; + const auto x19 = x*x18; + const auto x20 = x14*z; + const auto x21 = x18*y; + const auto x22 = 5.0*radial_eval_alpha + x13 + x17 + x8; + // Evaluate basis function basis_eval[ipt + 0*npts] = radial_eval*y; @@ -113,24 +140,26 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = radial_eval_alpha*x*y; - basis_x_eval[ipt + 1*npts] = radial_eval_alpha*x*z; - basis_x_eval[ipt + 2*npts] = radial_eval + radial_eval_alpha*x*x; + basis_x_eval[ipt + 0*npts] = x1; + basis_x_eval[ipt + 1*npts] = x2; + basis_x_eval[ipt + 2*npts] = radial_eval + radial_eval_alpha*x3; // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = radial_eval + radial_eval_alpha*y*y; - basis_y_eval[ipt + 1*npts] = radial_eval_alpha*y*z; - basis_y_eval[ipt + 2*npts] = radial_eval_alpha*x*y; + basis_y_eval[ipt + 0*npts] = radial_eval + radial_eval_alpha*x4; + basis_y_eval[ipt + 1*npts] = x6; + basis_y_eval[ipt + 2*npts] = x1; // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = radial_eval_alpha*y*z; - 
basis_z_eval[ipt + 1*npts] = radial_eval + radial_eval_alpha*z*z; - basis_z_eval[ipt + 2*npts] = radial_eval_alpha*x*z; + basis_z_eval[ipt + 0*npts] = x6; + basis_z_eval[ipt + 1*npts] = radial_eval + radial_eval_alpha*x7; + basis_z_eval[ipt + 2*npts] = x2; + // Evaluate Laplacian of bfn - basis_lapl_eval[ipt + 0*npts] = y*(5*radial_eval_alpha + radial_eval_alpha_squared*x*x + radial_eval_alpha_squared*y*y + radial_eval_alpha_squared*z*z); - basis_lapl_eval[ipt + 1*npts] = z*(5*radial_eval_alpha + radial_eval_alpha_squared*x*x + radial_eval_alpha_squared*y*y + radial_eval_alpha_squared*z*z); - basis_lapl_eval[ipt + 2*npts] = x*(5*radial_eval_alpha + radial_eval_alpha_squared*x*x + radial_eval_alpha_squared*y*y + radial_eval_alpha_squared*z*z); + basis_lapl_eval[ipt + 0*npts] = x22*y; + basis_lapl_eval[ipt + 1*npts] = x22*z; + basis_lapl_eval[ipt + 2*npts] = x*x22; + @@ -157,15 +186,15 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_1, dang_eval_y_1, dang_eval_z_1; double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; - dang_eval_x_0 = radial_eval_alpha*x*y; - dang_eval_y_0 = radial_eval + radial_eval_alpha*y*y; - dang_eval_z_0 = radial_eval_alpha*y*z; - dang_eval_x_1 = radial_eval_alpha*x*z; - dang_eval_y_1 = radial_eval_alpha*y*z; - dang_eval_z_1 = radial_eval + radial_eval_alpha*z*z; - dang_eval_x_2 = radial_eval + radial_eval_alpha*x*x; - dang_eval_y_2 = radial_eval_alpha*x*y; - dang_eval_z_2 = radial_eval_alpha*x*z; + dang_eval_x_0 = x1; + dang_eval_y_0 = radial_eval + radial_eval_alpha*x4; + dang_eval_z_0 = x6; + dang_eval_x_1 = x2; + dang_eval_y_1 = x6; + dang_eval_z_1 = radial_eval + radial_eval_alpha*x7; + dang_eval_x_2 = radial_eval + radial_eval_alpha*x3; + dang_eval_y_2 = x1; + dang_eval_z_2 = x2; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2.hpp index f29e2496..38e16774 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -64,7 +68,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_eval = task->bf + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -93,14 +96,19 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel } - + // Common Subexpressions + const auto x0 = radial_eval*sqrt_3*y; + const auto x1 = 0.5*radial_eval; + const auto x2 = x*x; + const auto x3 = y*y; + // Evaluate basis function - basis_eval[ipt + 0*npts] = sqrt_3*radial_eval*x*y; - basis_eval[ipt + 1*npts] = sqrt_3*radial_eval*y*z; - basis_eval[ipt + 2*npts] = radial_eval*(-x*x - y*y + 2*z*z)/2; - basis_eval[ipt + 3*npts] = sqrt_3*radial_eval*x*z; - basis_eval[ipt + 4*npts] = sqrt_3*radial_eval*(x*x - y*y)/2; + basis_eval[ipt + 0*npts] = x*x0; + basis_eval[ipt + 1*npts] = x0*z; + basis_eval[ipt + 2*npts] = -x1*(x2 + x3 - 2.0*z*z); + basis_eval[ipt + 3*npts] = radial_eval*sqrt_3*x*z; + basis_eval[ipt + 4*npts] = sqrt_3*x1*(x2 - x3); @@ -109,6 +117,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel + + #if 0 // Evaluate the angular part of bfn @@ -120,16 +130,16 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = sqrt_3*radial_eval*x*y; - ang_eval_1 = sqrt_3*radial_eval*y*z; - ang_eval_2 = radial_eval*(-x*x - y*y + 2*z*z)/2; - ang_eval_3 = sqrt_3*radial_eval*x*z; + ang_eval_0 = x*x0; + ang_eval_1 = x0*z; + ang_eval_2 = -x1*(x2 + x3 - 2.0*z*z); + ang_eval_3 = radial_eval*sqrt_3*x*z; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = sqrt_3*radial_eval*(x*x - y*y)/2; + ang_eval_0 = sqrt_3*x1*(x2 - x3); basis_eval[ipt + 4*npts] = ang_eval_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_gradient.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_gradient.hpp index c14931c8..52ddc601 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_gradient.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_gradient.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_spherical_gradient_2( +__global__ __launch_bounds__(256,2) void collocation_device_shell_to_task_kernel_spherical_gradient_2( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[8][detail::shell_nprim_max + 1]; + __shared__ double coeff[8][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -67,7 +71,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -99,37 +102,60 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; - + // Common Subexpressions + const auto x0 = sqrt_3*y; + const auto x1 = radial_eval*x0; + const auto x2 = 0.5*radial_eval; + const auto x3 = x*x; + const auto x4 = y*y; + const auto x5 = z*z; + const auto x6 = -x3 - x4 + 2.0*x5; + const auto x7 = sqrt_3*z; + const auto x8 = x3 - x4; + const auto x9 = radial_eval + radial_eval_alpha*x3; + const auto x10 = radial_eval_alpha*x*x0*z; + const auto x11 = 0.5*x; + const auto x12 = 2.0*radial_eval; + const auto x13 = -x12; + const auto x14 = radial_eval_alpha*x6; + const auto x15 = x13 + x14; + const auto x16 = radial_eval_alpha*x8; + const auto x17 = sqrt_3*x; + const auto x18 = radial_eval + radial_eval_alpha*x4; + const auto x19 = radial_eval + radial_eval_alpha*x5; + // Evaluate basis function - basis_eval[ipt + 0*npts] = sqrt_3*radial_eval*x*y; - basis_eval[ipt + 1*npts] = sqrt_3*radial_eval*y*z; - basis_eval[ipt + 2*npts] = radial_eval*(-x*x - y*y + 2*z*z)/2; - basis_eval[ipt + 3*npts] = sqrt_3*radial_eval*x*z; - basis_eval[ipt + 4*npts] = sqrt_3*radial_eval*(x*x - y*y)/2; + basis_eval[ipt + 0*npts] = x*x1; + basis_eval[ipt + 1*npts] = x1*z; + basis_eval[ipt + 2*npts] = x2*x6; + basis_eval[ipt + 3*npts] = radial_eval*x*x7; + basis_eval[ipt + 4*npts] = sqrt_3*x2*x8; // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = sqrt_3*y*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 1*npts] = sqrt_3*radial_eval_alpha*x*y*z; - basis_x_eval[ipt + 2*npts] = x*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - basis_x_eval[ipt + 3*npts] = sqrt_3*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 4*npts] = sqrt_3*x*(2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; + basis_x_eval[ipt + 0*npts] = x0*x9; + basis_x_eval[ipt + 1*npts] = x10; + basis_x_eval[ipt + 2*npts] = x11*x15; + basis_x_eval[ipt + 3*npts] = x7*x9; + basis_x_eval[ipt + 4*npts] = sqrt_3*x11*(x12 + x16); // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = sqrt_3*x*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 1*npts] = sqrt_3*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 2*npts] = y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - basis_y_eval[ipt + 3*npts] = sqrt_3*radial_eval_alpha*x*y*z; - basis_y_eval[ipt + 4*npts] = sqrt_3*y*(-2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; + basis_y_eval[ipt + 0*npts] = x17*x18; + basis_y_eval[ipt + 1*npts] 
= x18*x7; + basis_y_eval[ipt + 2*npts] = 0.5*x15*y; + basis_y_eval[ipt + 3*npts] = x10; + basis_y_eval[ipt + 4*npts] = 0.5*x0*(x13 + x16); // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = sqrt_3*radial_eval_alpha*x*y*z; - basis_z_eval[ipt + 1*npts] = sqrt_3*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 2*npts] = z*(4*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - basis_z_eval[ipt + 3*npts] = sqrt_3*x*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 4*npts] = sqrt_3*radial_eval_alpha*z*(x*x - y*y)/2; + basis_z_eval[ipt + 0*npts] = x10; + basis_z_eval[ipt + 1*npts] = x0*x19; + basis_z_eval[ipt + 2*npts] = 0.5*z*(4.0*radial_eval + x14); + basis_z_eval[ipt + 3*npts] = x17*x19; + basis_z_eval[ipt + 4*npts] = 0.5*radial_eval_alpha*x7*x8; + + @@ -146,16 +172,16 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = sqrt_3*radial_eval*x*y; - ang_eval_1 = sqrt_3*radial_eval*y*z; - ang_eval_2 = radial_eval*(-x*x - y*y + 2*z*z)/2; - ang_eval_3 = sqrt_3*radial_eval*x*z; + ang_eval_0 = x*x1; + ang_eval_1 = x1*z; + ang_eval_2 = x2*x6; + ang_eval_3 = radial_eval*x*x7; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = sqrt_3*radial_eval*(x*x - y*y)/2; + ang_eval_0 = sqrt_3*x2*x8; basis_eval[ipt + 4*npts] = ang_eval_0; @@ -164,18 +190,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; - dang_eval_x_0 = sqrt_3*y*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = sqrt_3*x*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = sqrt_3*radial_eval_alpha*x*y*z; - dang_eval_x_1 = sqrt_3*radial_eval_alpha*x*y*z; - dang_eval_y_1 = sqrt_3*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_1 = sqrt_3*y*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = x*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - dang_eval_y_2 = y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - dang_eval_z_2 = z*(4*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - dang_eval_x_3 = sqrt_3*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_3 = sqrt_3*radial_eval_alpha*x*y*z; - dang_eval_z_3 = sqrt_3*x*(radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x0*x9; + dang_eval_y_0 = x17*x18; + dang_eval_z_0 = x10; + dang_eval_x_1 = x10; + dang_eval_y_1 = x18*x7; + dang_eval_z_1 = x0*x19; + dang_eval_x_2 = x11*x15; + dang_eval_y_2 = 0.5*x15*y; + dang_eval_z_2 = 0.5*z*(4.0*radial_eval + x14); + dang_eval_x_3 = x7*x9; + dang_eval_y_3 = x10; + dang_eval_z_3 = x17*x19; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; @@ -189,9 +215,9 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = sqrt_3*x*(2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; - dang_eval_y_0 = sqrt_3*y*(-2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; - dang_eval_z_0 = sqrt_3*radial_eval_alpha*z*(x*x - y*y)/2; + dang_eval_x_0 = sqrt_3*x11*(x12 + x16); + dang_eval_y_0 = 0.5*x0*(x13 + x16); + dang_eval_z_0 = 0.5*radial_eval_alpha*x7*x8; basis_x_eval[ipt + 4*npts] = dang_eval_x_0; basis_y_eval[ipt + 4*npts] = dang_eval_y_0; 
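For reference, the five l = 2 angular factors used in these spherical kernels are sqrt(3)*x*y, sqrt(3)*y*z, (2z^2 - x^2 - y^2)/2, sqrt(3)*x*z and sqrt(3)*(x^2 - y^2)/2, in the basis_eval[0..4] ordering above; every gradient and Hessian entry is one of these factors, or its Cartesian derivative, combined with the shared radial sums by the product rule. An illustrative host-side sketch of the angular part only, not taken from the generated kernels:

#include <cmath>

// Angular prefactors of the five real spherical d functions, in the same
// ordering as basis_eval[0..4] (xy, yz, z^2, xz, x^2 - y^2).
void d_shell_angular(double x, double y, double z, double ang[5]) {
  const double sqrt_3 = std::sqrt(3.0);  // plays the role of the kernels' sqrt_3 constant
  ang[0] = sqrt_3 * x * y;
  ang[1] = sqrt_3 * y * z;
  ang[2] = 0.5 * (2.0*z*z - x*x - y*y);
  ang[3] = sqrt_3 * x * z;
  ang[4] = 0.5 * sqrt_3 * (x*x - y*y);
}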
basis_z_eval[ipt + 4*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_hessian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_hessian.hpp index 400ee30e..329138f3 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_hessian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_hessian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_spherical_hessian_2( +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_spherical_hessian_2( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; @@ -108,79 +111,137 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = sqrt_3*y; + const auto x1 = x*x0; + const auto x2 = x0*z; + const auto x3 = 0.5*radial_eval; + const auto x4 = x*x; + const auto x5 = y*y; + const auto x6 = z*z; + const auto x7 = -x4 - x5 + 2.0*x6; + const auto x8 = sqrt_3*z; + const auto x9 = x*x8; + const auto x10 = x4 - x5; + const auto x11 = radial_eval + radial_eval_alpha*x4; + const auto x12 = radial_eval_alpha*x1*z; + const auto x13 = 0.5*x; + const auto x14 = 2.0*radial_eval; + const auto x15 = -x14; + const auto x16 = radial_eval_alpha*x7; + const auto x17 = x15 + x16; + const auto x18 = radial_eval_alpha*x10; + const auto x19 = sqrt_3*x; + const auto x20 = radial_eval_alpha*x5; + const auto x21 = radial_eval + x20; + const auto x22 = 0.5*y; + const auto x23 = radial_eval_alpha*x6; + const auto x24 = radial_eval + x23; + const auto x25 = 0.5*z; + const auto x26 = 4.0*radial_eval; + const auto x27 = 3.0*radial_eval_alpha; + const auto x28 = radial_eval_alpha_squared*x4; + const auto x29 = x27 + x28; + const auto x30 = radial_eval_alpha + x28; + const auto x31 = x2*x30; + const auto x32 = 4.0*radial_eval_alpha; + const auto x33 = x32*x4; + const auto x34 = x14 + x33; + const auto x35 = 0.5*sqrt_3; + const auto x36 = x10*x30; + const 
auto x37 = radial_eval_alpha_squared*x5; + const auto x38 = radial_eval_alpha + x37; + const auto x39 = x38*x9; + const auto x40 = radial_eval_alpha_squared*x7; + const auto x41 = radial_eval_alpha_squared*x6; + const auto x42 = radial_eval_alpha + x41; + const auto x43 = x1*x42; + const auto x44 = 2.0*radial_eval_alpha; + const auto x45 = x40 + x44; + const auto x46 = radial_eval_alpha_squared*x10; + const auto x47 = x27 + x37; + const auto x48 = x32*x5; + const auto x49 = x14 + x48; + const auto x50 = x27 + x41; + const auto x51 = 8.0*radial_eval_alpha*x6 + x42*x7; + const auto x52 = x10*x42; + const auto x53 = 7.0*radial_eval_alpha + x28 + x37 + x41; + const auto x54 = -x48; + // Evaluate basis function - basis_eval[ipt + 0*npts] = sqrt_3*radial_eval*x*y; - basis_eval[ipt + 1*npts] = sqrt_3*radial_eval*y*z; - basis_eval[ipt + 2*npts] = radial_eval*(-x*x - y*y + 2*z*z)/2; - basis_eval[ipt + 3*npts] = sqrt_3*radial_eval*x*z; - basis_eval[ipt + 4*npts] = sqrt_3*radial_eval*(x*x - y*y)/2; + basis_eval[ipt + 0*npts] = radial_eval*x1; + basis_eval[ipt + 1*npts] = radial_eval*x2; + basis_eval[ipt + 2*npts] = x3*x7; + basis_eval[ipt + 3*npts] = radial_eval*x9; + basis_eval[ipt + 4*npts] = sqrt_3*x10*x3; // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = sqrt_3*y*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 1*npts] = sqrt_3*radial_eval_alpha*x*y*z; - basis_x_eval[ipt + 2*npts] = x*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - basis_x_eval[ipt + 3*npts] = sqrt_3*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 4*npts] = sqrt_3*x*(2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; + basis_x_eval[ipt + 0*npts] = x0*x11; + basis_x_eval[ipt + 1*npts] = x12; + basis_x_eval[ipt + 2*npts] = x13*x17; + basis_x_eval[ipt + 3*npts] = x11*x8; + basis_x_eval[ipt + 4*npts] = sqrt_3*x13*(x14 + x18); // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = sqrt_3*x*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 1*npts] = sqrt_3*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 2*npts] = y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - basis_y_eval[ipt + 3*npts] = sqrt_3*radial_eval_alpha*x*y*z; - basis_y_eval[ipt + 4*npts] = sqrt_3*y*(-2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; + basis_y_eval[ipt + 0*npts] = x19*x21; + basis_y_eval[ipt + 1*npts] = x21*x8; + basis_y_eval[ipt + 2*npts] = x17*x22; + basis_y_eval[ipt + 3*npts] = x12; + basis_y_eval[ipt + 4*npts] = 0.5*x0*(x15 + x18); // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = sqrt_3*radial_eval_alpha*x*y*z; - basis_z_eval[ipt + 1*npts] = sqrt_3*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 2*npts] = z*(4*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - basis_z_eval[ipt + 3*npts] = sqrt_3*x*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 4*npts] = sqrt_3*radial_eval_alpha*z*(x*x - y*y)/2; + basis_z_eval[ipt + 0*npts] = x12; + basis_z_eval[ipt + 1*npts] = x0*x24; + basis_z_eval[ipt + 2*npts] = x25*(x16 + x26); + basis_z_eval[ipt + 3*npts] = x19*x24; + basis_z_eval[ipt + 4*npts] = 0.5*radial_eval_alpha*x10*x8; // Evaluate second derivative of bfn wrt xx - basis_xx_eval[ipt + 0*npts] = sqrt_3*x*y*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 1*npts] = sqrt_3*y*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 2*npts] = -radial_eval - 2*radial_eval_alpha*x*x - (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x + y*y 
- 2*z*z)/2; - basis_xx_eval[ipt + 3*npts] = sqrt_3*x*z*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 4*npts] = sqrt_3*(radial_eval + 2*radial_eval_alpha*x*x + (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x - y*y)/2); + basis_xx_eval[ipt + 0*npts] = x1*x29; + basis_xx_eval[ipt + 1*npts] = x31; + basis_xx_eval[ipt + 2*npts] = 0.5*x30*x7 - 0.5*x34; + basis_xx_eval[ipt + 3*npts] = x29*x9; + basis_xx_eval[ipt + 4*npts] = x35*(x34 + x36); // Evaluate second derivative of bfn wrt xy - basis_xy_eval[ipt + 0*npts] = sqrt_3*(radial_eval + radial_eval_alpha*x*x + radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y); - basis_xy_eval[ipt + 1*npts] = sqrt_3*x*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_xy_eval[ipt + 2*npts] = x*y*(-4*radial_eval_alpha - radial_eval_alpha_squared*(x*x + y*y - 2*z*z))/2; - basis_xy_eval[ipt + 3*npts] = sqrt_3*y*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xy_eval[ipt + 4*npts] = sqrt_3*radial_eval_alpha_squared*x*y*(x*x - y*y)/2; + basis_xy_eval[ipt + 0*npts] = sqrt_3*(radial_eval_alpha_squared*x4*x5 + x11 + x20); + basis_xy_eval[ipt + 1*npts] = x39; + basis_xy_eval[ipt + 2*npts] = x13*y*(-x32 + x40); + basis_xy_eval[ipt + 3*npts] = x31; + basis_xy_eval[ipt + 4*npts] = radial_eval_alpha_squared*x0*x10*x13; // Evaluate second derivative of bfn wrt xz - basis_xz_eval[ipt + 0*npts] = sqrt_3*y*z*(radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xz_eval[ipt + 1*npts] = sqrt_3*x*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_xz_eval[ipt + 2*npts] = x*z*(2*radial_eval_alpha - radial_eval_alpha_squared*(x*x + y*y - 2*z*z))/2; - basis_xz_eval[ipt + 3*npts] = sqrt_3*(radial_eval + radial_eval_alpha*x*x + radial_eval_alpha*z*z + radial_eval_alpha_squared*x*x*z*z); - basis_xz_eval[ipt + 4*npts] = sqrt_3*x*z*(2*radial_eval_alpha + radial_eval_alpha_squared*(x*x - y*y))/2; + basis_xz_eval[ipt + 0*npts] = x31; + basis_xz_eval[ipt + 1*npts] = x43; + basis_xz_eval[ipt + 2*npts] = x13*x45*z; + basis_xz_eval[ipt + 3*npts] = sqrt_3*(radial_eval_alpha_squared*x4*x6 + x11 + x23); + basis_xz_eval[ipt + 4*npts] = x13*x8*(x44 + x46); // Evaluate second derivative of bfn wrt yy - basis_yy_eval[ipt + 0*npts] = sqrt_3*x*y*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 1*npts] = sqrt_3*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 2*npts] = -radial_eval - 2*radial_eval_alpha*y*y - (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x + y*y - 2*z*z)/2; - basis_yy_eval[ipt + 3*npts] = sqrt_3*x*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 4*npts] = sqrt_3*(-radial_eval - 2*radial_eval_alpha*y*y + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x - y*y)/2); + basis_yy_eval[ipt + 0*npts] = x1*x47; + basis_yy_eval[ipt + 1*npts] = x2*x47; + basis_yy_eval[ipt + 2*npts] = 0.5*x38*x7 - 0.5*x49; + basis_yy_eval[ipt + 3*npts] = x39; + basis_yy_eval[ipt + 4*npts] = x35*(x10*x38 - x49); // Evaluate second derivative of bfn wrt yz - basis_yz_eval[ipt + 0*npts] = sqrt_3*x*z*(radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yz_eval[ipt + 1*npts] = sqrt_3*(radial_eval + radial_eval_alpha*y*y + radial_eval_alpha*z*z + radial_eval_alpha_squared*y*y*z*z); - basis_yz_eval[ipt + 2*npts] = y*z*(2*radial_eval_alpha - radial_eval_alpha_squared*(x*x + y*y - 2*z*z))/2; - basis_yz_eval[ipt + 3*npts] = sqrt_3*x*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_yz_eval[ipt + 4*npts] = 
sqrt_3*y*z*(-2*radial_eval_alpha + radial_eval_alpha_squared*(x*x - y*y))/2; + basis_yz_eval[ipt + 0*npts] = x39; + basis_yz_eval[ipt + 1*npts] = sqrt_3*(radial_eval_alpha_squared*x5*x6 + x21 + x23); + basis_yz_eval[ipt + 2*npts] = x22*x45*z; + basis_yz_eval[ipt + 3*npts] = x43; + basis_yz_eval[ipt + 4*npts] = x0*x25*(-x44 + x46); // Evaluate second derivative of bfn wrt zz - basis_zz_eval[ipt + 0*npts] = sqrt_3*x*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 1*npts] = sqrt_3*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 2*npts] = 2*radial_eval + 4*radial_eval_alpha*z*z - (radial_eval_alpha + radial_eval_alpha_squared*z*z)*(x*x + y*y - 2*z*z)/2; - basis_zz_eval[ipt + 3*npts] = sqrt_3*x*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 4*npts] = sqrt_3*(radial_eval_alpha + radial_eval_alpha_squared*z*z)*(x*x - y*y)/2; + basis_zz_eval[ipt + 0*npts] = x43; + basis_zz_eval[ipt + 1*npts] = x2*x50; + basis_zz_eval[ipt + 2*npts] = 0.5*x26 + 0.5*x51; + basis_zz_eval[ipt + 3*npts] = x50*x9; + basis_zz_eval[ipt + 4*npts] = x35*x52; + + @@ -196,16 +257,16 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = sqrt_3*radial_eval*x*y; - ang_eval_1 = sqrt_3*radial_eval*y*z; - ang_eval_2 = radial_eval*(-x*x - y*y + 2*z*z)/2; - ang_eval_3 = sqrt_3*radial_eval*x*z; + ang_eval_0 = radial_eval*x1; + ang_eval_1 = radial_eval*x2; + ang_eval_2 = x3*x7; + ang_eval_3 = radial_eval*x9; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = sqrt_3*radial_eval*(x*x - y*y)/2; + ang_eval_0 = sqrt_3*x10*x3; basis_eval[ipt + 4*npts] = ang_eval_0; @@ -214,18 +275,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; - dang_eval_x_0 = sqrt_3*y*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = sqrt_3*x*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = sqrt_3*radial_eval_alpha*x*y*z; - dang_eval_x_1 = sqrt_3*radial_eval_alpha*x*y*z; - dang_eval_y_1 = sqrt_3*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_1 = sqrt_3*y*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = x*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - dang_eval_y_2 = y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - dang_eval_z_2 = z*(4*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - dang_eval_x_3 = sqrt_3*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_3 = sqrt_3*radial_eval_alpha*x*y*z; - dang_eval_z_3 = sqrt_3*x*(radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x0*x11; + dang_eval_y_0 = x19*x21; + dang_eval_z_0 = x12; + dang_eval_x_1 = x12; + dang_eval_y_1 = x21*x8; + dang_eval_z_1 = x0*x24; + dang_eval_x_2 = x13*x17; + dang_eval_y_2 = x17*x22; + dang_eval_z_2 = x25*(x16 + x26); + dang_eval_x_3 = x11*x8; + dang_eval_y_3 = x12; + dang_eval_z_3 = x19*x24; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; @@ -239,9 +300,9 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = sqrt_3*x*(2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; - dang_eval_y_0 = 
sqrt_3*y*(-2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; - dang_eval_z_0 = sqrt_3*radial_eval_alpha*z*(x*x - y*y)/2; + dang_eval_x_0 = sqrt_3*x13*(x14 + x18); + dang_eval_y_0 = 0.5*x0*(x15 + x18); + dang_eval_z_0 = 0.5*radial_eval_alpha*x10*x8; basis_x_eval[ipt + 4*npts] = dang_eval_x_0; basis_y_eval[ipt + 4*npts] = dang_eval_y_0; basis_z_eval[ipt + 4*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_lapgrad.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_lapgrad.hpp new file mode 100644 index 00000000..9a284576 --- /dev/null +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_lapgrad.hpp @@ -0,0 +1,386 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#pragma once +#include "collocation_device_constants.hpp" +#include "device/xc_device_task.hpp" +#include "device_specific/cuda_device_constants.hpp" +#include "device/common/shell_to_task.hpp" +#include + +namespace GauXC { + + +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_spherical_lapgrad_2( + uint32_t nshell, + ShellToTaskDevice* __restrict__ shell_to_task, + XCDeviceTask* __restrict__ device_tasks +) { + + + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; + double* my_alpha = alpha[threadIdx.x/32]; + double* my_coeff = coeff[threadIdx.x/32]; + + for( auto ish = blockIdx.z; ish < nshell; ish += gridDim.z ) { + const uint32_t ntasks = shell_to_task[ish].ntask; + const auto shell = shell_to_task[ish].shell_device; + const auto task_idx = shell_to_task[ish].task_idx_device; + const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device; + + + // Load Shell Data into registers / SM + const uint32_t nprim = shell->nprim(); + const double3 O = *reinterpret_cast<const double3*>(shell->O_data()); + + const int global_warp_id = (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size; + const int nwarp_global = max((blockDim.x*gridDim.x) / cuda::warp_size,1); + + // Read in coeffs/exps into SM on first warp + { + auto* coeff_gm = shell->coeff_data(); + auto* alpha_gm = shell->alpha_data(); + static_assert( detail::shell_nprim_max == cuda::warp_size ); + const int warp_rank = threadIdx.x % cuda::warp_size; + my_alpha[warp_rank] = alpha_gm[warp_rank]; + my_coeff[warp_rank] = coeff_gm[warp_rank]; + } + + // Loop over tasks assigned to shells + // Place each task on a different warp + schedule across blocks + for( int itask = global_warp_id; itask < ntasks; itask += nwarp_global ) { + + const auto* task = device_tasks + task_idx[itask]; + const auto* __restrict__ points_x = task->points_x; + const auto* __restrict__ points_y = task->points_y; + const auto* __restrict__ points_z = task->points_z; + const uint32_t npts = task->npts; + const size_t shoff = task_shell_offs[itask] * npts; + + auto* __restrict__ basis_eval = task->bf + shoff; + auto* __restrict__ basis_x_eval = task->dbfx + shoff; + auto* __restrict__ basis_y_eval = task->dbfy + shoff; + auto* __restrict__ basis_z_eval = task->dbfz + shoff; + auto* __restrict__ 
basis_xx_eval = task->d2bfxx + shoff; + auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; + auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; + auto* __restrict__ basis_yy_eval = task->d2bfyy + shoff; + auto* __restrict__ basis_yz_eval = task->d2bfyz + shoff; + auto* __restrict__ basis_zz_eval = task->d2bfzz + shoff; + auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; + auto* __restrict__ basis_lapl_x_eval = task->d3bflapl_x + shoff; + auto* __restrict__ basis_lapl_y_eval = task->d3bflapl_y + shoff; + auto* __restrict__ basis_lapl_z_eval = task->d3bflapl_z + shoff; + + // Loop over points in task + // Assign each point to separate thread within the warp + #pragma unroll 1 + for( int ipt = threadIdx.x % cuda::warp_size; ipt < npts; ipt += cuda::warp_size ) { + //const double3 point = points[ipt]; + double3 point; + point.x = points_x[ipt]; + point.y = points_y[ipt]; + point.z = points_z[ipt]; + + + const auto x = point.x - O.x; + const auto y = point.y - O.y; + const auto z = point.z - O.z; + const auto rsq = x*x + y*y + z*z; + + // Evaluate radial part of bfn + double radial_eval = 0.; + double radial_eval_alpha = 0.; + double radial_eval_alpha_squared = 0.; + double radial_eval_alpha_cubed = 0.; + + #pragma unroll 1 + for( uint32_t i = 0; i < nprim; ++i ) { + const auto a = my_alpha[i]; + const auto e = my_coeff[i] * std::exp( - a * rsq ); + + radial_eval += e; + radial_eval_alpha += a * e; + radial_eval_alpha_squared += a * a * e; + radial_eval_alpha_cubed += a * a * a * e; + } + + radial_eval_alpha *= -2; + radial_eval_alpha_squared *= 4; + radial_eval_alpha_cubed *= -8; + + // Common Subexpressions + const auto x0 = sqrt_3*y; + const auto x1 = x*x0; + const auto x2 = x0*z; + const auto x3 = 0.5*radial_eval; + const auto x4 = x*x; + const auto x5 = x4; + const auto x6 = y*y; + const auto x7 = x6; + const auto x8 = z*z; + const auto x9 = x8; + const auto x10 = -x5 - x7 + 2.0*x9; + const auto x11 = sqrt_3*z; + const auto x12 = x*x11; + const auto x13 = x5 - x7; + const auto x14 = radial_eval + radial_eval_alpha*x5; + const auto x15 = radial_eval_alpha*x1*z; + const auto x16 = 0.5*x; + const auto x17 = 2.0*radial_eval; + const auto x18 = -x17; + const auto x19 = radial_eval_alpha*x10; + const auto x20 = x18 + x19; + const auto x21 = radial_eval_alpha*x13; + const auto x22 = sqrt_3*x; + const auto x23 = radial_eval_alpha*x7; + const auto x24 = radial_eval + x23; + const auto x25 = 0.5*y; + const auto x26 = radial_eval_alpha*x9; + const auto x27 = radial_eval + x26; + const auto x28 = 0.5*z; + const auto x29 = 4.0*radial_eval; + const auto x30 = 3.0*radial_eval_alpha; + const auto x31 = radial_eval_alpha_squared*x5; + const auto x32 = x30 + x31; + const auto x33 = radial_eval_alpha + x31; + const auto x34 = x2*x33; + const auto x35 = 4.0*radial_eval_alpha; + const auto x36 = x35*x5; + const auto x37 = x17 + x36; + const auto x38 = 0.5*sqrt_3; + const auto x39 = x13*x33; + const auto x40 = radial_eval_alpha_squared*x7; + const auto x41 = radial_eval_alpha + x40; + const auto x42 = x12*x41; + const auto x43 = radial_eval_alpha_squared*x10; + const auto x44 = radial_eval_alpha_squared*x9; + const auto x45 = radial_eval_alpha + x44; + const auto x46 = x1*x45; + const auto x47 = 2.0*radial_eval_alpha; + const auto x48 = x43 + x47; + const auto x49 = radial_eval_alpha_squared*x13; + const auto x50 = x30 + x40; + const auto x51 = x35*x7; + const auto x52 = x17 + x51; + const auto x53 = x30 + x44; + const auto x54 = 8.0*radial_eval_alpha; + const auto x55 = x10*x45 + x54*x9; 
+ const auto x56 = x13*x45; + const auto x57 = x40 + x44; + const auto x58 = 7.0*radial_eval_alpha + x31 + x57; + const auto x59 = -x51; + const auto x60 = radial_eval_alpha_squared*x; + const auto x61 = radial_eval_alpha_cubed*(x*x*x); + const auto x62 = 3.0*x60 + x61; + const auto x63 = radial_eval_alpha_cubed*x7 + radial_eval_alpha_squared; + const auto x64 = radial_eval_alpha_cubed*x9 + radial_eval_alpha_squared; + const auto x65 = 2.0*radial_eval_alpha_squared; + const auto x66 = x*x62 + 3.0*x33 + x35 + x4*x63 + x4*x64 + x5*x65 + x57; + const auto x67 = 4.0*x60*x7; + const auto x68 = 2.0*x; + const auto x69 = 6.0*x*x33 + x*x35 + x41*x68 + x45*x68; + const auto x70 = x13*x63; + const auto x71 = x13*x64; + const auto x72 = radial_eval_alpha_squared*y; + const auto x73 = radial_eval_alpha_cubed*(y*y*y); + const auto x74 = 3.0*x72 + x73; + const auto x75 = radial_eval_alpha_cubed*x5 + radial_eval_alpha_squared; + const auto x76 = x31 + x35; + const auto x77 = 3.0*x41 + x44 + x6*x64 + x6*x75 + x65*x7 + x74*y + x76; + const auto x78 = x35*y; + const auto x79 = 4.0*x5*x72; + const auto x80 = 2.0*y; + const auto x81 = x33*x80; + const auto x82 = 6.0*x41*y; + const auto x83 = x45*x80; + const auto x84 = x13*x75; + const auto x85 = radial_eval_alpha_squared*z; + const auto x86 = radial_eval_alpha_cubed*(z*z*z); + const auto x87 = 3.0*x85 + x86; + const auto x88 = x40 + 3.0*x45 + x63*x8 + x65*x9 + x75*x8 + x76 + x87*z; + const auto x89 = 4.0*z; + const auto x90 = radial_eval_alpha_squared*x89; + const auto x91 = x5*x90; + const auto x92 = -x7*x90; + + + // Evaluate basis function + basis_eval[ipt + 0*npts] = radial_eval*x1; + basis_eval[ipt + 1*npts] = radial_eval*x2; + basis_eval[ipt + 2*npts] = x10*x3; + basis_eval[ipt + 3*npts] = radial_eval*x12; + basis_eval[ipt + 4*npts] = sqrt_3*x13*x3; + + + + // Evaluate first derivative of bfn wrt x + basis_x_eval[ipt + 0*npts] = x0*x14; + basis_x_eval[ipt + 1*npts] = x15; + basis_x_eval[ipt + 2*npts] = x16*x20; + basis_x_eval[ipt + 3*npts] = x11*x14; + basis_x_eval[ipt + 4*npts] = sqrt_3*x16*(x17 + x21); + + // Evaluate first derivative of bfn wrt y + basis_y_eval[ipt + 0*npts] = x22*x24; + basis_y_eval[ipt + 1*npts] = x11*x24; + basis_y_eval[ipt + 2*npts] = x20*x25; + basis_y_eval[ipt + 3*npts] = x15; + basis_y_eval[ipt + 4*npts] = 0.5*x0*(x18 + x21); + + // Evaluate first derivative of bfn wrt z + basis_z_eval[ipt + 0*npts] = x15; + basis_z_eval[ipt + 1*npts] = x0*x27; + basis_z_eval[ipt + 2*npts] = x28*(x19 + x29); + basis_z_eval[ipt + 3*npts] = x22*x27; + basis_z_eval[ipt + 4*npts] = 0.5*radial_eval_alpha*x11*x13; + + // Evaluate second derivative of bfn wrt xx + basis_xx_eval[ipt + 0*npts] = x1*x32; + basis_xx_eval[ipt + 1*npts] = x34; + basis_xx_eval[ipt + 2*npts] = 0.5*x10*x33 - 0.5*x37; + basis_xx_eval[ipt + 3*npts] = x12*x32; + basis_xx_eval[ipt + 4*npts] = x38*(x37 + x39); + + // Evaluate second derivative of bfn wrt xy + basis_xy_eval[ipt + 0*npts] = sqrt_3*(radial_eval_alpha_squared*x5*x7 + x14 + x23); + basis_xy_eval[ipt + 1*npts] = x42; + basis_xy_eval[ipt + 2*npts] = x16*y*(-x35 + x43); + basis_xy_eval[ipt + 3*npts] = x34; + basis_xy_eval[ipt + 4*npts] = radial_eval_alpha_squared*x0*x13*x16; + + // Evaluate second derivative of bfn wrt xz + basis_xz_eval[ipt + 0*npts] = x34; + basis_xz_eval[ipt + 1*npts] = x46; + basis_xz_eval[ipt + 2*npts] = x16*x48*z; + basis_xz_eval[ipt + 3*npts] = sqrt_3*(radial_eval_alpha_squared*x5*x9 + x14 + x26); + basis_xz_eval[ipt + 4*npts] = x11*x16*(x47 + x49); + + // Evaluate second derivative of bfn wrt yy 
+ basis_yy_eval[ipt + 0*npts] = x1*x50; + basis_yy_eval[ipt + 1*npts] = x2*x50; + basis_yy_eval[ipt + 2*npts] = 0.5*x10*x41 - 0.5*x52; + basis_yy_eval[ipt + 3*npts] = x42; + basis_yy_eval[ipt + 4*npts] = x38*(x13*x41 - x52); + + // Evaluate second derivative of bfn wrt yz + basis_yz_eval[ipt + 0*npts] = x42; + basis_yz_eval[ipt + 1*npts] = sqrt_3*(radial_eval_alpha_squared*x7*x9 + x24 + x26); + basis_yz_eval[ipt + 2*npts] = x25*x48*z; + basis_yz_eval[ipt + 3*npts] = x46; + basis_yz_eval[ipt + 4*npts] = x0*x28*(-x47 + x49); + + // Evaluate second derivative of bfn wrt zz + basis_zz_eval[ipt + 0*npts] = x46; + basis_zz_eval[ipt + 1*npts] = x2*x53; + basis_zz_eval[ipt + 2*npts] = 0.5*x29 + 0.5*x55; + basis_zz_eval[ipt + 3*npts] = x12*x53; + basis_zz_eval[ipt + 4*npts] = x38*x56; + + // Evaluate Laplacian of bfn + basis_lapl_eval[ipt + 0*npts] = x1*x58; + basis_lapl_eval[ipt + 1*npts] = x2*x58; + basis_lapl_eval[ipt + 2*npts] = 0.5*x10*x33 + 0.5*x10*x41 - 0.5*x36 + 0.5*x55 + 0.5*x59; + basis_lapl_eval[ipt + 3*npts] = x12*x58; + basis_lapl_eval[ipt + 4*npts] = x38*(x13*x41 + x36 + x39 + x56 + x59); + + // Evaluate Laplacian gradient of bfn (dx) + basis_lapl_x_eval[ipt + 0*npts] = x0*x66; + basis_lapl_x_eval[ipt + 1*npts] = x2*(x*x63 + x*x64 + 7.0*x60 + x61); + basis_lapl_x_eval[ipt + 2*npts] = 4.0*radial_eval_alpha_squared*x*x9 + 0.5*x*x10*x63 + 0.5*x*x10*x64 + 0.5*x10*x62 - 0.5*x67 - 0.5*x69; + basis_lapl_x_eval[ipt + 3*npts] = x11*x66; + basis_lapl_x_eval[ipt + 4*npts] = x38*(x*x70 + x*x71 + x13*x62 - x67 + x69); + // Evaluate Laplacian gradient of bfn (dy) + basis_lapl_y_eval[ipt + 0*npts] = x22*x77; + basis_lapl_y_eval[ipt + 1*npts] = x11*x77; + basis_lapl_y_eval[ipt + 2*npts] = 4.0*radial_eval_alpha_squared*x9*y + 0.5*x10*x64*y + 0.5*x10*x74 + 0.5*x10*x75*y - 0.5*x78 - 0.5*x79 - 0.5*x81 - 0.5*x82 - 0.5*x83; + basis_lapl_y_eval[ipt + 3*npts] = x12*(x64*y + 7.0*x72 + x73 + x75*y); + basis_lapl_y_eval[ipt + 4*npts] = x38*(x13*x74 + x71*y - x78 + x79 - x81 - x82 - x83 + x84*y); + // Evaluate Laplacian gradient of bfn (dz) + basis_lapl_z_eval[ipt + 0*npts] = x1*(x63*z + x75*z + 7.0*x85 + x86); + basis_lapl_z_eval[ipt + 1*npts] = x0*x88; + basis_lapl_z_eval[ipt + 2*npts] = 0.5*x10*x63*z + 0.5*x10*x75*z + 0.5*x10*x87 + 0.5*x33*x89 + 0.5*x41*x89 + 6.0*x45*z + 0.5*x54*z - 0.5*x91 + 0.5*x92; + basis_lapl_z_eval[ipt + 3*npts] = x22*x88; + basis_lapl_z_eval[ipt + 4*npts] = x38*(x13*x87 + x70*z + x84*z + x91 + x92); + + + + +#if 0 + // Evaluate the angular part of bfn + + + + double ang_eval_0; + double ang_eval_1; + double ang_eval_2; + double ang_eval_3; + + + ang_eval_0 = radial_eval*x1; + ang_eval_1 = radial_eval*x2; + ang_eval_2 = x10*x3; + ang_eval_3 = radial_eval*x12; + basis_eval[ipt + 0*npts] = ang_eval_0; + basis_eval[ipt + 1*npts] = ang_eval_1; + basis_eval[ipt + 2*npts] = ang_eval_2; + basis_eval[ipt + 3*npts] = ang_eval_3; + + ang_eval_0 = sqrt_3*x13*x3; + basis_eval[ipt + 4*npts] = ang_eval_0; + + + double dang_eval_x_0, dang_eval_y_0, dang_eval_z_0; + double dang_eval_x_1, dang_eval_y_1, dang_eval_z_1; + double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; + double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; + + dang_eval_x_0 = x0*x14; + dang_eval_y_0 = x22*x24; + dang_eval_z_0 = x15; + dang_eval_x_1 = x15; + dang_eval_y_1 = x11*x24; + dang_eval_z_1 = x0*x27; + dang_eval_x_2 = x16*x20; + dang_eval_y_2 = x20*x25; + dang_eval_z_2 = x28*(x19 + x29); + dang_eval_x_3 = x11*x14; + dang_eval_y_3 = x15; + dang_eval_z_3 = x22*x27; + basis_x_eval[ipt + 0*npts] = dang_eval_x_0; + basis_y_eval[ipt + 
0*npts] = dang_eval_y_0; + basis_z_eval[ipt + 0*npts] = dang_eval_z_0; + basis_x_eval[ipt + 1*npts] = dang_eval_x_1; + basis_y_eval[ipt + 1*npts] = dang_eval_y_1; + basis_z_eval[ipt + 1*npts] = dang_eval_z_1; + basis_x_eval[ipt + 2*npts] = dang_eval_x_2; + basis_y_eval[ipt + 2*npts] = dang_eval_y_2; + basis_z_eval[ipt + 2*npts] = dang_eval_z_2; + basis_x_eval[ipt + 3*npts] = dang_eval_x_3; + basis_y_eval[ipt + 3*npts] = dang_eval_y_3; + basis_z_eval[ipt + 3*npts] = dang_eval_z_3; + + dang_eval_x_0 = sqrt_3*x16*(x17 + x21); + dang_eval_y_0 = 0.5*x0*(x18 + x21); + dang_eval_z_0 = 0.5*radial_eval_alpha*x11*x13; + basis_x_eval[ipt + 4*npts] = dang_eval_x_0; + basis_y_eval[ipt + 4*npts] = dang_eval_y_0; + basis_z_eval[ipt + 4*npts] = dang_eval_z_0; + +#endif + } // Loop over points within task + } // Loop over tasks + + } // Loop over shells +} // end kernel + +} // namespace GauXC diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_laplacian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_laplacian.hpp index a3c3358c..7c731972 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_laplacian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l2_laplacian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_spherical_laplacian_2( +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_spherical_laplacian_2( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; // Loop over points in task @@ -103,44 +106,102 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = sqrt_3*y; + const auto x1 = x*x0; + const auto x2 = x0*z; + const auto x3 = 0.5*radial_eval; + const auto x4 = x*x; + const auto x5 = y*y; + const auto x6 = z*z; + const auto x7 = -x4 - x5 + 2.0*x6; + const auto x8 = sqrt_3*z; + const auto x9 = x*x8; + const auto x10 = x4 - x5; + const auto x11 = radial_eval + radial_eval_alpha*x4; + const auto x12 = radial_eval_alpha*x1*z; + const auto x13 = 0.5*x; + const auto x14 = 2.0*radial_eval; + const auto x15 = -x14; + const auto x16 = radial_eval_alpha*x7; + const auto x17 = x15 + x16; + const auto x18 = radial_eval_alpha*x10; + const auto x19 = sqrt_3*x; + const auto x20 = radial_eval_alpha*x5; + const auto x21 = radial_eval + x20; + const auto x22 = 0.5*y; + const auto x23 = radial_eval_alpha*x6; + const auto x24 = radial_eval + x23; + const auto x25 = 0.5*z; + const auto x26 = 4.0*radial_eval; + const auto x27 = 3.0*radial_eval_alpha; + const auto x28 = radial_eval_alpha_squared*x4; + const auto x29 = x27 + x28; + const auto x30 = radial_eval_alpha + x28; + const auto x31 = x2*x30; + const auto x32 = 4.0*radial_eval_alpha; + const auto x33 = x32*x4; + const auto x34 = x14 + x33; + const auto x35 = 0.5*sqrt_3; + const auto x36 = x10*x30; + const auto x37 = radial_eval_alpha_squared*x5; + const auto x38 = radial_eval_alpha + x37; + const auto x39 = x38*x9; + const auto x40 = radial_eval_alpha_squared*x7; + const auto x41 = radial_eval_alpha_squared*x6; + const auto x42 = radial_eval_alpha + x41; + const auto x43 = x1*x42; + const auto x44 = 2.0*radial_eval_alpha; + const auto x45 = x40 + x44; + const auto x46 = radial_eval_alpha_squared*x10; + const auto x47 = x27 + x37; + const auto x48 = x32*x5; + const auto x49 = x14 + x48; + const auto x50 = x27 + x41; + const auto x51 = 8.0*radial_eval_alpha*x6 + x42*x7; + const auto x52 = x10*x42; + const auto x53 = 7.0*radial_eval_alpha + x28 + x37 + x41; + const auto x54 = -x48; + // Evaluate basis function - basis_eval[ipt + 0*npts] = sqrt_3*radial_eval*x*y; - basis_eval[ipt + 1*npts] = sqrt_3*radial_eval*y*z; - basis_eval[ipt + 2*npts] = radial_eval*(-x*x - y*y + 2*z*z)/2; - basis_eval[ipt + 3*npts] = sqrt_3*radial_eval*x*z; - basis_eval[ipt + 4*npts] = sqrt_3*radial_eval*(x*x - y*y)/2; + basis_eval[ipt + 0*npts] = radial_eval*x1; + basis_eval[ipt + 
1*npts] = radial_eval*x2; + basis_eval[ipt + 2*npts] = x3*x7; + basis_eval[ipt + 3*npts] = radial_eval*x9; + basis_eval[ipt + 4*npts] = sqrt_3*x10*x3; // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = sqrt_3*y*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 1*npts] = sqrt_3*radial_eval_alpha*x*y*z; - basis_x_eval[ipt + 2*npts] = x*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - basis_x_eval[ipt + 3*npts] = sqrt_3*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 4*npts] = sqrt_3*x*(2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; + basis_x_eval[ipt + 0*npts] = x0*x11; + basis_x_eval[ipt + 1*npts] = x12; + basis_x_eval[ipt + 2*npts] = x13*x17; + basis_x_eval[ipt + 3*npts] = x11*x8; + basis_x_eval[ipt + 4*npts] = sqrt_3*x13*(x14 + x18); // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = sqrt_3*x*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 1*npts] = sqrt_3*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 2*npts] = y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - basis_y_eval[ipt + 3*npts] = sqrt_3*radial_eval_alpha*x*y*z; - basis_y_eval[ipt + 4*npts] = sqrt_3*y*(-2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; + basis_y_eval[ipt + 0*npts] = x19*x21; + basis_y_eval[ipt + 1*npts] = x21*x8; + basis_y_eval[ipt + 2*npts] = x17*x22; + basis_y_eval[ipt + 3*npts] = x12; + basis_y_eval[ipt + 4*npts] = 0.5*x0*(x15 + x18); // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = sqrt_3*radial_eval_alpha*x*y*z; - basis_z_eval[ipt + 1*npts] = sqrt_3*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 2*npts] = z*(4*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - basis_z_eval[ipt + 3*npts] = sqrt_3*x*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 4*npts] = sqrt_3*radial_eval_alpha*z*(x*x - y*y)/2; + basis_z_eval[ipt + 0*npts] = x12; + basis_z_eval[ipt + 1*npts] = x0*x24; + basis_z_eval[ipt + 2*npts] = x25*(x16 + x26); + basis_z_eval[ipt + 3*npts] = x19*x24; + basis_z_eval[ipt + 4*npts] = 0.5*radial_eval_alpha*x10*x8; + // Evaluate Laplacian of bfn - basis_lapl_eval[ipt + 0*npts] = sqrt_3*x*y*(7*radial_eval_alpha + radial_eval_alpha_squared*x*x + radial_eval_alpha_squared*y*y + radial_eval_alpha_squared*z*z); - basis_lapl_eval[ipt + 1*npts] = sqrt_3*y*z*(7*radial_eval_alpha + radial_eval_alpha_squared*x*x + radial_eval_alpha_squared*y*y + radial_eval_alpha_squared*z*z); - basis_lapl_eval[ipt + 2*npts] = -7*radial_eval_alpha*x*x/2 - 7*radial_eval_alpha*y*y/2 + 7*radial_eval_alpha*z*z - radial_eval_alpha_squared*x*x*x*x/2 - radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*x*x*z*z/2 - radial_eval_alpha_squared*y*y*y*y/2 + radial_eval_alpha_squared*y*y*z*z/2 + radial_eval_alpha_squared*z*z*z*z; - basis_lapl_eval[ipt + 3*npts] = sqrt_3*x*z*(7*radial_eval_alpha + radial_eval_alpha_squared*x*x + radial_eval_alpha_squared*y*y + radial_eval_alpha_squared*z*z); - basis_lapl_eval[ipt + 4*npts] = sqrt_3*(7*radial_eval_alpha*x*x - 7*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*x*x + radial_eval_alpha_squared*x*x*z*z - radial_eval_alpha_squared*y*y*y*y - radial_eval_alpha_squared*y*y*z*z)/2; + basis_lapl_eval[ipt + 0*npts] = x1*x53; + basis_lapl_eval[ipt + 1*npts] = x2*x53; + basis_lapl_eval[ipt + 2*npts] = 0.5*x30*x7 - 0.5*x33 + 0.5*x38*x7 + 0.5*x51 + 0.5*x54; + basis_lapl_eval[ipt + 3*npts] = x53*x9; + basis_lapl_eval[ipt + 4*npts] = x35*(x10*x38 + x33 + x36 + x52 + x54); + @@ -156,16 +217,16 @@ __global__ 
__launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = sqrt_3*radial_eval*x*y; - ang_eval_1 = sqrt_3*radial_eval*y*z; - ang_eval_2 = radial_eval*(-x*x - y*y + 2*z*z)/2; - ang_eval_3 = sqrt_3*radial_eval*x*z; + ang_eval_0 = radial_eval*x1; + ang_eval_1 = radial_eval*x2; + ang_eval_2 = x3*x7; + ang_eval_3 = radial_eval*x9; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = sqrt_3*radial_eval*(x*x - y*y)/2; + ang_eval_0 = sqrt_3*x10*x3; basis_eval[ipt + 4*npts] = ang_eval_0; @@ -174,18 +235,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; - dang_eval_x_0 = sqrt_3*y*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_0 = sqrt_3*x*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_0 = sqrt_3*radial_eval_alpha*x*y*z; - dang_eval_x_1 = sqrt_3*radial_eval_alpha*x*y*z; - dang_eval_y_1 = sqrt_3*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_1 = sqrt_3*y*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = x*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - dang_eval_y_2 = y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - dang_eval_z_2 = z*(4*radial_eval - radial_eval_alpha*(x*x + y*y - 2*z*z))/2; - dang_eval_x_3 = sqrt_3*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_3 = sqrt_3*radial_eval_alpha*x*y*z; - dang_eval_z_3 = sqrt_3*x*(radial_eval + radial_eval_alpha*z*z); + dang_eval_x_0 = x0*x11; + dang_eval_y_0 = x19*x21; + dang_eval_z_0 = x12; + dang_eval_x_1 = x12; + dang_eval_y_1 = x21*x8; + dang_eval_z_1 = x0*x24; + dang_eval_x_2 = x13*x17; + dang_eval_y_2 = x17*x22; + dang_eval_z_2 = x25*(x16 + x26); + dang_eval_x_3 = x11*x8; + dang_eval_y_3 = x12; + dang_eval_z_3 = x19*x24; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; @@ -199,9 +260,9 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = sqrt_3*x*(2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; - dang_eval_y_0 = sqrt_3*y*(-2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; - dang_eval_z_0 = sqrt_3*radial_eval_alpha*z*(x*x - y*y)/2; + dang_eval_x_0 = sqrt_3*x13*(x14 + x18); + dang_eval_y_0 = 0.5*x0*(x15 + x18); + dang_eval_z_0 = 0.5*radial_eval_alpha*x10*x8; basis_x_eval[ipt + 4*npts] = dang_eval_x_0; basis_y_eval[ipt + 4*npts] = dang_eval_y_0; basis_z_eval[ipt + 4*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3.hpp index c5e586b4..8c189f2a 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. 
+ * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -64,7 +68,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_eval = task->bf + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -93,16 +96,29 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel } - + // Common Subexpressions + const auto x0 = 0.25*radial_eval; + const auto x1 = x0*y; + const auto x2 = x*x; + const auto x3 = 3.0*x2; + const auto x4 = y*y; + const auto x5 = -x4; + const auto x6 = radial_eval*z; + const auto x7 = z*z; + const auto x8 = -x2 - x4 + 4.0*x7; + const auto x9 = 0.5*x6; + const auto x10 = 3.0*x4; + const auto x11 = x*x0; + // Evaluate basis function - basis_eval[ipt + 0*npts] = sqrt_10*radial_eval*y*(3*x*x - y*y)/4; - basis_eval[ipt + 1*npts] = sqrt_15*radial_eval*x*y*z; - basis_eval[ipt + 2*npts] = sqrt_6*radial_eval*y*(-x*x - y*y + 4*z*z)/4; - basis_eval[ipt + 3*npts] = radial_eval*z*(-3*x*x - 3*y*y + 2*z*z)/2; - basis_eval[ipt + 4*npts] = sqrt_6*radial_eval*x*(-x*x - y*y + 4*z*z)/4; - basis_eval[ipt + 5*npts] = sqrt_15*radial_eval*z*(x*x - y*y)/2; - basis_eval[ipt + 6*npts] = sqrt_10*radial_eval*x*(x*x - 3*y*y)/4; + basis_eval[ipt + 0*npts] = sqrt_10*x1*(x3 + x5); + basis_eval[ipt + 1*npts] = sqrt_15*x*x6*y; + basis_eval[ipt + 2*npts] = sqrt_6*x1*x8; + basis_eval[ipt + 3*npts] = -x9*(x10 + x3 - 2.0*x7); + basis_eval[ipt + 4*npts] = sqrt_6*x11*x8; + basis_eval[ipt + 5*npts] = sqrt_15*x9*(x2 + x5); + basis_eval[ipt + 6*npts] = sqrt_10*x11*(-x10 + x2); @@ -111,6 +127,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel + + #if 0 // Evaluate the angular part of bfn @@ -122,18 +140,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = sqrt_10*radial_eval*y*(3*x*x - y*y)/4; - ang_eval_1 = sqrt_15*radial_eval*x*y*z; - ang_eval_2 = sqrt_6*radial_eval*y*(-x*x - y*y + 4*z*z)/4; - ang_eval_3 = radial_eval*z*(-3*x*x - 3*y*y + 2*z*z)/2; + ang_eval_0 = sqrt_10*x1*(x3 + x5); + ang_eval_1 = sqrt_15*x*x6*y; + ang_eval_2 = sqrt_6*x1*x8; + ang_eval_3 = -x9*(x10 + x3 - 2.0*x7); basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = sqrt_6*radial_eval*x*(-x*x - y*y + 4*z*z)/4; - ang_eval_1 = sqrt_15*radial_eval*z*(x*x - y*y)/2; - ang_eval_2 = sqrt_10*radial_eval*x*(x*x - 3*y*y)/4; + ang_eval_0 = sqrt_6*x11*x8; + ang_eval_1 = sqrt_15*x9*(x2 + x5); + ang_eval_2 = sqrt_10*x11*(-x10 + x2); basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; basis_eval[ipt + 6*npts] = ang_eval_2; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_gradient.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_gradient.hpp index fe03a72b..bfc1379f 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_gradient.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_gradient.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of 
California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_spherical_gradient_3( +__global__ __launch_bounds__(256,2) void collocation_device_shell_to_task_kernel_spherical_gradient_3( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[8][detail::shell_nprim_max + 1]; + __shared__ double coeff[8][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -67,7 +71,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -99,45 +102,86 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; - + // Common Subexpressions + const auto x0 = 0.25*sqrt_10; + const auto x1 = radial_eval*y; + const auto x2 = x*x; + const auto x3 = 3.0*x2; + const auto x4 = y*y; + const auto x5 = -x4; + const auto x6 = x3 + x5; + const auto x7 = sqrt_15*z; + const auto x8 = x7*y; + const auto x9 = radial_eval*x; + const auto x10 = 0.25*sqrt_6; + const auto x11 = z*z; + const auto x12 = -4.0*x11; + const auto x13 = x12 + x4; + const auto x14 = -x13 - x2; + const auto x15 = 0.5*z; + const auto x16 = 3.0*x4; + const auto x17 = -2.0*x11; + const auto x18 = -x16 - x17 - x3; + const auto x19 = 0.5*sqrt_15; + const auto x20 = x19*z; + const auto x21 = x2 + x5; + const auto x22 = -x16; + const auto x23 = x2 + x22; + const auto x24 = x*y; + const auto x25 = x0*x24; + const auto x26 = 6.0*radial_eval; + const auto x27 = 2.0*radial_eval; + const auto x28 = -x27; + const auto x29 = radial_eval_alpha*x14; + const auto x30 = x10*x24*(x28 + x29); + const auto x31 = -x26; + const auto x32 = radial_eval_alpha*x18 + x31; + const auto x33 = radial_eval_alpha*x21; + const auto x34 = radial_eval*(x22 + x3); + const auto x35 = radial_eval_alpha*x0*z; + const auto x36 = x10*z; + const auto x37 = 8.0*radial_eval + x29; + // Evaluate basis function - basis_eval[ipt + 0*npts] = sqrt_10*radial_eval*y*(3*x*x - y*y)/4; - basis_eval[ipt + 1*npts] = sqrt_15*radial_eval*x*y*z; - basis_eval[ipt + 2*npts] = sqrt_6*radial_eval*y*(-x*x - y*y + 4*z*z)/4; - basis_eval[ipt + 3*npts] = radial_eval*z*(-3*x*x - 3*y*y + 2*z*z)/2; - basis_eval[ipt + 4*npts] = sqrt_6*radial_eval*x*(-x*x - y*y + 4*z*z)/4; - basis_eval[ipt + 5*npts] = sqrt_15*radial_eval*z*(x*x - y*y)/2; - basis_eval[ipt + 6*npts] = sqrt_10*radial_eval*x*(x*x - 3*y*y)/4; + basis_eval[ipt + 0*npts] = x0*x1*x6; + basis_eval[ipt + 1*npts] = x8*x9; + basis_eval[ipt + 2*npts] = x1*x10*x14; + basis_eval[ipt + 3*npts] = radial_eval*x15*x18; + basis_eval[ipt + 4*npts] = x10*x14*x9; + basis_eval[ipt + 5*npts] = radial_eval*x20*x21; + basis_eval[ipt + 6*npts] = x0*x23*x9; // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = sqrt_10*x*y*(6*radial_eval + 
radial_eval_alpha*(3*x*x - y*y))/4; - basis_x_eval[ipt + 1*npts] = sqrt_15*y*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 2*npts] = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - basis_x_eval[ipt + 3*npts] = x*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; - basis_x_eval[ipt + 4*npts] = sqrt_6*(-radial_eval*(3*x*x + y*y - 4*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 4*z*z))/4; - basis_x_eval[ipt + 5*npts] = sqrt_15*x*z*(2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; - basis_x_eval[ipt + 6*npts] = sqrt_10*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y))/4; + basis_x_eval[ipt + 0*npts] = x25*(radial_eval_alpha*x6 + x26); + basis_x_eval[ipt + 1*npts] = x8*(radial_eval + radial_eval_alpha*x2); + basis_x_eval[ipt + 2*npts] = x30; + basis_x_eval[ipt + 3*npts] = x*x15*x32; + basis_x_eval[ipt + 4*npts] = -x10*(radial_eval*(x13 + x3) - radial_eval_alpha*x14*x2); + basis_x_eval[ipt + 5*npts] = x*x20*(x27 + x33); + basis_x_eval[ipt + 6*npts] = x0*(radial_eval_alpha*x2*x23 + x34); // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = sqrt_10*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y))/4; - basis_y_eval[ipt + 1*npts] = sqrt_15*x*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 2*npts] = sqrt_6*(-radial_eval*(x*x + 3*y*y - 4*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 4*z*z))/4; - basis_y_eval[ipt + 3*npts] = y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; - basis_y_eval[ipt + 4*npts] = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - basis_y_eval[ipt + 5*npts] = sqrt_15*y*z*(-2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; - basis_y_eval[ipt + 6*npts] = sqrt_10*x*y*(-6*radial_eval + radial_eval_alpha*(x*x - 3*y*y))/4; + basis_y_eval[ipt + 0*npts] = x0*(radial_eval_alpha*x4*x6 + x34); + basis_y_eval[ipt + 1*npts] = x*x7*(radial_eval + radial_eval_alpha*x4); + basis_y_eval[ipt + 2*npts] = -x10*(radial_eval*(x12 + x16 + x2) - radial_eval_alpha*x14*x4); + basis_y_eval[ipt + 3*npts] = x15*x32*y; + basis_y_eval[ipt + 4*npts] = x30; + basis_y_eval[ipt + 5*npts] = x20*y*(x28 + x33); + basis_y_eval[ipt + 6*npts] = x25*(radial_eval_alpha*x23 + x31); // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = sqrt_10*radial_eval_alpha*y*z*(3*x*x - y*y)/4; - basis_z_eval[ipt + 1*npts] = sqrt_15*x*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 2*npts] = sqrt_6*y*z*(8*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - basis_z_eval[ipt + 3*npts] = -3*radial_eval*(x*x + y*y - 2*z*z)/2 - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 2*z*z)/2; - basis_z_eval[ipt + 4*npts] = sqrt_6*x*z*(8*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - basis_z_eval[ipt + 5*npts] = sqrt_15*(radial_eval + radial_eval_alpha*z*z)*(x*x - y*y)/2; - basis_z_eval[ipt + 6*npts] = sqrt_10*radial_eval_alpha*x*z*(x*x - 3*y*y)/4; + basis_z_eval[ipt + 0*npts] = x35*x6*y; + basis_z_eval[ipt + 1*npts] = sqrt_15*x24*(radial_eval + radial_eval_alpha*x11); + basis_z_eval[ipt + 2*npts] = x36*x37*y; + basis_z_eval[ipt + 3*npts] = -1.5*radial_eval*(x17 + x2 + x4) + 0.5*radial_eval_alpha*x11*x18; + basis_z_eval[ipt + 4*npts] = x*x36*x37; + basis_z_eval[ipt + 5*npts] = x19*x21*(radial_eval + radial_eval_alpha*x11); + basis_z_eval[ipt + 6*npts] = x*x23*x35; + + @@ -154,18 +198,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = sqrt_10*radial_eval*y*(3*x*x - y*y)/4; - ang_eval_1 
= sqrt_15*radial_eval*x*y*z; - ang_eval_2 = sqrt_6*radial_eval*y*(-x*x - y*y + 4*z*z)/4; - ang_eval_3 = radial_eval*z*(-3*x*x - 3*y*y + 2*z*z)/2; + ang_eval_0 = x0*x1*x6; + ang_eval_1 = x8*x9; + ang_eval_2 = x1*x10*x14; + ang_eval_3 = radial_eval*x15*x18; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = sqrt_6*radial_eval*x*(-x*x - y*y + 4*z*z)/4; - ang_eval_1 = sqrt_15*radial_eval*z*(x*x - y*y)/2; - ang_eval_2 = sqrt_10*radial_eval*x*(x*x - 3*y*y)/4; + ang_eval_0 = x10*x14*x9; + ang_eval_1 = radial_eval*x20*x21; + ang_eval_2 = x0*x23*x9; basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; basis_eval[ipt + 6*npts] = ang_eval_2; @@ -176,18 +220,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; - dang_eval_x_0 = sqrt_10*x*y*(6*radial_eval + radial_eval_alpha*(3*x*x - y*y))/4; - dang_eval_y_0 = sqrt_10*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y))/4; - dang_eval_z_0 = sqrt_10*radial_eval_alpha*y*z*(3*x*x - y*y)/4; - dang_eval_x_1 = sqrt_15*y*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = sqrt_15*x*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_1 = sqrt_15*x*y*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - dang_eval_y_2 = sqrt_6*(-radial_eval*(x*x + 3*y*y - 4*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 4*z*z))/4; - dang_eval_z_2 = sqrt_6*y*z*(8*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - dang_eval_x_3 = x*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; - dang_eval_y_3 = y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; - dang_eval_z_3 = -3*radial_eval*(x*x + y*y - 2*z*z)/2 - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 2*z*z)/2; + dang_eval_x_0 = x25*(radial_eval_alpha*x6 + x26); + dang_eval_y_0 = x0*(radial_eval_alpha*x4*x6 + x34); + dang_eval_z_0 = x35*x6*y; + dang_eval_x_1 = x8*(radial_eval + radial_eval_alpha*x2); + dang_eval_y_1 = x*x7*(radial_eval + radial_eval_alpha*x4); + dang_eval_z_1 = sqrt_15*x24*(radial_eval + radial_eval_alpha*x11); + dang_eval_x_2 = x30; + dang_eval_y_2 = -x10*(radial_eval*(x12 + x16 + x2) - radial_eval_alpha*x14*x4); + dang_eval_z_2 = x36*x37*y; + dang_eval_x_3 = x*x15*x32; + dang_eval_y_3 = x15*x32*y; + dang_eval_z_3 = -1.5*radial_eval*(x17 + x2 + x4) + 0.5*radial_eval_alpha*x11*x18; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; @@ -201,15 +245,15 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = sqrt_6*(-radial_eval*(3*x*x + y*y - 4*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 4*z*z))/4; - dang_eval_y_0 = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - dang_eval_z_0 = sqrt_6*x*z*(8*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - dang_eval_x_1 = sqrt_15*x*z*(2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; - dang_eval_y_1 = sqrt_15*y*z*(-2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; - dang_eval_z_1 = sqrt_15*(radial_eval + radial_eval_alpha*z*z)*(x*x - y*y)/2; - dang_eval_x_2 = sqrt_10*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y))/4; - 
dang_eval_y_2 = sqrt_10*x*y*(-6*radial_eval + radial_eval_alpha*(x*x - 3*y*y))/4; - dang_eval_z_2 = sqrt_10*radial_eval_alpha*x*z*(x*x - 3*y*y)/4; + dang_eval_x_0 = -x10*(radial_eval*(x13 + x3) - radial_eval_alpha*x14*x2); + dang_eval_y_0 = x30; + dang_eval_z_0 = x*x36*x37; + dang_eval_x_1 = x*x20*(x27 + x33); + dang_eval_y_1 = x20*y*(x28 + x33); + dang_eval_z_1 = x19*x21*(radial_eval + radial_eval_alpha*x11); + dang_eval_x_2 = x0*(radial_eval_alpha*x2*x23 + x34); + dang_eval_y_2 = x25*(radial_eval_alpha*x23 + x31); + dang_eval_z_2 = x*x23*x35; basis_x_eval[ipt + 4*npts] = dang_eval_x_0; basis_y_eval[ipt + 4*npts] = dang_eval_y_0; basis_z_eval[ipt + 4*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_hessian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_hessian.hpp index 04ba8677..b85b6cb4 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_hessian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_hessian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_spherical_hessian_3( +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_spherical_hessian_3( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; @@ -108,99 +111,211 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = 0.25*sqrt_10; + const auto x1 = x0*y; + const auto x2 = x*x; + const auto x3 = 3.0*x2; + const auto x4 = y*y; + const auto x5 = -x4; + const auto x6 = x3 + x5; + const auto x7 = sqrt_15*z; + const auto x8 = x7*y; + const auto x9 = x*x8; + const auto x10 = 0.25*sqrt_6; + const auto x11 = x10*y; + const auto x12 = z*z; + const auto x13 = -4.0*x12; + const auto x14 = x13 + x4; + const auto x15 = -x14 - x2; + const auto x16 = 0.5*z; + const auto x17 = 3.0*x4; + const auto x18 = -2.0*x12; + const auto x19 = -x17 - x18 - x3; + const auto x20 = x*x10; + const auto x21 = 
0.5*sqrt_15; + const auto x22 = x21*z; + const auto x23 = x2 + x5; + const auto x24 = x*x0; + const auto x25 = -x17; + const auto x26 = x2 + x25; + const auto x27 = x*x1; + const auto x28 = 6.0*radial_eval; + const auto x29 = radial_eval + radial_eval_alpha*x2; + const auto x30 = x*x11; + const auto x31 = 2.0*radial_eval; + const auto x32 = -x31; + const auto x33 = radial_eval_alpha*x15; + const auto x34 = x30*(x32 + x33); + const auto x35 = x*x16; + const auto x36 = -x28; + const auto x37 = radial_eval_alpha*x19 + x36; + const auto x38 = -x14 - x3; + const auto x39 = x15*x2; + const auto x40 = x*x22; + const auto x41 = radial_eval_alpha*x23; + const auto x42 = x31 + x41; + const auto x43 = x25 + x3; + const auto x44 = radial_eval*x43; + const auto x45 = x2*x26; + const auto x46 = x4*x6; + const auto x47 = radial_eval_alpha*x4; + const auto x48 = radial_eval + x47; + const auto x49 = -x13 - x17 - x2; + const auto x50 = x15*x4; + const auto x51 = x32 + x41; + const auto x52 = radial_eval_alpha*z; + const auto x53 = sqrt_15*y; + const auto x54 = radial_eval_alpha*x12; + const auto x55 = 8.0*radial_eval; + const auto x56 = x33 + x55; + const auto x57 = -x18 - x2 - x4; + const auto x58 = x12*x19; + const auto x59 = x12*x23; + const auto x60 = radial_eval_alpha_squared*x2; + const auto x61 = radial_eval_alpha + x60; + const auto x62 = x6*x61; + const auto x63 = 12.0*radial_eval_alpha; + const auto x64 = x2*x63; + const auto x65 = x28 + x64; + const auto x66 = 3.0*radial_eval_alpha; + const auto x67 = 4.0*radial_eval_alpha; + const auto x68 = x2*x67; + const auto x69 = x31 + x68; + const auto x70 = x15*x61; + const auto x71 = 2.0*radial_eval_alpha; + const auto x72 = x38*x71 + x70; + const auto x73 = x23*x61; + const auto x74 = x43*x71; + const auto x75 = x26*x61 + x74; + const auto x76 = 6.0*radial_eval_alpha; + const auto x77 = radial_eval_alpha*x43; + const auto x78 = radial_eval_alpha_squared*x46 + x77; + const auto x79 = radial_eval_alpha*x49 + radial_eval_alpha_squared*x50; + const auto x80 = radial_eval_alpha*x38 + radial_eval_alpha_squared*x39; + const auto x81 = radial_eval_alpha_squared*x45 + x77; + const auto x82 = x27*z; + const auto x83 = x30*z*(radial_eval_alpha_squared*x15 + x76); + const auto x84 = radial_eval_alpha_squared*x58 - x12*x76 + x36 + x57*x66; + const auto x85 = x10*z; + const auto x86 = 8.0*radial_eval_alpha; + const auto x87 = x12*x71; + const auto x88 = radial_eval_alpha_squared*x59; + const auto x89 = x0*z; + const auto x90 = radial_eval_alpha_squared*x4; + const auto x91 = radial_eval_alpha + x90; + const auto x92 = x6*x91 + x74; + const auto x93 = x15*x91; + const auto x94 = x49*x71 + x93; + const auto x95 = x4*x63; + const auto x96 = x28 + x95; + const auto x97 = x4*x67; + const auto x98 = x31 + x97; + const auto x99 = radial_eval_alpha_squared*x12; + const auto x100 = radial_eval_alpha + x99; + const auto x101 = x100*x6; + const auto x102 = 16.0*radial_eval_alpha*x12 + x100*x15; + const auto x103 = x102 + x55; + const auto x104 = x100*x19 + x57*x76; + const auto x105 = x23*(x100 + x71); + const auto x106 = x100*x26; + const auto x107 = -x95; + const auto x108 = -x97; + // Evaluate basis function - basis_eval[ipt + 0*npts] = sqrt_10*radial_eval*y*(3*x*x - y*y)/4; - basis_eval[ipt + 1*npts] = sqrt_15*radial_eval*x*y*z; - basis_eval[ipt + 2*npts] = sqrt_6*radial_eval*y*(-x*x - y*y + 4*z*z)/4; - basis_eval[ipt + 3*npts] = radial_eval*z*(-3*x*x - 3*y*y + 2*z*z)/2; - basis_eval[ipt + 4*npts] = sqrt_6*radial_eval*x*(-x*x - y*y + 4*z*z)/4; - basis_eval[ipt + 5*npts] = 
sqrt_15*radial_eval*z*(x*x - y*y)/2; - basis_eval[ipt + 6*npts] = sqrt_10*radial_eval*x*(x*x - 3*y*y)/4; + basis_eval[ipt + 0*npts] = radial_eval*x1*x6; + basis_eval[ipt + 1*npts] = radial_eval*x9; + basis_eval[ipt + 2*npts] = radial_eval*x11*x15; + basis_eval[ipt + 3*npts] = radial_eval*x16*x19; + basis_eval[ipt + 4*npts] = radial_eval*x15*x20; + basis_eval[ipt + 5*npts] = radial_eval*x22*x23; + basis_eval[ipt + 6*npts] = radial_eval*x24*x26; // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = sqrt_10*x*y*(6*radial_eval + radial_eval_alpha*(3*x*x - y*y))/4; - basis_x_eval[ipt + 1*npts] = sqrt_15*y*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 2*npts] = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - basis_x_eval[ipt + 3*npts] = x*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; - basis_x_eval[ipt + 4*npts] = sqrt_6*(-radial_eval*(3*x*x + y*y - 4*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 4*z*z))/4; - basis_x_eval[ipt + 5*npts] = sqrt_15*x*z*(2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; - basis_x_eval[ipt + 6*npts] = sqrt_10*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y))/4; + basis_x_eval[ipt + 0*npts] = x27*(radial_eval_alpha*x6 + x28); + basis_x_eval[ipt + 1*npts] = x29*x8; + basis_x_eval[ipt + 2*npts] = x34; + basis_x_eval[ipt + 3*npts] = x35*x37; + basis_x_eval[ipt + 4*npts] = x10*(radial_eval*x38 + radial_eval_alpha*x39); + basis_x_eval[ipt + 5*npts] = x40*x42; + basis_x_eval[ipt + 6*npts] = x0*(radial_eval_alpha*x45 + x44); // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = sqrt_10*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y))/4; - basis_y_eval[ipt + 1*npts] = sqrt_15*x*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 2*npts] = sqrt_6*(-radial_eval*(x*x + 3*y*y - 4*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 4*z*z))/4; - basis_y_eval[ipt + 3*npts] = y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; - basis_y_eval[ipt + 4*npts] = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - basis_y_eval[ipt + 5*npts] = sqrt_15*y*z*(-2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; - basis_y_eval[ipt + 6*npts] = sqrt_10*x*y*(-6*radial_eval + radial_eval_alpha*(x*x - 3*y*y))/4; + basis_y_eval[ipt + 0*npts] = x0*(radial_eval_alpha*x46 + x44); + basis_y_eval[ipt + 1*npts] = x*x48*x7; + basis_y_eval[ipt + 2*npts] = x10*(radial_eval*x49 + radial_eval_alpha*x50); + basis_y_eval[ipt + 3*npts] = x16*x37*y; + basis_y_eval[ipt + 4*npts] = x34; + basis_y_eval[ipt + 5*npts] = x22*x51*y; + basis_y_eval[ipt + 6*npts] = x27*(radial_eval_alpha*x26 + x36); // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = sqrt_10*radial_eval_alpha*y*z*(3*x*x - y*y)/4; - basis_z_eval[ipt + 1*npts] = sqrt_15*x*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 2*npts] = sqrt_6*y*z*(8*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - basis_z_eval[ipt + 3*npts] = -3*radial_eval*(x*x + y*y - 2*z*z)/2 - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 2*z*z)/2; - basis_z_eval[ipt + 4*npts] = sqrt_6*x*z*(8*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - basis_z_eval[ipt + 5*npts] = sqrt_15*(radial_eval + radial_eval_alpha*z*z)*(x*x - y*y)/2; - basis_z_eval[ipt + 6*npts] = sqrt_10*radial_eval_alpha*x*z*(x*x - 3*y*y)/4; + basis_z_eval[ipt + 0*npts] = x1*x52*x6; + basis_z_eval[ipt + 1*npts] = x*x53*(radial_eval + x54); + basis_z_eval[ipt + 2*npts] = x11*x56*z; + basis_z_eval[ipt + 3*npts] = 
1.5*radial_eval*x57 + 0.5*radial_eval_alpha*x58; + basis_z_eval[ipt + 4*npts] = x20*x56*z; + basis_z_eval[ipt + 5*npts] = x21*(radial_eval*x23 + radial_eval_alpha*x59); + basis_z_eval[ipt + 6*npts] = x24*x26*x52; // Evaluate second derivative of bfn wrt xx - basis_xx_eval[ipt + 0*npts] = sqrt_10*y*(6*radial_eval + 12*radial_eval_alpha*x*x + (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(3*x*x - y*y))/4; - basis_xx_eval[ipt + 1*npts] = sqrt_15*x*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*x*x); - basis_xx_eval[ipt + 2*npts] = sqrt_6*y*(-2*radial_eval - 4*radial_eval_alpha*x*x - (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x + y*y - 4*z*z))/4; - basis_xx_eval[ipt + 3*npts] = z*(-6*radial_eval - 12*radial_eval_alpha*x*x - (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(3*x*x + 3*y*y - 2*z*z))/2; - basis_xx_eval[ipt + 4*npts] = sqrt_6*x*(-6*radial_eval - 2*radial_eval_alpha*(3*x*x + y*y - 4*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x + y*y - 4*z*z))/4; - basis_xx_eval[ipt + 5*npts] = sqrt_15*z*(2*radial_eval + 4*radial_eval_alpha*x*x + (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x - y*y))/2; - basis_xx_eval[ipt + 6*npts] = sqrt_10*x*(6*radial_eval + 6*radial_eval_alpha*(x*x - y*y) + (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x - 3*y*y))/4; + basis_xx_eval[ipt + 0*npts] = x1*(x62 + x65); + basis_xx_eval[ipt + 1*npts] = x9*(x60 + x66); + basis_xx_eval[ipt + 2*npts] = x11*(x15*x61 - x69); + basis_xx_eval[ipt + 3*npts] = x16*(x19*x61 - x65); + basis_xx_eval[ipt + 4*npts] = x20*(x36 + x72); + basis_xx_eval[ipt + 5*npts] = x22*(x69 + x73); + basis_xx_eval[ipt + 6*npts] = x24*(x28 + x75); // Evaluate second derivative of bfn wrt xy - basis_xy_eval[ipt + 0*npts] = sqrt_10*x*(6*radial_eval + 3*radial_eval_alpha*x*x + 3*radial_eval_alpha*y*y + 3*radial_eval_alpha_squared*x*x*y*y - radial_eval_alpha_squared*y*y*y*y)/4; - basis_xy_eval[ipt + 1*npts] = sqrt_15*z*(radial_eval + radial_eval_alpha*x*x + radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*y*y); - basis_xy_eval[ipt + 2*npts] = sqrt_6*x*(-2*radial_eval - 2*radial_eval_alpha*y*y - radial_eval_alpha*(x*x + 3*y*y - 4*z*z) - radial_eval_alpha_squared*y*y*(x*x + y*y - 4*z*z))/4; - basis_xy_eval[ipt + 3*npts] = x*y*z*(-12*radial_eval_alpha - radial_eval_alpha_squared*(3*x*x + 3*y*y - 2*z*z))/2; - basis_xy_eval[ipt + 4*npts] = sqrt_6*y*(-2*radial_eval - 2*radial_eval_alpha*x*x - radial_eval_alpha*(3*x*x + y*y - 4*z*z) - radial_eval_alpha_squared*x*x*(x*x + y*y - 4*z*z))/4; - basis_xy_eval[ipt + 5*npts] = sqrt_15*radial_eval_alpha_squared*x*y*z*(x*x - y*y)/2; - basis_xy_eval[ipt + 6*npts] = sqrt_10*y*(-6*radial_eval - 3*radial_eval_alpha*x*x - 3*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*x*x - 3*radial_eval_alpha_squared*x*x*y*y)/4; + basis_xy_eval[ipt + 0*npts] = x24*(x28 + x4*x76 + x78); + basis_xy_eval[ipt + 1*npts] = x7*(radial_eval_alpha_squared*x2*x4 + x29 + x47); + basis_xy_eval[ipt + 2*npts] = x20*(x32 - x4*x71 + x79); + basis_xy_eval[ipt + 3*npts] = x35*y*(radial_eval_alpha_squared*x19 - x63); + basis_xy_eval[ipt + 4*npts] = x11*(-x2*x71 + x32 + x80); + basis_xy_eval[ipt + 5*npts] = radial_eval_alpha_squared*x23*x40*y; + basis_xy_eval[ipt + 6*npts] = x1*(-x2*x76 + x36 + x81); // Evaluate second derivative of bfn wrt xz - basis_xz_eval[ipt + 0*npts] = sqrt_10*x*y*z*(6*radial_eval_alpha + radial_eval_alpha_squared*(3*x*x - y*y))/4; - basis_xz_eval[ipt + 1*npts] = sqrt_15*y*(radial_eval + radial_eval_alpha*x*x + radial_eval_alpha*z*z + 
radial_eval_alpha_squared*x*x*z*z); - basis_xz_eval[ipt + 2*npts] = sqrt_6*x*y*z*(6*radial_eval_alpha - radial_eval_alpha_squared*(x*x + y*y - 4*z*z))/4; - basis_xz_eval[ipt + 3*npts] = x*(-6*radial_eval - 3*radial_eval_alpha*x*x - 3*radial_eval_alpha*y*y - 3*radial_eval_alpha_squared*x*x*z*z - 3*radial_eval_alpha_squared*y*y*z*z + 2*radial_eval_alpha_squared*z*z*z*z)/2; - basis_xz_eval[ipt + 4*npts] = sqrt_6*z*(8*radial_eval + 8*radial_eval_alpha*x*x - radial_eval_alpha*(3*x*x + y*y - 4*z*z) - radial_eval_alpha_squared*x*x*(x*x + y*y - 4*z*z))/4; - basis_xz_eval[ipt + 5*npts] = sqrt_15*x*(2*radial_eval + 2*radial_eval_alpha*z*z + radial_eval_alpha*(x*x - y*y) + radial_eval_alpha_squared*z*z*(x*x - y*y))/2; - basis_xz_eval[ipt + 6*npts] = sqrt_10*z*(3*radial_eval_alpha*(x*x - y*y) + radial_eval_alpha_squared*x*x*(x*x - 3*y*y))/4; + basis_xz_eval[ipt + 0*npts] = x82*(radial_eval_alpha_squared*x6 + x76); + basis_xz_eval[ipt + 1*npts] = x53*(radial_eval_alpha_squared*x12*x2 + x29 + x54); + basis_xz_eval[ipt + 2*npts] = x83; + basis_xz_eval[ipt + 3*npts] = 0.5*x*x84; + basis_xz_eval[ipt + 4*npts] = x85*(x2*x86 + x55 + x80); + basis_xz_eval[ipt + 5*npts] = x*x21*(x42 + x87 + x88); + basis_xz_eval[ipt + 6*npts] = x81*x89; // Evaluate second derivative of bfn wrt yy - basis_yy_eval[ipt + 0*npts] = sqrt_10*y*(-6*radial_eval - 6*radial_eval_alpha*(-x*x + y*y) + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(3*x*x - y*y))/4; - basis_yy_eval[ipt + 1*npts] = sqrt_15*x*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*y*y); - basis_yy_eval[ipt + 2*npts] = sqrt_6*y*(-6*radial_eval - 2*radial_eval_alpha*(x*x + 3*y*y - 4*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x + y*y - 4*z*z))/4; - basis_yy_eval[ipt + 3*npts] = z*(-6*radial_eval - 12*radial_eval_alpha*y*y - (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(3*x*x + 3*y*y - 2*z*z))/2; - basis_yy_eval[ipt + 4*npts] = sqrt_6*x*(-2*radial_eval - 4*radial_eval_alpha*y*y - (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x + y*y - 4*z*z))/4; - basis_yy_eval[ipt + 5*npts] = sqrt_15*z*(-2*radial_eval - 4*radial_eval_alpha*y*y + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x - y*y))/2; - basis_yy_eval[ipt + 6*npts] = sqrt_10*x*(-6*radial_eval - 12*radial_eval_alpha*y*y + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x - 3*y*y))/4; + basis_yy_eval[ipt + 0*npts] = x1*(x36 + x92); + basis_yy_eval[ipt + 1*npts] = x9*(x66 + x90); + basis_yy_eval[ipt + 2*npts] = x11*(x36 + x94); + basis_yy_eval[ipt + 3*npts] = x16*(x19*x91 - x96); + basis_yy_eval[ipt + 4*npts] = x20*(x15*x91 - x98); + basis_yy_eval[ipt + 5*npts] = x22*(x23*x91 - x98); + basis_yy_eval[ipt + 6*npts] = x24*(x26*x91 - x96); // Evaluate second derivative of bfn wrt yz - basis_yz_eval[ipt + 0*npts] = sqrt_10*z*(-3*radial_eval_alpha*(-x*x + y*y) + radial_eval_alpha_squared*y*y*(3*x*x - y*y))/4; - basis_yz_eval[ipt + 1*npts] = sqrt_15*x*(radial_eval + radial_eval_alpha*y*y + radial_eval_alpha*z*z + radial_eval_alpha_squared*y*y*z*z); - basis_yz_eval[ipt + 2*npts] = sqrt_6*z*(8*radial_eval + 8*radial_eval_alpha*y*y - radial_eval_alpha*(x*x + 3*y*y - 4*z*z) - radial_eval_alpha_squared*y*y*(x*x + y*y - 4*z*z))/4; - basis_yz_eval[ipt + 3*npts] = y*(-6*radial_eval - 3*radial_eval_alpha*x*x - 3*radial_eval_alpha*y*y - 3*radial_eval_alpha_squared*x*x*z*z - 3*radial_eval_alpha_squared*y*y*z*z + 2*radial_eval_alpha_squared*z*z*z*z)/2; - basis_yz_eval[ipt + 4*npts] = sqrt_6*x*y*z*(6*radial_eval_alpha - radial_eval_alpha_squared*(x*x + y*y - 4*z*z))/4; - 
basis_yz_eval[ipt + 5*npts] = sqrt_15*y*(-2*radial_eval - 2*radial_eval_alpha*z*z + radial_eval_alpha*(x*x - y*y) + radial_eval_alpha_squared*z*z*(x*x - y*y))/2; - basis_yz_eval[ipt + 6*npts] = sqrt_10*x*y*z*(-6*radial_eval_alpha + radial_eval_alpha_squared*(x*x - 3*y*y))/4; + basis_yz_eval[ipt + 0*npts] = x78*x89; + basis_yz_eval[ipt + 1*npts] = sqrt_15*x*(radial_eval_alpha_squared*x12*x4 + x48 + x54); + basis_yz_eval[ipt + 2*npts] = x85*(x4*x86 + x55 + x79); + basis_yz_eval[ipt + 3*npts] = 0.5*x84*y; + basis_yz_eval[ipt + 4*npts] = x83; + basis_yz_eval[ipt + 5*npts] = x21*y*(x51 - x87 + x88); + basis_yz_eval[ipt + 6*npts] = x82*(radial_eval_alpha_squared*x26 - x76); // Evaluate second derivative of bfn wrt zz - basis_zz_eval[ipt + 0*npts] = sqrt_10*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z)*(3*x*x - y*y)/4; - basis_zz_eval[ipt + 1*npts] = sqrt_15*x*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z); - basis_zz_eval[ipt + 2*npts] = sqrt_6*y*(8*radial_eval + 16*radial_eval_alpha*z*z - (radial_eval_alpha + radial_eval_alpha_squared*z*z)*(x*x + y*y - 4*z*z))/4; - basis_zz_eval[ipt + 3*npts] = z*(12*radial_eval - 6*radial_eval_alpha*(x*x + y*y - 2*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*z*z)*(3*x*x + 3*y*y - 2*z*z))/2; - basis_zz_eval[ipt + 4*npts] = sqrt_6*x*(8*radial_eval + 16*radial_eval_alpha*z*z - (radial_eval_alpha + radial_eval_alpha_squared*z*z)*(x*x + y*y - 4*z*z))/4; - basis_zz_eval[ipt + 5*npts] = sqrt_15*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z)*(x*x - y*y)/2; - basis_zz_eval[ipt + 6*npts] = sqrt_10*x*(radial_eval_alpha + radial_eval_alpha_squared*z*z)*(x*x - 3*y*y)/4; + basis_zz_eval[ipt + 0*npts] = x1*x101; + basis_zz_eval[ipt + 1*npts] = x9*(x66 + x99); + basis_zz_eval[ipt + 2*npts] = x103*x11; + basis_zz_eval[ipt + 3*npts] = x16*(12.0*radial_eval + x104); + basis_zz_eval[ipt + 4*npts] = x103*x20; + basis_zz_eval[ipt + 5*npts] = x105*x22; + basis_zz_eval[ipt + 6*npts] = x106*x24; + + @@ -216,18 +331,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = sqrt_10*radial_eval*y*(3*x*x - y*y)/4; - ang_eval_1 = sqrt_15*radial_eval*x*y*z; - ang_eval_2 = sqrt_6*radial_eval*y*(-x*x - y*y + 4*z*z)/4; - ang_eval_3 = radial_eval*z*(-3*x*x - 3*y*y + 2*z*z)/2; + ang_eval_0 = radial_eval*x1*x6; + ang_eval_1 = radial_eval*x9; + ang_eval_2 = radial_eval*x11*x15; + ang_eval_3 = radial_eval*x16*x19; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = sqrt_6*radial_eval*x*(-x*x - y*y + 4*z*z)/4; - ang_eval_1 = sqrt_15*radial_eval*z*(x*x - y*y)/2; - ang_eval_2 = sqrt_10*radial_eval*x*(x*x - 3*y*y)/4; + ang_eval_0 = radial_eval*x15*x20; + ang_eval_1 = radial_eval*x22*x23; + ang_eval_2 = radial_eval*x24*x26; basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; basis_eval[ipt + 6*npts] = ang_eval_2; @@ -238,18 +353,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; - dang_eval_x_0 = sqrt_10*x*y*(6*radial_eval + radial_eval_alpha*(3*x*x - y*y))/4; - dang_eval_y_0 = sqrt_10*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y))/4; - dang_eval_z_0 = sqrt_10*radial_eval_alpha*y*z*(3*x*x - y*y)/4; - dang_eval_x_1 = sqrt_15*y*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = 
sqrt_15*x*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_1 = sqrt_15*x*y*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - dang_eval_y_2 = sqrt_6*(-radial_eval*(x*x + 3*y*y - 4*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 4*z*z))/4; - dang_eval_z_2 = sqrt_6*y*z*(8*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - dang_eval_x_3 = x*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; - dang_eval_y_3 = y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; - dang_eval_z_3 = -3*radial_eval*(x*x + y*y - 2*z*z)/2 - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 2*z*z)/2; + dang_eval_x_0 = x27*(radial_eval_alpha*x6 + x28); + dang_eval_y_0 = x0*(radial_eval_alpha*x46 + x44); + dang_eval_z_0 = x1*x52*x6; + dang_eval_x_1 = x29*x8; + dang_eval_y_1 = x*x48*x7; + dang_eval_z_1 = x*x53*(radial_eval + x54); + dang_eval_x_2 = x34; + dang_eval_y_2 = x10*(radial_eval*x49 + radial_eval_alpha*x50); + dang_eval_z_2 = x11*x56*z; + dang_eval_x_3 = x35*x37; + dang_eval_y_3 = x16*x37*y; + dang_eval_z_3 = 1.5*radial_eval*x57 + 0.5*radial_eval_alpha*x58; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; @@ -263,15 +378,15 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = sqrt_6*(-radial_eval*(3*x*x + y*y - 4*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 4*z*z))/4; - dang_eval_y_0 = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - dang_eval_z_0 = sqrt_6*x*z*(8*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - dang_eval_x_1 = sqrt_15*x*z*(2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; - dang_eval_y_1 = sqrt_15*y*z*(-2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; - dang_eval_z_1 = sqrt_15*(radial_eval + radial_eval_alpha*z*z)*(x*x - y*y)/2; - dang_eval_x_2 = sqrt_10*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y))/4; - dang_eval_y_2 = sqrt_10*x*y*(-6*radial_eval + radial_eval_alpha*(x*x - 3*y*y))/4; - dang_eval_z_2 = sqrt_10*radial_eval_alpha*x*z*(x*x - 3*y*y)/4; + dang_eval_x_0 = x10*(radial_eval*x38 + radial_eval_alpha*x39); + dang_eval_y_0 = x34; + dang_eval_z_0 = x20*x56*z; + dang_eval_x_1 = x40*x42; + dang_eval_y_1 = x22*x51*y; + dang_eval_z_1 = x21*(radial_eval*x23 + radial_eval_alpha*x59); + dang_eval_x_2 = x0*(radial_eval_alpha*x45 + x44); + dang_eval_y_2 = x27*(radial_eval_alpha*x26 + x36); + dang_eval_z_2 = x24*x26*x52; basis_x_eval[ipt + 4*npts] = dang_eval_x_0; basis_y_eval[ipt + 4*npts] = dang_eval_y_0; basis_z_eval[ipt + 4*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_lapgrad.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_lapgrad.hpp new file mode 100644 index 00000000..a58a8b4e --- /dev/null +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_lapgrad.hpp @@ -0,0 +1,514 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
+ *
+ * See LICENSE.txt for details
+ */
+#pragma once
+#include "collocation_device_constants.hpp"
+#include "device/xc_device_task.hpp"
+#include "device_specific/cuda_device_constants.hpp"
+#include "device/common/shell_to_task.hpp"
+#include 
+
+namespace GauXC {
+
+
+__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_spherical_lapgrad_3(
+  uint32_t nshell,
+  ShellToTaskDevice* __restrict__ shell_to_task,
+  XCDeviceTask* __restrict__ device_tasks
+) {
+
+
+  __shared__ double alpha[4][detail::shell_nprim_max + 1];
+  __shared__ double coeff[4][detail::shell_nprim_max + 1];
+  double* my_alpha = alpha[threadIdx.x/32];
+  double* my_coeff = coeff[threadIdx.x/32];
+
+  for( auto ish = blockIdx.z; ish < nshell; ish += gridDim.z ) {
+    const uint32_t ntasks = shell_to_task[ish].ntask;
+    const auto shell = shell_to_task[ish].shell_device;
+    const auto task_idx = shell_to_task[ish].task_idx_device;
+    const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device;
+
+
+    // Load Shell Data into registers / SM
+    const uint32_t nprim = shell->nprim();
+    const double3 O = *reinterpret_cast<const double3*>(shell->O_data());
+
+    const int global_warp_id = (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size;
+    const int nwarp_global = max((blockDim.x*gridDim.x) / cuda::warp_size,1);
+
+    // Read in coeffs/exps into SM on first warp
+    {
+      auto* coeff_gm = shell->coeff_data();
+      auto* alpha_gm = shell->alpha_data();
+      static_assert( detail::shell_nprim_max == cuda::warp_size );
+      const int warp_rank = threadIdx.x % cuda::warp_size;
+      my_alpha[warp_rank] = alpha_gm[warp_rank];
+      my_coeff[warp_rank] = coeff_gm[warp_rank];
+    }
+
+    // Loop over tasks assigned to shells
+    // Place each task on a different warp + schedule across blocks
+    for( int itask = global_warp_id; itask < ntasks; itask += nwarp_global ) {
+
+      const auto* task = device_tasks + task_idx[itask];
+      const auto* __restrict__ points_x = task->points_x;
+      const auto* __restrict__ points_y = task->points_y;
+      const auto* __restrict__ points_z = task->points_z;
+      const uint32_t npts = task->npts;
+      const size_t shoff = task_shell_offs[itask] * npts;
+
+      auto* __restrict__ basis_eval = task->bf + shoff;
+      auto* __restrict__ basis_x_eval = task->dbfx + shoff;
+      auto* __restrict__ basis_y_eval = task->dbfy + shoff;
+      auto* __restrict__ basis_z_eval = task->dbfz + shoff;
+      auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff;
+      auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff;
+      auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff;
+      auto* __restrict__ basis_yy_eval = task->d2bfyy + shoff;
+      auto* __restrict__ basis_yz_eval = task->d2bfyz + shoff;
+      auto* __restrict__ basis_zz_eval = task->d2bfzz + shoff;
+      auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff;
+      auto* __restrict__ basis_lapl_x_eval = task->d3bflapl_x + shoff;
+      auto* __restrict__ basis_lapl_y_eval = task->d3bflapl_y + shoff;
+      auto* __restrict__ basis_lapl_z_eval = task->d3bflapl_z + shoff;
+
+      // Loop over points in task
+      // Assign each point to separate thread within the warp
+      #pragma unroll 1
+      for( int ipt = threadIdx.x % cuda::warp_size; ipt < npts; ipt += cuda::warp_size ) {
+        //const double3 point = points[ipt];
+        double3 point;
+        point.x = points_x[ipt];
+        point.y = points_y[ipt];
+        point.z = points_z[ipt];
+
+
+        const auto x = point.x - O.x;
+        const auto y = point.y - O.y;
+        const auto z = point.z - O.z;
+        const auto rsq = x*x + y*y + z*z;
+
+        // Evaluate radial part of bfn
+        double radial_eval = 0.;
+        double radial_eval_alpha = 0.;
+ 
double radial_eval_alpha_squared = 0.; + double radial_eval_alpha_cubed = 0.; + + #pragma unroll 1 + for( uint32_t i = 0; i < nprim; ++i ) { + const auto a = my_alpha[i]; + const auto e = my_coeff[i] * std::exp( - a * rsq ); + + radial_eval += e; + radial_eval_alpha += a * e; + radial_eval_alpha_squared += a * a * e; + radial_eval_alpha_cubed += a * a * a * e; + } + + radial_eval_alpha *= -2; + radial_eval_alpha_squared *= 4; + radial_eval_alpha_cubed *= -8; + + // Common Subexpressions + const auto x0 = 0.25*sqrt_10; + const auto x1 = x0*y; + const auto x2 = x*x; + const auto x3 = x2; + const auto x4 = 3.0*x3; + const auto x5 = y*y; + const auto x6 = x5; + const auto x7 = -x6; + const auto x8 = x4 + x7; + const auto x9 = sqrt_15*z; + const auto x10 = x9*y; + const auto x11 = x*x10; + const auto x12 = 0.25*sqrt_6; + const auto x13 = x12*y; + const auto x14 = z*z; + const auto x15 = x14; + const auto x16 = -4.0*x15; + const auto x17 = x16 + x6; + const auto x18 = -x17 - x3; + const auto x19 = 0.5*z; + const auto x20 = 3.0*x6; + const auto x21 = -2.0*x15; + const auto x22 = -x20 - x21 - x4; + const auto x23 = x*x12; + const auto x24 = 0.5*sqrt_15; + const auto x25 = x24*z; + const auto x26 = x3 + x7; + const auto x27 = x*x0; + const auto x28 = -x20; + const auto x29 = x28 + x3; + const auto x30 = x*x1; + const auto x31 = 6.0*radial_eval; + const auto x32 = radial_eval + radial_eval_alpha*x3; + const auto x33 = x*x13; + const auto x34 = 2.0*radial_eval; + const auto x35 = -x34; + const auto x36 = radial_eval_alpha*x18; + const auto x37 = x33*(x35 + x36); + const auto x38 = x*x19; + const auto x39 = -x31; + const auto x40 = radial_eval_alpha*x22 + x39; + const auto x41 = -x17 - x4; + const auto x42 = x18*x3; + const auto x43 = x*x25; + const auto x44 = radial_eval_alpha*x26; + const auto x45 = x34 + x44; + const auto x46 = x28 + x4; + const auto x47 = radial_eval*x46; + const auto x48 = x29*x3; + const auto x49 = x6*x8; + const auto x50 = x*x9; + const auto x51 = radial_eval_alpha*x6; + const auto x52 = radial_eval + x51; + const auto x53 = -x16 - x20 - x3; + const auto x54 = x18*x6; + const auto x55 = x35 + x44; + const auto x56 = radial_eval_alpha*z; + const auto x57 = sqrt_15*y; + const auto x58 = x*x57; + const auto x59 = radial_eval_alpha*x15; + const auto x60 = 8.0*radial_eval; + const auto x61 = x36 + x60; + const auto x62 = -x21 - x3 - x6; + const auto x63 = x15*x22; + const auto x64 = x15*x26; + const auto x65 = radial_eval_alpha_squared*x3; + const auto x66 = radial_eval_alpha + x65; + const auto x67 = x66*x8; + const auto x68 = 12.0*radial_eval_alpha; + const auto x69 = x3*x68; + const auto x70 = x31 + x69; + const auto x71 = 3.0*radial_eval_alpha; + const auto x72 = 4.0*radial_eval_alpha; + const auto x73 = x3*x72; + const auto x74 = x34 + x73; + const auto x75 = x18*x66; + const auto x76 = 2.0*radial_eval_alpha; + const auto x77 = x41*x76 + x75; + const auto x78 = x26*x66; + const auto x79 = x46*x76; + const auto x80 = x29*x66 + x79; + const auto x81 = 6.0*radial_eval_alpha; + const auto x82 = radial_eval_alpha*x46; + const auto x83 = radial_eval_alpha_squared*x49 + x82; + const auto x84 = x3*x6; + const auto x85 = radial_eval_alpha*x53 + radial_eval_alpha_squared*x54; + const auto x86 = radial_eval_alpha*x41 + radial_eval_alpha_squared*x42; + const auto x87 = radial_eval_alpha_squared*x48 + x82; + const auto x88 = x30*z; + const auto x89 = x15*x3; + const auto x90 = x33*z*(radial_eval_alpha_squared*x18 + x81); + const auto x91 = radial_eval_alpha_squared*x63 - x15*x81 + x39 + 
x62*x71; + const auto x92 = x12*z; + const auto x93 = 8.0*radial_eval_alpha; + const auto x94 = x15*x76; + const auto x95 = radial_eval_alpha_squared*x64; + const auto x96 = x0*z; + const auto x97 = radial_eval_alpha_squared*x6; + const auto x98 = radial_eval_alpha + x97; + const auto x99 = x79 + x8*x98; + const auto x100 = x18*x98; + const auto x101 = x100 + x53*x76; + const auto x102 = x6*x68; + const auto x103 = x102 + x31; + const auto x104 = x6*x72; + const auto x105 = x104 + x34; + const auto x106 = x15*x6; + const auto x107 = radial_eval_alpha_squared*x15; + const auto x108 = radial_eval_alpha + x107; + const auto x109 = x108*x8; + const auto x110 = 16.0*radial_eval_alpha*x15; + const auto x111 = x108*x18 + x110; + const auto x112 = x111 + x60; + const auto x113 = x108*x22 + x62*x81; + const auto x114 = x108*x26; + const auto x115 = x114 + x26*x76; + const auto x116 = x108*x29; + const auto x117 = x107 + x97; + const auto x118 = -x73; + const auto x119 = -x102; + const auto x120 = -x69; + const auto x121 = x119 + x120; + const auto x122 = -x104; + const auto x123 = x122 + x26*x98 + x73 + x78; + const auto x124 = 3.0*radial_eval_alpha_squared; + const auto x125 = x*(radial_eval_alpha_cubed*(x*x) + x124); + const auto x126 = radial_eval_alpha_cubed*x6 + radial_eval_alpha_squared; + const auto x127 = x126*x8; + const auto x128 = radial_eval_alpha_cubed*x15 + radial_eval_alpha_squared; + const auto x129 = x128*x8; + const auto x130 = 2.0*x; + const auto x131 = radial_eval_alpha_squared*x130; + const auto x132 = 6.0*x; + const auto x133 = 24.0*radial_eval_alpha; + const auto x134 = x*x133 + 18.0*x*x66 + x108*x132 + x132*x98; + const auto x135 = 4.0*radial_eval_alpha_squared; + const auto x136 = x*x93; + const auto x137 = 16.0*radial_eval_alpha_squared; + const auto x138 = x132*x66; + const auto x139 = x130*x98; + const auto x140 = x108*x130; + const auto x141 = x126*x18; + const auto x142 = x128*x18; + const auto x143 = x125*x18; + const auto x144 = 12.0*radial_eval_alpha_squared; + const auto x145 = x110 - x135*x84; + const auto x146 = x126*x26; + const auto x147 = x128*x26; + const auto x148 = x46*x98; + const auto x149 = x46*x66; + const auto x150 = x126*x29; + const auto x151 = x128*x29; + const auto x152 = x144*x84; + const auto x153 = x108*x46 + x119 + x69; + const auto x154 = y*(radial_eval_alpha_cubed*(y*y) + x124); + const auto x155 = radial_eval_alpha_cubed*x3 + radial_eval_alpha_squared; + const auto x156 = x155*x8; + const auto x157 = x65 + x81; + const auto x158 = x154*x18; + const auto x159 = x155*x18; + const auto x160 = x133*y; + const auto x161 = 6.0*y; + const auto x162 = x161*x66; + const auto x163 = 18.0*x98*y; + const auto x164 = x108*x161; + const auto x165 = 2.0*y; + const auto x166 = radial_eval_alpha_squared*x165; + const auto x167 = -x108*x165 - x161*x98 - x165*x66 - x93*y; + const auto x168 = x155*x26; + const auto x169 = x155*x29; + const auto x170 = x144*z; + const auto x171 = 2.0*radial_eval_alpha_squared*z; + const auto x172 = x171*x46; + const auto x173 = z*(radial_eval_alpha_cubed*(z*z) + x124); + const auto x174 = x135*z; + const auto x175 = 8.0*z; + const auto x176 = 24.0*x108*z + x141*z + x159*z + x173*x18 + x175*x66 + x175*x98 + 32.0*x56; + + + // Evaluate basis function + basis_eval[ipt + 0*npts] = radial_eval*x1*x8; + basis_eval[ipt + 1*npts] = radial_eval*x11; + basis_eval[ipt + 2*npts] = radial_eval*x13*x18; + basis_eval[ipt + 3*npts] = radial_eval*x19*x22; + basis_eval[ipt + 4*npts] = radial_eval*x18*x23; + basis_eval[ipt + 5*npts] = 
radial_eval*x25*x26; + basis_eval[ipt + 6*npts] = radial_eval*x27*x29; + + + + // Evaluate first derivative of bfn wrt x + basis_x_eval[ipt + 0*npts] = x30*(radial_eval_alpha*x8 + x31); + basis_x_eval[ipt + 1*npts] = x10*x32; + basis_x_eval[ipt + 2*npts] = x37; + basis_x_eval[ipt + 3*npts] = x38*x40; + basis_x_eval[ipt + 4*npts] = x12*(radial_eval*x41 + radial_eval_alpha*x42); + basis_x_eval[ipt + 5*npts] = x43*x45; + basis_x_eval[ipt + 6*npts] = x0*(radial_eval_alpha*x48 + x47); + + // Evaluate first derivative of bfn wrt y + basis_y_eval[ipt + 0*npts] = x0*(radial_eval_alpha*x49 + x47); + basis_y_eval[ipt + 1*npts] = x50*x52; + basis_y_eval[ipt + 2*npts] = x12*(radial_eval*x53 + radial_eval_alpha*x54); + basis_y_eval[ipt + 3*npts] = x19*x40*y; + basis_y_eval[ipt + 4*npts] = x37; + basis_y_eval[ipt + 5*npts] = x25*x55*y; + basis_y_eval[ipt + 6*npts] = x30*(radial_eval_alpha*x29 + x39); + + // Evaluate first derivative of bfn wrt z + basis_z_eval[ipt + 0*npts] = x1*x56*x8; + basis_z_eval[ipt + 1*npts] = x58*(radial_eval + x59); + basis_z_eval[ipt + 2*npts] = x13*x61*z; + basis_z_eval[ipt + 3*npts] = 1.5*radial_eval*x62 + 0.5*radial_eval_alpha*x63; + basis_z_eval[ipt + 4*npts] = x23*x61*z; + basis_z_eval[ipt + 5*npts] = x24*(radial_eval*x26 + radial_eval_alpha*x64); + basis_z_eval[ipt + 6*npts] = x27*x29*x56; + + // Evaluate second derivative of bfn wrt xx + basis_xx_eval[ipt + 0*npts] = x1*(x67 + x70); + basis_xx_eval[ipt + 1*npts] = x11*(x65 + x71); + basis_xx_eval[ipt + 2*npts] = x13*(x18*x66 - x74); + basis_xx_eval[ipt + 3*npts] = x19*(x22*x66 - x70); + basis_xx_eval[ipt + 4*npts] = x23*(x39 + x77); + basis_xx_eval[ipt + 5*npts] = x25*(x74 + x78); + basis_xx_eval[ipt + 6*npts] = x27*(x31 + x80); + + // Evaluate second derivative of bfn wrt xy + basis_xy_eval[ipt + 0*npts] = x27*(x31 + x6*x81 + x83); + basis_xy_eval[ipt + 1*npts] = x9*(radial_eval_alpha_squared*x84 + x32 + x51); + basis_xy_eval[ipt + 2*npts] = x23*(x35 - x6*x76 + x85); + basis_xy_eval[ipt + 3*npts] = x38*y*(radial_eval_alpha_squared*x22 - x68); + basis_xy_eval[ipt + 4*npts] = x13*(-x3*x76 + x35 + x86); + basis_xy_eval[ipt + 5*npts] = radial_eval_alpha_squared*x26*x43*y; + basis_xy_eval[ipt + 6*npts] = x1*(-x3*x81 + x39 + x87); + + // Evaluate second derivative of bfn wrt xz + basis_xz_eval[ipt + 0*npts] = x88*(radial_eval_alpha_squared*x8 + x81); + basis_xz_eval[ipt + 1*npts] = x57*(radial_eval_alpha_squared*x89 + x32 + x59); + basis_xz_eval[ipt + 2*npts] = x90; + basis_xz_eval[ipt + 3*npts] = 0.5*x*x91; + basis_xz_eval[ipt + 4*npts] = x92*(x3*x93 + x60 + x86); + basis_xz_eval[ipt + 5*npts] = x*x24*(x45 + x94 + x95); + basis_xz_eval[ipt + 6*npts] = x87*x96; + + // Evaluate second derivative of bfn wrt yy + basis_yy_eval[ipt + 0*npts] = x1*(x39 + x99); + basis_yy_eval[ipt + 1*npts] = x11*(x71 + x97); + basis_yy_eval[ipt + 2*npts] = x13*(x101 + x39); + basis_yy_eval[ipt + 3*npts] = x19*(-x103 + x22*x98); + basis_yy_eval[ipt + 4*npts] = x23*(-x105 + x18*x98); + basis_yy_eval[ipt + 5*npts] = x25*(-x105 + x26*x98); + basis_yy_eval[ipt + 6*npts] = x27*(-x103 + x29*x98); + + // Evaluate second derivative of bfn wrt yz + basis_yz_eval[ipt + 0*npts] = x83*x96; + basis_yz_eval[ipt + 1*npts] = sqrt_15*x*(radial_eval_alpha_squared*x106 + x52 + x59); + basis_yz_eval[ipt + 2*npts] = x92*(x6*x93 + x60 + x85); + basis_yz_eval[ipt + 3*npts] = 0.5*x91*y; + basis_yz_eval[ipt + 4*npts] = x90; + basis_yz_eval[ipt + 5*npts] = x24*y*(x55 - x94 + x95); + basis_yz_eval[ipt + 6*npts] = x88*(radial_eval_alpha_squared*x29 - x81); + + // Evaluate 
second derivative of bfn wrt zz + basis_zz_eval[ipt + 0*npts] = x1*x109; + basis_zz_eval[ipt + 1*npts] = x11*(x107 + x71); + basis_zz_eval[ipt + 2*npts] = x112*x13; + basis_zz_eval[ipt + 3*npts] = x19*(12.0*radial_eval + x113); + basis_zz_eval[ipt + 4*npts] = x112*x23; + basis_zz_eval[ipt + 5*npts] = x115*x25; + basis_zz_eval[ipt + 6*npts] = x116*x27; + + // Evaluate Laplacian of bfn + basis_lapl_eval[ipt + 0*npts] = x1*(x109 + x67 + x69 + x99); + basis_lapl_eval[ipt + 1*npts] = x11*(9.0*radial_eval_alpha + x117 + x65); + basis_lapl_eval[ipt + 2*npts] = x13*(x101 + x111 + x118 + x75); + basis_lapl_eval[ipt + 3*npts] = x19*(x113 + x121 + x22*x66 + x22*x98); + basis_lapl_eval[ipt + 4*npts] = x23*(x100 + x111 + x122 + x77); + basis_lapl_eval[ipt + 5*npts] = x25*(x115 + x123); + basis_lapl_eval[ipt + 6*npts] = x27*(x116 + x119 + x29*x98 + x80); + + // Evaluate Laplacian gradient of bfn (dx) + basis_lapl_x_eval[ipt + 0*npts] = x1*(x*x127 + x*x129 + x125*x8 + x131*x46 + x134); + basis_lapl_x_eval[ipt + 1*npts] = x10*(x*x125 + x117 + x126*x2 + x128*x2 + x135*x3 + 3.0*x66 + x81); + basis_lapl_x_eval[ipt + 2*npts] = x13*(x*x137*x15 + x*x141 + x*x142 + x131*x53 - x136 - x138 - x139 - x140 + x143); + basis_lapl_x_eval[ipt + 3*npts] = x19*(6.0*radial_eval_alpha_squared*x*x62 + x*x126*x22 + x*x128*x22 - x*x144*x6 + x125*x22 - x134); + basis_lapl_x_eval[ipt + 4*npts] = x12*(x*x143 + x108*x41 + x120 + x122 + x137*x89 + x141*x2 + x142*x2 + x145 + 3.0*x41*x66 + x41*x98); + basis_lapl_x_eval[ipt + 5*npts] = x25*(-x*x135*x6 + x*x146 + x*x147 + x125*x26 + x131*x26 + x136 + x138 + x139 + x140); + basis_lapl_x_eval[ipt + 6*npts] = x0*(x*x125*x29 + x148 + 3.0*x149 + x150*x2 + x151*x2 - x152 + x153); + // Evaluate Laplacian gradient of bfn (dy) + basis_lapl_y_eval[ipt + 0*npts] = x0*(x129*x5 + 3.0*x148 + x149 + x152 + x153 + x154*x8*y + x156*x5); + basis_lapl_y_eval[ipt + 1*npts] = x50*(x107 + x128*x5 + x135*x6 + x154*y + x155*x5 + x157 + 3.0*x98); + basis_lapl_y_eval[ipt + 2*npts] = x12*(x106*x137 + x108*x53 + x118 + x119 + x142*x5 + x145 + x158*y + x159*x5 + x53*x66 + 3.0*x53*x98); + basis_lapl_y_eval[ipt + 3*npts] = -x19*(-6.0*radial_eval_alpha_squared*x62*y - x128*x22*y + x144*x3*y - x154*x22 - x155*x22*y + x160 + x162 + x163 + x164); + basis_lapl_y_eval[ipt + 4*npts] = x23*(x137*x15*y + x142*y + x158 + x159*y + x166*x41 + x167); + basis_lapl_y_eval[ipt + 5*npts] = x25*(x135*x3*y + x147*y + x154*x26 + x166*x26 + x167 + x168*y); + basis_lapl_y_eval[ipt + 6*npts] = x27*(x151*y + x154*x29 - x160 - x162 - x163 - x164 + x166*x46 + x169*y); + // Evaluate Laplacian gradient of bfn (dz) + basis_lapl_z_eval[ipt + 0*npts] = x1*(x127*z + x156*z + x170*x3 + x172 + x173*x8); + basis_lapl_z_eval[ipt + 1*npts] = x58*(3.0*x108 + x126*x14 + x135*x15 + x14*x155 + x157 + x173*z + x97); + basis_lapl_z_eval[ipt + 2*npts] = x13*(x171*x53 - x174*x3 + x176); + basis_lapl_z_eval[ipt + 3*npts] = -0.5*x106*x144 + 4.5*x108*x62 + 0.5*x121 + 0.5*x126*x14*x22 + 0.5*x133*x15 + 0.5*x14*x155*x22 - 0.5*x144*x89 + 0.5*x173*x22*z + 1.5*x62*x66 + 1.5*x62*x98; + basis_lapl_z_eval[ipt + 4*npts] = x23*(x171*x41 - x174*x6 + x176); + basis_lapl_z_eval[ipt + 5*npts] = x24*(-x106*x135 + 3.0*x114 + x123 + x135*x89 + x14*x146 + x14*x168 + x173*x26*z); + basis_lapl_z_eval[ipt + 6*npts] = x27*(x150*z + x169*z - x170*x6 + x172 + x173*x29); + + + + +#if 0 + // Evaluate the angular part of bfn + + + + double ang_eval_0; + double ang_eval_1; + double ang_eval_2; + double ang_eval_3; + + + ang_eval_0 = radial_eval*x1*x8; + ang_eval_1 = radial_eval*x11; + 
ang_eval_2 = radial_eval*x13*x18; + ang_eval_3 = radial_eval*x19*x22; + basis_eval[ipt + 0*npts] = ang_eval_0; + basis_eval[ipt + 1*npts] = ang_eval_1; + basis_eval[ipt + 2*npts] = ang_eval_2; + basis_eval[ipt + 3*npts] = ang_eval_3; + + ang_eval_0 = radial_eval*x18*x23; + ang_eval_1 = radial_eval*x25*x26; + ang_eval_2 = radial_eval*x27*x29; + basis_eval[ipt + 4*npts] = ang_eval_0; + basis_eval[ipt + 5*npts] = ang_eval_1; + basis_eval[ipt + 6*npts] = ang_eval_2; + + + double dang_eval_x_0, dang_eval_y_0, dang_eval_z_0; + double dang_eval_x_1, dang_eval_y_1, dang_eval_z_1; + double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; + double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; + + dang_eval_x_0 = x30*(radial_eval_alpha*x8 + x31); + dang_eval_y_0 = x0*(radial_eval_alpha*x49 + x47); + dang_eval_z_0 = x1*x56*x8; + dang_eval_x_1 = x10*x32; + dang_eval_y_1 = x50*x52; + dang_eval_z_1 = x58*(radial_eval + x59); + dang_eval_x_2 = x37; + dang_eval_y_2 = x12*(radial_eval*x53 + radial_eval_alpha*x54); + dang_eval_z_2 = x13*x61*z; + dang_eval_x_3 = x38*x40; + dang_eval_y_3 = x19*x40*y; + dang_eval_z_3 = 1.5*radial_eval*x62 + 0.5*radial_eval_alpha*x63; + basis_x_eval[ipt + 0*npts] = dang_eval_x_0; + basis_y_eval[ipt + 0*npts] = dang_eval_y_0; + basis_z_eval[ipt + 0*npts] = dang_eval_z_0; + basis_x_eval[ipt + 1*npts] = dang_eval_x_1; + basis_y_eval[ipt + 1*npts] = dang_eval_y_1; + basis_z_eval[ipt + 1*npts] = dang_eval_z_1; + basis_x_eval[ipt + 2*npts] = dang_eval_x_2; + basis_y_eval[ipt + 2*npts] = dang_eval_y_2; + basis_z_eval[ipt + 2*npts] = dang_eval_z_2; + basis_x_eval[ipt + 3*npts] = dang_eval_x_3; + basis_y_eval[ipt + 3*npts] = dang_eval_y_3; + basis_z_eval[ipt + 3*npts] = dang_eval_z_3; + + dang_eval_x_0 = x12*(radial_eval*x41 + radial_eval_alpha*x42); + dang_eval_y_0 = x37; + dang_eval_z_0 = x23*x61*z; + dang_eval_x_1 = x43*x45; + dang_eval_y_1 = x25*x55*y; + dang_eval_z_1 = x24*(radial_eval*x26 + radial_eval_alpha*x64); + dang_eval_x_2 = x0*(radial_eval_alpha*x48 + x47); + dang_eval_y_2 = x30*(radial_eval_alpha*x29 + x39); + dang_eval_z_2 = x27*x29*x56; + basis_x_eval[ipt + 4*npts] = dang_eval_x_0; + basis_y_eval[ipt + 4*npts] = dang_eval_y_0; + basis_z_eval[ipt + 4*npts] = dang_eval_z_0; + basis_x_eval[ipt + 5*npts] = dang_eval_x_1; + basis_y_eval[ipt + 5*npts] = dang_eval_y_1; + basis_z_eval[ipt + 5*npts] = dang_eval_z_1; + basis_x_eval[ipt + 6*npts] = dang_eval_x_2; + basis_y_eval[ipt + 6*npts] = dang_eval_y_2; + basis_z_eval[ipt + 6*npts] = dang_eval_z_2; + +#endif + } // Loop over points within task + } // Loop over tasks + + } // Loop over shells +} // end kernel + +} // namespace GauXC diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_laplacian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_laplacian.hpp index 18dff71c..d5f8f3af 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_laplacian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l3_laplacian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). 
+ * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_spherical_laplacian_3( +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_spherical_laplacian_3( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; // Loop over points in task @@ -103,54 +106,166 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = 0.25*sqrt_10; + const auto x1 = x0*y; + const auto x2 = x*x; + const auto x3 = 3.0*x2; + const auto x4 = y*y; + const auto x5 = -x4; + const auto x6 = x3 + x5; + const auto x7 = sqrt_15*z; + const auto x8 = x7*y; + const auto x9 = x*x8; + const auto x10 = 0.25*sqrt_6; + const auto x11 = x10*y; + const auto x12 = z*z; + const auto x13 = -4.0*x12; + const auto x14 = x13 + x4; + const auto x15 = -x14 - x2; + const auto x16 = 0.5*z; + const auto x17 = 3.0*x4; + const auto x18 = -2.0*x12; + const auto x19 = -x17 - x18 - x3; + const auto x20 = x*x10; + const auto x21 = 0.5*sqrt_15; + const auto x22 = x21*z; + const auto x23 = x2 + x5; + const auto x24 = x*x0; + const auto x25 = -x17; + const auto x26 = x2 + x25; + const auto x27 = x*x1; + const auto x28 = 6.0*radial_eval; + const auto x29 = radial_eval + radial_eval_alpha*x2; + const auto x30 = x*x11; + const auto x31 = 2.0*radial_eval; + const auto x32 = -x31; + const auto x33 = radial_eval_alpha*x15; + const auto x34 = x30*(x32 + x33); + const auto x35 = x*x16; + const auto x36 = -x28; + const auto x37 = radial_eval_alpha*x19 + x36; + const auto x38 = -x14 - x3; + const auto x39 = x15*x2; + const auto x40 = x*x22; + const auto x41 = radial_eval_alpha*x23; + const auto x42 = x31 + x41; + const auto x43 = x25 + x3; + const auto x44 = radial_eval*x43; + const auto x45 = x2*x26; + const auto x46 = x4*x6; + const auto x47 = radial_eval_alpha*x4; + const auto x48 = radial_eval + x47; + const auto x49 = -x13 - x17 - x2; + const auto x50 = x15*x4; + const auto x51 = x32 + x41; + const auto x52 = radial_eval_alpha*z; + const auto x53 = sqrt_15*y; + const auto x54 = radial_eval_alpha*x12; + const auto x55 = 8.0*radial_eval; + const auto x56 = x33 + x55; + const auto x57 = -x18 - x2 - x4; + const auto x58 = x12*x19; + const auto x59 = x12*x23; + const auto x60 = radial_eval_alpha_squared*x2; + const auto x61 = radial_eval_alpha + x60; + const auto x62 = x6*x61; + const auto x63 = 12.0*radial_eval_alpha; + const auto x64 = x2*x63; + const auto x65 = x28 + x64; + const auto x66 = 3.0*radial_eval_alpha; + const auto x67 = 4.0*radial_eval_alpha; + const auto x68 = x2*x67; + const auto x69 = x31 + x68; + const auto x70 = x15*x61; + const 
auto x71 = 2.0*radial_eval_alpha; + const auto x72 = x38*x71 + x70; + const auto x73 = x23*x61; + const auto x74 = x43*x71; + const auto x75 = x26*x61 + x74; + const auto x76 = 6.0*radial_eval_alpha; + const auto x77 = radial_eval_alpha*x43; + const auto x78 = radial_eval_alpha_squared*x46 + x77; + const auto x79 = radial_eval_alpha*x49 + radial_eval_alpha_squared*x50; + const auto x80 = radial_eval_alpha*x38 + radial_eval_alpha_squared*x39; + const auto x81 = radial_eval_alpha_squared*x45 + x77; + const auto x82 = x27*z; + const auto x83 = x30*z*(radial_eval_alpha_squared*x15 + x76); + const auto x84 = radial_eval_alpha_squared*x58 - x12*x76 + x36 + x57*x66; + const auto x85 = x10*z; + const auto x86 = 8.0*radial_eval_alpha; + const auto x87 = x12*x71; + const auto x88 = radial_eval_alpha_squared*x59; + const auto x89 = x0*z; + const auto x90 = radial_eval_alpha_squared*x4; + const auto x91 = radial_eval_alpha + x90; + const auto x92 = x6*x91 + x74; + const auto x93 = x15*x91; + const auto x94 = x49*x71 + x93; + const auto x95 = x4*x63; + const auto x96 = x28 + x95; + const auto x97 = x4*x67; + const auto x98 = x31 + x97; + const auto x99 = radial_eval_alpha_squared*x12; + const auto x100 = radial_eval_alpha + x99; + const auto x101 = x100*x6; + const auto x102 = 16.0*radial_eval_alpha*x12 + x100*x15; + const auto x103 = x102 + x55; + const auto x104 = x100*x19 + x57*x76; + const auto x105 = x23*(x100 + x71); + const auto x106 = x100*x26; + const auto x107 = -x95; + const auto x108 = -x97; + // Evaluate basis function - basis_eval[ipt + 0*npts] = sqrt_10*radial_eval*y*(3*x*x - y*y)/4; - basis_eval[ipt + 1*npts] = sqrt_15*radial_eval*x*y*z; - basis_eval[ipt + 2*npts] = sqrt_6*radial_eval*y*(-x*x - y*y + 4*z*z)/4; - basis_eval[ipt + 3*npts] = radial_eval*z*(-3*x*x - 3*y*y + 2*z*z)/2; - basis_eval[ipt + 4*npts] = sqrt_6*radial_eval*x*(-x*x - y*y + 4*z*z)/4; - basis_eval[ipt + 5*npts] = sqrt_15*radial_eval*z*(x*x - y*y)/2; - basis_eval[ipt + 6*npts] = sqrt_10*radial_eval*x*(x*x - 3*y*y)/4; + basis_eval[ipt + 0*npts] = radial_eval*x1*x6; + basis_eval[ipt + 1*npts] = radial_eval*x9; + basis_eval[ipt + 2*npts] = radial_eval*x11*x15; + basis_eval[ipt + 3*npts] = radial_eval*x16*x19; + basis_eval[ipt + 4*npts] = radial_eval*x15*x20; + basis_eval[ipt + 5*npts] = radial_eval*x22*x23; + basis_eval[ipt + 6*npts] = radial_eval*x24*x26; // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = sqrt_10*x*y*(6*radial_eval + radial_eval_alpha*(3*x*x - y*y))/4; - basis_x_eval[ipt + 1*npts] = sqrt_15*y*z*(radial_eval + radial_eval_alpha*x*x); - basis_x_eval[ipt + 2*npts] = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - basis_x_eval[ipt + 3*npts] = x*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; - basis_x_eval[ipt + 4*npts] = sqrt_6*(-radial_eval*(3*x*x + y*y - 4*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 4*z*z))/4; - basis_x_eval[ipt + 5*npts] = sqrt_15*x*z*(2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; - basis_x_eval[ipt + 6*npts] = sqrt_10*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y))/4; + basis_x_eval[ipt + 0*npts] = x27*(radial_eval_alpha*x6 + x28); + basis_x_eval[ipt + 1*npts] = x29*x8; + basis_x_eval[ipt + 2*npts] = x34; + basis_x_eval[ipt + 3*npts] = x35*x37; + basis_x_eval[ipt + 4*npts] = x10*(radial_eval*x38 + radial_eval_alpha*x39); + basis_x_eval[ipt + 5*npts] = x40*x42; + basis_x_eval[ipt + 6*npts] = x0*(radial_eval_alpha*x45 + x44); // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = 
sqrt_10*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y))/4; - basis_y_eval[ipt + 1*npts] = sqrt_15*x*z*(radial_eval + radial_eval_alpha*y*y); - basis_y_eval[ipt + 2*npts] = sqrt_6*(-radial_eval*(x*x + 3*y*y - 4*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 4*z*z))/4; - basis_y_eval[ipt + 3*npts] = y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; - basis_y_eval[ipt + 4*npts] = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - basis_y_eval[ipt + 5*npts] = sqrt_15*y*z*(-2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; - basis_y_eval[ipt + 6*npts] = sqrt_10*x*y*(-6*radial_eval + radial_eval_alpha*(x*x - 3*y*y))/4; + basis_y_eval[ipt + 0*npts] = x0*(radial_eval_alpha*x46 + x44); + basis_y_eval[ipt + 1*npts] = x*x48*x7; + basis_y_eval[ipt + 2*npts] = x10*(radial_eval*x49 + radial_eval_alpha*x50); + basis_y_eval[ipt + 3*npts] = x16*x37*y; + basis_y_eval[ipt + 4*npts] = x34; + basis_y_eval[ipt + 5*npts] = x22*x51*y; + basis_y_eval[ipt + 6*npts] = x27*(radial_eval_alpha*x26 + x36); // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = sqrt_10*radial_eval_alpha*y*z*(3*x*x - y*y)/4; - basis_z_eval[ipt + 1*npts] = sqrt_15*x*y*(radial_eval + radial_eval_alpha*z*z); - basis_z_eval[ipt + 2*npts] = sqrt_6*y*z*(8*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - basis_z_eval[ipt + 3*npts] = -3*radial_eval*(x*x + y*y - 2*z*z)/2 - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 2*z*z)/2; - basis_z_eval[ipt + 4*npts] = sqrt_6*x*z*(8*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - basis_z_eval[ipt + 5*npts] = sqrt_15*(radial_eval + radial_eval_alpha*z*z)*(x*x - y*y)/2; - basis_z_eval[ipt + 6*npts] = sqrt_10*radial_eval_alpha*x*z*(x*x - 3*y*y)/4; + basis_z_eval[ipt + 0*npts] = x1*x52*x6; + basis_z_eval[ipt + 1*npts] = x*x53*(radial_eval + x54); + basis_z_eval[ipt + 2*npts] = x11*x56*z; + basis_z_eval[ipt + 3*npts] = 1.5*radial_eval*x57 + 0.5*radial_eval_alpha*x58; + basis_z_eval[ipt + 4*npts] = x20*x56*z; + basis_z_eval[ipt + 5*npts] = x21*(radial_eval*x23 + radial_eval_alpha*x59); + basis_z_eval[ipt + 6*npts] = x24*x26*x52; + // Evaluate Laplacian of bfn - basis_lapl_eval[ipt + 0*npts] = sqrt_10*y*(27*radial_eval_alpha*x*x - 9*radial_eval_alpha*y*y + 3*radial_eval_alpha_squared*x*x*x*x + 2*radial_eval_alpha_squared*x*x*y*y + 3*radial_eval_alpha_squared*x*x*z*z - radial_eval_alpha_squared*y*y*y*y - radial_eval_alpha_squared*y*y*z*z)/4; - basis_lapl_eval[ipt + 1*npts] = sqrt_15*x*y*z*(9*radial_eval_alpha + radial_eval_alpha_squared*x*x + radial_eval_alpha_squared*y*y + radial_eval_alpha_squared*z*z); - basis_lapl_eval[ipt + 2*npts] = sqrt_6*y*(-9*radial_eval_alpha*x*x - 9*radial_eval_alpha*y*y + 36*radial_eval_alpha*z*z - radial_eval_alpha_squared*x*x*x*x - 2*radial_eval_alpha_squared*x*x*y*y + 3*radial_eval_alpha_squared*x*x*z*z - radial_eval_alpha_squared*y*y*y*y + 3*radial_eval_alpha_squared*y*y*z*z + 4*radial_eval_alpha_squared*z*z*z*z)/4; - basis_lapl_eval[ipt + 3*npts] = z*(-27*radial_eval_alpha*x*x - 27*radial_eval_alpha*y*y + 18*radial_eval_alpha*z*z - 3*radial_eval_alpha_squared*x*x*x*x - 6*radial_eval_alpha_squared*x*x*y*y - radial_eval_alpha_squared*x*x*z*z - 3*radial_eval_alpha_squared*y*y*y*y - radial_eval_alpha_squared*y*y*z*z + 2*radial_eval_alpha_squared*z*z*z*z)/2; - basis_lapl_eval[ipt + 4*npts] = sqrt_6*x*(-9*radial_eval_alpha*x*x - 9*radial_eval_alpha*y*y + 36*radial_eval_alpha*z*z - radial_eval_alpha_squared*x*x*x*x - 2*radial_eval_alpha_squared*x*x*y*y + 3*radial_eval_alpha_squared*x*x*z*z - 
radial_eval_alpha_squared*y*y*y*y + 3*radial_eval_alpha_squared*y*y*z*z + 4*radial_eval_alpha_squared*z*z*z*z)/4; - basis_lapl_eval[ipt + 5*npts] = sqrt_15*z*(9*radial_eval_alpha*x*x - 9*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*x*x + radial_eval_alpha_squared*x*x*z*z - radial_eval_alpha_squared*y*y*y*y - radial_eval_alpha_squared*y*y*z*z)/2; - basis_lapl_eval[ipt + 6*npts] = sqrt_10*x*(9*radial_eval_alpha*x*x - 27*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*x*x - 2*radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*x*x*z*z - 3*radial_eval_alpha_squared*y*y*y*y - 3*radial_eval_alpha_squared*y*y*z*z)/4; + basis_lapl_eval[ipt + 0*npts] = x1*(x101 + x62 + x64 + x92); + basis_lapl_eval[ipt + 1*npts] = x9*(9.0*radial_eval_alpha + x60 + x90 + x99); + basis_lapl_eval[ipt + 2*npts] = x11*(x102 - x68 + x70 + x94); + basis_lapl_eval[ipt + 3*npts] = x16*(x104 + x107 + x19*x61 + x19*x91 - x64); + basis_lapl_eval[ipt + 4*npts] = x20*(x102 + x108 + x72 + x93); + basis_lapl_eval[ipt + 5*npts] = x22*(x105 + x108 + x23*x91 + x68 + x73); + basis_lapl_eval[ipt + 6*npts] = x24*(x106 + x107 + x26*x91 + x75); + @@ -166,18 +281,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = sqrt_10*radial_eval*y*(3*x*x - y*y)/4; - ang_eval_1 = sqrt_15*radial_eval*x*y*z; - ang_eval_2 = sqrt_6*radial_eval*y*(-x*x - y*y + 4*z*z)/4; - ang_eval_3 = radial_eval*z*(-3*x*x - 3*y*y + 2*z*z)/2; + ang_eval_0 = radial_eval*x1*x6; + ang_eval_1 = radial_eval*x9; + ang_eval_2 = radial_eval*x11*x15; + ang_eval_3 = radial_eval*x16*x19; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = sqrt_6*radial_eval*x*(-x*x - y*y + 4*z*z)/4; - ang_eval_1 = sqrt_15*radial_eval*z*(x*x - y*y)/2; - ang_eval_2 = sqrt_10*radial_eval*x*(x*x - 3*y*y)/4; + ang_eval_0 = radial_eval*x15*x20; + ang_eval_1 = radial_eval*x22*x23; + ang_eval_2 = radial_eval*x24*x26; basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; basis_eval[ipt + 6*npts] = ang_eval_2; @@ -188,18 +303,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; - dang_eval_x_0 = sqrt_10*x*y*(6*radial_eval + radial_eval_alpha*(3*x*x - y*y))/4; - dang_eval_y_0 = sqrt_10*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y))/4; - dang_eval_z_0 = sqrt_10*radial_eval_alpha*y*z*(3*x*x - y*y)/4; - dang_eval_x_1 = sqrt_15*y*z*(radial_eval + radial_eval_alpha*x*x); - dang_eval_y_1 = sqrt_15*x*z*(radial_eval + radial_eval_alpha*y*y); - dang_eval_z_1 = sqrt_15*x*y*(radial_eval + radial_eval_alpha*z*z); - dang_eval_x_2 = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - dang_eval_y_2 = sqrt_6*(-radial_eval*(x*x + 3*y*y - 4*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 4*z*z))/4; - dang_eval_z_2 = sqrt_6*y*z*(8*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - dang_eval_x_3 = x*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; - dang_eval_y_3 = y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 2*z*z))/2; - dang_eval_z_3 = -3*radial_eval*(x*x + y*y - 2*z*z)/2 - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 2*z*z)/2; + dang_eval_x_0 = x27*(radial_eval_alpha*x6 + x28); + dang_eval_y_0 = x0*(radial_eval_alpha*x46 + x44); + dang_eval_z_0 = x1*x52*x6; + dang_eval_x_1 = x29*x8; + 
dang_eval_y_1 = x*x48*x7; + dang_eval_z_1 = x*x53*(radial_eval + x54); + dang_eval_x_2 = x34; + dang_eval_y_2 = x10*(radial_eval*x49 + radial_eval_alpha*x50); + dang_eval_z_2 = x11*x56*z; + dang_eval_x_3 = x35*x37; + dang_eval_y_3 = x16*x37*y; + dang_eval_z_3 = 1.5*radial_eval*x57 + 0.5*radial_eval_alpha*x58; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; @@ -213,15 +328,15 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = sqrt_6*(-radial_eval*(3*x*x + y*y - 4*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 4*z*z))/4; - dang_eval_y_0 = sqrt_6*x*y*(-2*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - dang_eval_z_0 = sqrt_6*x*z*(8*radial_eval - radial_eval_alpha*(x*x + y*y - 4*z*z))/4; - dang_eval_x_1 = sqrt_15*x*z*(2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; - dang_eval_y_1 = sqrt_15*y*z*(-2*radial_eval + radial_eval_alpha*(x*x - y*y))/2; - dang_eval_z_1 = sqrt_15*(radial_eval + radial_eval_alpha*z*z)*(x*x - y*y)/2; - dang_eval_x_2 = sqrt_10*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y))/4; - dang_eval_y_2 = sqrt_10*x*y*(-6*radial_eval + radial_eval_alpha*(x*x - 3*y*y))/4; - dang_eval_z_2 = sqrt_10*radial_eval_alpha*x*z*(x*x - 3*y*y)/4; + dang_eval_x_0 = x10*(radial_eval*x38 + radial_eval_alpha*x39); + dang_eval_y_0 = x34; + dang_eval_z_0 = x20*x56*z; + dang_eval_x_1 = x40*x42; + dang_eval_y_1 = x22*x51*y; + dang_eval_z_1 = x21*(radial_eval*x23 + radial_eval_alpha*x59); + dang_eval_x_2 = x0*(radial_eval_alpha*x45 + x44); + dang_eval_y_2 = x27*(radial_eval_alpha*x26 + x36); + dang_eval_z_2 = x24*x26*x52; basis_x_eval[ipt + 4*npts] = dang_eval_x_0; basis_y_eval[ipt + 4*npts] = dang_eval_y_0; basis_z_eval[ipt + 4*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4.hpp index a7a11723..1f48ecb0 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -64,7 +68,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_eval = task->bf + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -93,18 +96,37 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel } - + // Common Subexpressions + const auto x0 = 0.5*radial_eval*x*y; + const auto x1 = x*x; + const auto x2 = y*y; + const auto x3 = -x2; + const auto x4 = 0.25*radial_eval; + const auto x5 = x4*z; + const auto x6 = x5*y; + const auto x7 = 3.0*x1; + const auto x8 = z*z; + const auto x9 = 3.0*x2; + const auto x10 = -x7 + 4.0*x8 - x9; + const auto x11 = 0.125*radial_eval; + const auto x12 = x*x*x*x; + const auto x13 = y*y*y*y; + const auto x14 = 6.0*x1*x2; + const auto x15 = x1*x8; + const auto x16 = x2*x8; + const auto x17 = x*x5; + // Evaluate basis function - basis_eval[ipt + 0*npts] = sqrt_35*radial_eval*x*y*(x*x - y*y)/2; - basis_eval[ipt + 1*npts] = sqrt_70*radial_eval*y*z*(3*x*x - y*y)/4; - basis_eval[ipt + 2*npts] = sqrt_5*radial_eval*x*y*(-x*x - y*y + 6*z*z)/2; - basis_eval[ipt + 3*npts] = sqrt_10*radial_eval*y*z*(-3*x*x - 3*y*y + 4*z*z)/4; - basis_eval[ipt + 4*npts] = radial_eval*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; - basis_eval[ipt + 5*npts] = sqrt_10*radial_eval*x*z*(-3*x*x - 3*y*y + 4*z*z)/4; - basis_eval[ipt + 6*npts] = sqrt_5*radial_eval*(-x*x*x*x + 6*x*x*z*z + y*y*y*y - 6*y*y*z*z)/4; - basis_eval[ipt + 7*npts] = sqrt_70*radial_eval*x*z*(x*x - 3*y*y)/4; - basis_eval[ipt + 8*npts] = sqrt_35*radial_eval*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + basis_eval[ipt + 0*npts] = sqrt_35*x0*(x1 + x3); + basis_eval[ipt + 1*npts] = sqrt_70*x6*(x3 + x7); + basis_eval[ipt + 2*npts] = -sqrt_5*x0*(x1 + x2 - 6.0*x8); + basis_eval[ipt + 3*npts] = sqrt_10*x10*x6; + basis_eval[ipt + 4*npts] = x11*(3.0*x12 + 3.0*x13 + x14 - 24.0*x15 - 24.0*x16 + 8.0*(z*z*z*z)); + basis_eval[ipt + 5*npts] = sqrt_10*x10*x17; + basis_eval[ipt + 6*npts] = -sqrt_5*x4*(x12 - x13 - 6.0*x15 + 6.0*x16); + basis_eval[ipt + 7*npts] = sqrt_70*x17*(x1 - x9); + basis_eval[ipt + 8*npts] = sqrt_35*x11*(x12 + x13 - x14); @@ -113,6 +135,8 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel + + #if 0 // Evaluate the angular part of bfn @@ -124,25 +148,25 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = sqrt_35*radial_eval*x*y*(x*x - y*y)/2; - ang_eval_1 = sqrt_70*radial_eval*y*z*(3*x*x - y*y)/4; - ang_eval_2 = sqrt_5*radial_eval*x*y*(-x*x - y*y + 6*z*z)/2; - ang_eval_3 = sqrt_10*radial_eval*y*z*(-3*x*x - 3*y*y + 4*z*z)/4; + ang_eval_0 = sqrt_35*x0*(x1 + x3); + ang_eval_1 = sqrt_70*x6*(x3 + x7); + ang_eval_2 = -sqrt_5*x0*(x1 + x2 - 6.0*x8); + ang_eval_3 = sqrt_10*x10*x6; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = radial_eval*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; - ang_eval_1 = sqrt_10*radial_eval*x*z*(-3*x*x - 3*y*y + 4*z*z)/4; - ang_eval_2 = sqrt_5*radial_eval*(-x*x*x*x + 6*x*x*z*z + y*y*y*y - 6*y*y*z*z)/4; - ang_eval_3 = sqrt_70*radial_eval*x*z*(x*x - 3*y*y)/4; + ang_eval_0 = x11*(3.0*x12 + 3.0*x13 + x14 - 24.0*x15 - 24.0*x16 + 8.0*(z*z*z*z)); + ang_eval_1 = sqrt_10*x10*x17; + ang_eval_2 = -sqrt_5*x4*(x12 - x13 - 6.0*x15 + 6.0*x16); + ang_eval_3 = 
sqrt_70*x17*(x1 - x9); basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; basis_eval[ipt + 6*npts] = ang_eval_2; basis_eval[ipt + 7*npts] = ang_eval_3; - ang_eval_0 = sqrt_35*radial_eval*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + ang_eval_0 = sqrt_35*x11*(x12 + x13 - x14); basis_eval[ipt + 8*npts] = ang_eval_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_gradient.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_gradient.hpp index 096c3db5..c826b10f 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_gradient.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_gradient.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_spherical_gradient_4( +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_spherical_gradient_4( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -67,7 +71,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - // Loop over points in task // Assign each point to separate thread within the warp #pragma unroll 1 @@ -99,53 +102,113 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; - + // Common Subexpressions + const auto x0 = 0.5*y; + const auto x1 = sqrt_35*x0; + const auto x2 = radial_eval*x; + const auto x3 = x*x; + const auto x4 = y*y; + const auto x5 = -x4; + const auto x6 = x3 + x5; + const auto x7 = 0.25*z; + const auto x8 = sqrt_70*x7; + const auto x9 = radial_eval*y; + const auto x10 = 3.0*x3; + const auto x11 = x10 + x5; + const auto x12 = sqrt_5*x0; + const auto x13 = z*z; + const auto x14 = -6.0*x13; + const auto x15 = x14 + x4; + const auto x16 = -x15 - x3; + const auto x17 = sqrt_10*x7; + const auto x18 = -4.0*x13; + const auto x19 = 3.0*x4; + const auto x20 = x18 + x19; + const auto x21 = -x10 - x20; + const auto x22 = 0.125*radial_eval; + const auto x23 = x*x*x*x; + const auto x24 = y*y*y*y; + const auto x25 = 6.0*x3*x4; + const auto x26 = x13*x3; + const auto x27 = x13*x4; + const auto x28 = 3.0*x23 + 3.0*x24 + x25 - 24.0*x26 - 24.0*x27 + 8.0*(z*z*z*z); + const auto x29 = 0.25*sqrt_5; + const auto x30 = -x23 + x24 + 6.0*x26 - 6.0*x27; + const auto x31 = -x19; + const auto x32 = x3 + x31; + const auto x33 = x23 + x24 - x25; + const auto x34 = radial_eval*x11; 
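// [Editorial annotation -- not part of the generated patch; illustrative only.]
// The launch-bounds reduction from 512 to 128 threads per block is consistent with the
// shared-memory shrink above: 128 threads / 32 threads-per-warp = 4 warps, and each warp
// addresses its own row of alpha[4][...] / coeff[4][...] via threadIdx.x/32. A minimal
// sketch of that indexing arithmetic, assuming the warp size constant used elsewhere here:
//
//   constexpr int block_size      = 128;              // matches __launch_bounds__(128,2)
//   constexpr int nwarp_per_block = block_size / 32;  // = 4, hence alpha[4], coeff[4]
//   // inside the kernel: double* my_alpha = alpha[threadIdx.x / 32];  // one row per warp
//
// The x0, x1, ... temporaries introduced below appear to be generator-produced common
// subexpressions of the same l = 4 real solid-harmonic polynomials that the removed lines
// spelled out explicitly; only the algebraic form changes, not the quantities computed.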
+ const auto x35 = x*y; + const auto x36 = x35*x8; + const auto x37 = 6.0*radial_eval; + const auto x38 = -x37; + const auto x39 = x17*x35*(radial_eval_alpha*x21 + x38); + const auto x40 = 12.0*radial_eval; + const auto x41 = x*x*x; + const auto x42 = radial_eval_alpha*x; + const auto x43 = 4.0*radial_eval; + const auto x44 = 3.0*x; + const auto x45 = radial_eval*(x10 + x31); + const auto x46 = 0.125*sqrt_35; + const auto x47 = 0.5*x; + const auto x48 = radial_eval*x32; + const auto x49 = y*y*y; + const auto x50 = radial_eval_alpha*y; + const auto x51 = 3.0*y; + const auto x52 = 0.25*y; + const auto x53 = -radial_eval*(x10 - 12.0*x13 + x19) + radial_eval_alpha*x13*x21; + const auto x54 = 3.0*z; + const auto x55 = radial_eval_alpha*z; + const auto x56 = 0.25*x; + // Evaluate basis function - basis_eval[ipt + 0*npts] = sqrt_35*radial_eval*x*y*(x*x - y*y)/2; - basis_eval[ipt + 1*npts] = sqrt_70*radial_eval*y*z*(3*x*x - y*y)/4; - basis_eval[ipt + 2*npts] = sqrt_5*radial_eval*x*y*(-x*x - y*y + 6*z*z)/2; - basis_eval[ipt + 3*npts] = sqrt_10*radial_eval*y*z*(-3*x*x - 3*y*y + 4*z*z)/4; - basis_eval[ipt + 4*npts] = radial_eval*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; - basis_eval[ipt + 5*npts] = sqrt_10*radial_eval*x*z*(-3*x*x - 3*y*y + 4*z*z)/4; - basis_eval[ipt + 6*npts] = sqrt_5*radial_eval*(-x*x*x*x + 6*x*x*z*z + y*y*y*y - 6*y*y*z*z)/4; - basis_eval[ipt + 7*npts] = sqrt_70*radial_eval*x*z*(x*x - 3*y*y)/4; - basis_eval[ipt + 8*npts] = sqrt_35*radial_eval*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + basis_eval[ipt + 0*npts] = x1*x2*x6; + basis_eval[ipt + 1*npts] = x11*x8*x9; + basis_eval[ipt + 2*npts] = x12*x16*x2; + basis_eval[ipt + 3*npts] = x17*x21*x9; + basis_eval[ipt + 4*npts] = x22*x28; + basis_eval[ipt + 5*npts] = x17*x2*x21; + basis_eval[ipt + 6*npts] = radial_eval*x29*x30; + basis_eval[ipt + 7*npts] = x2*x32*x8; + basis_eval[ipt + 8*npts] = sqrt_35*x22*x33; // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = sqrt_35*y*(radial_eval*(3*x*x - y*y) + radial_eval_alpha*x*x*(x*x - y*y))/2; - basis_x_eval[ipt + 1*npts] = sqrt_70*x*y*z*(6*radial_eval + radial_eval_alpha*(3*x*x - y*y))/4; - basis_x_eval[ipt + 2*npts] = sqrt_5*y*(-radial_eval*(3*x*x + y*y - 6*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 6*z*z))/2; - basis_x_eval[ipt + 3*npts] = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; - basis_x_eval[ipt + 4*npts] = x*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; - basis_x_eval[ipt + 5*npts] = sqrt_10*z*(-radial_eval*(9*x*x + 3*y*y - 4*z*z) - radial_eval_alpha*x*x*(3*x*x + 3*y*y - 4*z*z))/4; - basis_x_eval[ipt + 6*npts] = sqrt_5*x*(-4*radial_eval*(x*x - 3*z*z) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - basis_x_eval[ipt + 7*npts] = sqrt_70*z*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y))/4; - basis_x_eval[ipt + 8*npts] = sqrt_35*x*(4*radial_eval*(x*x - 3*y*y) + radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + basis_x_eval[ipt + 0*npts] = x1*(radial_eval_alpha*x3*x6 + x34); + basis_x_eval[ipt + 1*npts] = x36*(radial_eval_alpha*x11 + x37); + basis_x_eval[ipt + 2*npts] = -x12*(radial_eval*(x10 + x15) - radial_eval_alpha*x16*x3); + basis_x_eval[ipt + 3*npts] = x39; + basis_x_eval[ipt + 4*npts] = 0.125*x28*x42 + 0.125*x40*(-4.0*x*x13 + x*x4 + x41); + basis_x_eval[ipt + 5*npts] = -x17*(radial_eval*(x20 + 9.0*x3) - radial_eval_alpha*x21*x3); + basis_x_eval[ipt + 6*npts] = x29*(x30*x42 + 
x43*(x13*x44 - x41)); + basis_x_eval[ipt + 7*npts] = x8*(radial_eval_alpha*x3*x32 + x45); + basis_x_eval[ipt + 8*npts] = x46*(x33*x42 - x43*(x4*x44 - x41)); // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = sqrt_35*x*(-radial_eval*(-x*x + 3*y*y) + radial_eval_alpha*y*y*(x*x - y*y))/2; - basis_y_eval[ipt + 1*npts] = sqrt_70*z*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y))/4; - basis_y_eval[ipt + 2*npts] = sqrt_5*x*(-radial_eval*(x*x + 3*y*y - 6*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 6*z*z))/2; - basis_y_eval[ipt + 3*npts] = sqrt_10*z*(-radial_eval*(3*x*x + 9*y*y - 4*z*z) - radial_eval_alpha*y*y*(3*x*x + 3*y*y - 4*z*z))/4; - basis_y_eval[ipt + 4*npts] = y*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; - basis_y_eval[ipt + 5*npts] = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; - basis_y_eval[ipt + 6*npts] = sqrt_5*y*(4*radial_eval*(y*y - 3*z*z) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - basis_y_eval[ipt + 7*npts] = sqrt_70*x*y*z*(-6*radial_eval + radial_eval_alpha*(x*x - 3*y*y))/4; - basis_y_eval[ipt + 8*npts] = sqrt_35*y*(-4*radial_eval*(3*x*x - y*y) + radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + basis_y_eval[ipt + 0*npts] = sqrt_35*x47*(radial_eval_alpha*x4*x6 + x48); + basis_y_eval[ipt + 1*npts] = x8*(radial_eval_alpha*x11*x4 + x45); + basis_y_eval[ipt + 2*npts] = -sqrt_5*x47*(radial_eval*(x14 + x19 + x3) - radial_eval_alpha*x16*x4); + basis_y_eval[ipt + 3*npts] = -x17*(radial_eval*(x10 + x18 + 9.0*x4) - radial_eval_alpha*x21*x4); + basis_y_eval[ipt + 4*npts] = 0.125*x28*x50 + 0.125*x40*(-4.0*x13*y + x3*y + x49); + basis_y_eval[ipt + 5*npts] = x39; + basis_y_eval[ipt + 6*npts] = x29*(x30*x50 - x43*(x13*x51 - x49)); + basis_y_eval[ipt + 7*npts] = x36*(radial_eval_alpha*x32 + x38); + basis_y_eval[ipt + 8*npts] = x46*(x33*x50 - x43*(x3*x51 - x49)); // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = sqrt_35*radial_eval_alpha*x*y*z*(x*x - y*y)/2; - basis_z_eval[ipt + 1*npts] = sqrt_70*y*(radial_eval + radial_eval_alpha*z*z)*(3*x*x - y*y)/4; - basis_z_eval[ipt + 2*npts] = sqrt_5*x*y*z*(12*radial_eval - radial_eval_alpha*(x*x + y*y - 6*z*z))/2; - basis_z_eval[ipt + 3*npts] = sqrt_10*y*(3*radial_eval*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; - basis_z_eval[ipt + 4*npts] = z*(-16*radial_eval*(3*x*x + 3*y*y - 2*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; - basis_z_eval[ipt + 5*npts] = sqrt_10*x*(3*radial_eval*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; - basis_z_eval[ipt + 6*npts] = sqrt_5*z*(12*radial_eval*(x*x - y*y) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - basis_z_eval[ipt + 7*npts] = sqrt_70*x*(radial_eval + radial_eval_alpha*z*z)*(x*x - 3*y*y)/4; - basis_z_eval[ipt + 8*npts] = sqrt_35*radial_eval_alpha*z*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + basis_z_eval[ipt + 0*npts] = x1*x42*x6*z; + basis_z_eval[ipt + 1*npts] = sqrt_70*x52*(radial_eval_alpha*x11*x13 + x34); + basis_z_eval[ipt + 2*npts] = x*x12*z*(radial_eval_alpha*x16 + x40); + basis_z_eval[ipt + 3*npts] = sqrt_10*x52*x53; + basis_z_eval[ipt + 4*npts] = -2.0*radial_eval*(x3*x54 + x4*x54 - 2.0*z*z*z) + 0.125*x28*x55; + basis_z_eval[ipt + 5*npts] = sqrt_10*x53*x56; + basis_z_eval[ipt + 6*npts] = x29*z*(radial_eval_alpha*x30 + x40*x6); + basis_z_eval[ipt + 7*npts] = 
sqrt_70*x56*(radial_eval_alpha*x13*x32 + x48); + basis_z_eval[ipt + 8*npts] = x33*x46*x55; + + @@ -162,25 +225,25 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = sqrt_35*radial_eval*x*y*(x*x - y*y)/2; - ang_eval_1 = sqrt_70*radial_eval*y*z*(3*x*x - y*y)/4; - ang_eval_2 = sqrt_5*radial_eval*x*y*(-x*x - y*y + 6*z*z)/2; - ang_eval_3 = sqrt_10*radial_eval*y*z*(-3*x*x - 3*y*y + 4*z*z)/4; + ang_eval_0 = x1*x2*x6; + ang_eval_1 = x11*x8*x9; + ang_eval_2 = x12*x16*x2; + ang_eval_3 = x17*x21*x9; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = radial_eval*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; - ang_eval_1 = sqrt_10*radial_eval*x*z*(-3*x*x - 3*y*y + 4*z*z)/4; - ang_eval_2 = sqrt_5*radial_eval*(-x*x*x*x + 6*x*x*z*z + y*y*y*y - 6*y*y*z*z)/4; - ang_eval_3 = sqrt_70*radial_eval*x*z*(x*x - 3*y*y)/4; + ang_eval_0 = x22*x28; + ang_eval_1 = x17*x2*x21; + ang_eval_2 = radial_eval*x29*x30; + ang_eval_3 = x2*x32*x8; basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; basis_eval[ipt + 6*npts] = ang_eval_2; basis_eval[ipt + 7*npts] = ang_eval_3; - ang_eval_0 = sqrt_35*radial_eval*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + ang_eval_0 = sqrt_35*x22*x33; basis_eval[ipt + 8*npts] = ang_eval_0; @@ -189,18 +252,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; - dang_eval_x_0 = sqrt_35*y*(radial_eval*(3*x*x - y*y) + radial_eval_alpha*x*x*(x*x - y*y))/2; - dang_eval_y_0 = sqrt_35*x*(-radial_eval*(-x*x + 3*y*y) + radial_eval_alpha*y*y*(x*x - y*y))/2; - dang_eval_z_0 = sqrt_35*radial_eval_alpha*x*y*z*(x*x - y*y)/2; - dang_eval_x_1 = sqrt_70*x*y*z*(6*radial_eval + radial_eval_alpha*(3*x*x - y*y))/4; - dang_eval_y_1 = sqrt_70*z*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y))/4; - dang_eval_z_1 = sqrt_70*y*(radial_eval + radial_eval_alpha*z*z)*(3*x*x - y*y)/4; - dang_eval_x_2 = sqrt_5*y*(-radial_eval*(3*x*x + y*y - 6*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 6*z*z))/2; - dang_eval_y_2 = sqrt_5*x*(-radial_eval*(x*x + 3*y*y - 6*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 6*z*z))/2; - dang_eval_z_2 = sqrt_5*x*y*z*(12*radial_eval - radial_eval_alpha*(x*x + y*y - 6*z*z))/2; - dang_eval_x_3 = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; - dang_eval_y_3 = sqrt_10*z*(-radial_eval*(3*x*x + 9*y*y - 4*z*z) - radial_eval_alpha*y*y*(3*x*x + 3*y*y - 4*z*z))/4; - dang_eval_z_3 = sqrt_10*y*(3*radial_eval*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; + dang_eval_x_0 = x1*(radial_eval_alpha*x3*x6 + x34); + dang_eval_y_0 = sqrt_35*x47*(radial_eval_alpha*x4*x6 + x48); + dang_eval_z_0 = x1*x42*x6*z; + dang_eval_x_1 = x36*(radial_eval_alpha*x11 + x37); + dang_eval_y_1 = x8*(radial_eval_alpha*x11*x4 + x45); + dang_eval_z_1 = sqrt_70*x52*(radial_eval_alpha*x11*x13 + x34); + dang_eval_x_2 = -x12*(radial_eval*(x10 + x15) - radial_eval_alpha*x16*x3); + dang_eval_y_2 = -sqrt_5*x47*(radial_eval*(x14 + x19 + x3) - radial_eval_alpha*x16*x4); + dang_eval_z_2 = x*x12*z*(radial_eval_alpha*x16 + x40); + dang_eval_x_3 = x39; + dang_eval_y_3 = -x17*(radial_eval*(x10 + x18 + 9.0*x4) - radial_eval_alpha*x21*x4); + dang_eval_z_3 = sqrt_10*x52*x53; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; 
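// [Editorial annotation -- not part of the generated patch; illustrative only.]
// Both the removed and the rewritten gradient expressions implement the same product rule
// for a contracted Gaussian basis function phi = R(r^2) * P(x,y,z), where
// R = sum_i c_i exp(-a_i r^2) is 'radial_eval' and the angular polynomial P carries the
// sqrt_5 / sqrt_10 / sqrt_35 / sqrt_70 normalization factors:
//
//   d(phi)/dx = radial_eval * dP/dx + radial_eval_alpha * x * P
//
// with radial_eval_alpha = -2 * sum_i a_i c_i exp(-a_i r^2); the -2 chain-rule factor is
// applied once after the primitive loop ("radial_eval_alpha *= -2"). For example, with
// P = sqrt_70*y*z*(3*x*x - y*y)/4 this gives
// sqrt_70*x*y*z*(6*radial_eval + radial_eval_alpha*(3*x*x - y*y))/4, which is exactly the
// removed basis_x_eval[ipt + 1*npts] expression; the new code factors the same result
// through the x* temporaries.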
basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; @@ -214,18 +277,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = x*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; - dang_eval_y_0 = y*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; - dang_eval_z_0 = z*(-16*radial_eval*(3*x*x + 3*y*y - 2*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; - dang_eval_x_1 = sqrt_10*z*(-radial_eval*(9*x*x + 3*y*y - 4*z*z) - radial_eval_alpha*x*x*(3*x*x + 3*y*y - 4*z*z))/4; - dang_eval_y_1 = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; - dang_eval_z_1 = sqrt_10*x*(3*radial_eval*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; - dang_eval_x_2 = sqrt_5*x*(-4*radial_eval*(x*x - 3*z*z) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - dang_eval_y_2 = sqrt_5*y*(4*radial_eval*(y*y - 3*z*z) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - dang_eval_z_2 = sqrt_5*z*(12*radial_eval*(x*x - y*y) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - dang_eval_x_3 = sqrt_70*z*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y))/4; - dang_eval_y_3 = sqrt_70*x*y*z*(-6*radial_eval + radial_eval_alpha*(x*x - 3*y*y))/4; - dang_eval_z_3 = sqrt_70*x*(radial_eval + radial_eval_alpha*z*z)*(x*x - 3*y*y)/4; + dang_eval_x_0 = 0.125*x28*x42 + 0.125*x40*(-4.0*x*x13 + x*x4 + x41); + dang_eval_y_0 = 0.125*x28*x50 + 0.125*x40*(-4.0*x13*y + x3*y + x49); + dang_eval_z_0 = -2.0*radial_eval*(x3*x54 + x4*x54 - 2.0*z*z*z) + 0.125*x28*x55; + dang_eval_x_1 = -x17*(radial_eval*(x20 + 9.0*x3) - radial_eval_alpha*x21*x3); + dang_eval_y_1 = x39; + dang_eval_z_1 = sqrt_10*x53*x56; + dang_eval_x_2 = x29*(x30*x42 + x43*(x13*x44 - x41)); + dang_eval_y_2 = x29*(x30*x50 - x43*(x13*x51 - x49)); + dang_eval_z_2 = x29*z*(radial_eval_alpha*x30 + x40*x6); + dang_eval_x_3 = x8*(radial_eval_alpha*x3*x32 + x45); + dang_eval_y_3 = x36*(radial_eval_alpha*x32 + x38); + dang_eval_z_3 = sqrt_70*x56*(radial_eval_alpha*x13*x32 + x48); basis_x_eval[ipt + 4*npts] = dang_eval_x_0; basis_y_eval[ipt + 4*npts] = dang_eval_y_0; basis_z_eval[ipt + 4*npts] = dang_eval_z_0; @@ -239,9 +302,9 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 7*npts] = dang_eval_y_3; basis_z_eval[ipt + 7*npts] = dang_eval_z_3; - dang_eval_x_0 = sqrt_35*x*(4*radial_eval*(x*x - 3*y*y) + radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; - dang_eval_y_0 = sqrt_35*y*(-4*radial_eval*(3*x*x - y*y) + radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; - dang_eval_z_0 = sqrt_35*radial_eval_alpha*z*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + dang_eval_x_0 = x46*(x33*x42 - x43*(x4*x44 - x41)); + dang_eval_y_0 = x46*(x33*x50 - x43*(x3*x51 - x49)); + dang_eval_z_0 = x33*x46*x55; basis_x_eval[ipt + 8*npts] = dang_eval_x_0; basis_y_eval[ipt + 8*npts] = dang_eval_y_0; basis_z_eval[ipt + 8*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_hessian.hpp 
b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_hessian.hpp index bb3845ed..38db396f 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_hessian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_hessian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_spherical_hessian_4( +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_spherical_hessian_4( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; @@ -108,119 +111,298 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = 0.5*sqrt_35; + const auto x1 = x0*y; + const auto x2 = x*x1; + const auto x3 = x*x; + const auto x4 = x3; + const auto x5 = y*y; + const auto x6 = x5; + const auto x7 = -x6; + const auto x8 = x4 + x7; + const auto x9 = 0.25*sqrt_70; + const auto x10 = x9*z; + const auto x11 = x10*y; + const auto x12 = 3.0*x4; + const auto x13 = x12 + x7; + const auto x14 = 0.5*sqrt_5; + const auto x15 = x14*y; + const auto x16 = x*x15; + const auto x17 = z*z; + const auto x18 = x17; + const auto x19 = -6.0*x18; + const auto x20 = x19 + x6; + const auto x21 = -x20 - x4; + const auto x22 = 0.25*sqrt_10; + const auto x23 = x22*z; + const auto x24 = x23*y; + const auto x25 = -4.0*x18; + const auto x26 = 3.0*x6; + const auto x27 = x25 + x26; + const auto x28 = -x12 - x27; + const auto x29 = 0.125*radial_eval; + const auto x30 = x*x*x*x; + const auto x31 = y*y*y*y; + const auto x32 = 6.0*x4*x6; + const auto x33 = x18*x4; + const auto x34 = x18*x6; + const auto x35 = 3.0*x30 + 3.0*x31 + x32 - 24.0*x33 - 24.0*x34 + 8.0*(z*z*z*z); + const auto x36 = x*x23; + const auto x37 = 0.25*sqrt_5; + const auto x38 = -x30 + x31 + 6.0*x33 - 6.0*x34; + const auto x39 = x*x10; + const auto x40 = -x26; + const auto x41 = x4 + x40; + const auto x42 = x30 + x31 - x32; + const auto x43 = radial_eval*x13; + const auto x44 = x4*x8; + const auto x45 = x*x11; + const auto x46 = 6.0*radial_eval; + const auto x47 = 
radial_eval_alpha*x13; + const auto x48 = x46 + x47; + const auto x49 = -x12 - x20; + const auto x50 = x21*x4; + const auto x51 = -x46; + const auto x52 = x*x24*(radial_eval_alpha*x28 + x51); + const auto x53 = 12.0*radial_eval; + const auto x54 = x*x*x; + const auto x55 = 4.0*x; + const auto x56 = x*x6 - x18*x55 + x54; + const auto x57 = radial_eval_alpha*x; + const auto x58 = 9.0*x4; + const auto x59 = -x27 - x58; + const auto x60 = x28*x4; + const auto x61 = 4.0*radial_eval; + const auto x62 = 3.0*x; + const auto x63 = x18*x62 - x54; + const auto x64 = x12 + x40; + const auto x65 = radial_eval*x64; + const auto x66 = x4*x41; + const auto x67 = radial_eval_alpha*x66 + x65; + const auto x68 = 0.125*sqrt_35; + const auto x69 = x54 - x6*x62; + const auto x70 = x*x0; + const auto x71 = radial_eval*x41; + const auto x72 = x6*x8; + const auto x73 = x13*x6; + const auto x74 = radial_eval_alpha*x73 + x65; + const auto x75 = x*x14; + const auto x76 = x19 + x26; + const auto x77 = -x4 - x76; + const auto x78 = x21*x6; + const auto x79 = 9.0*x6; + const auto x80 = x12 + x25; + const auto x81 = -x79 - x80; + const auto x82 = x28*x6; + const auto x83 = y*y*y; + const auto x84 = 4.0*y; + const auto x85 = -x18*x84 + x4*y + x83; + const auto x86 = radial_eval_alpha*y; + const auto x87 = 3.0*y; + const auto x88 = -x18*x87 + x83; + const auto x89 = radial_eval_alpha*x41; + const auto x90 = x51 + x89; + const auto x91 = -x4*x87 + x83; + const auto x92 = x1*z; + const auto x93 = x9*y; + const auto x94 = x13*x18; + const auto x95 = x22*y; + const auto x96 = -12.0*x18; + const auto x97 = x26 + x96; + const auto x98 = -x12 - x97; + const auto x99 = x18*x28; + const auto x100 = radial_eval*x98 + radial_eval_alpha*x99; + const auto x101 = 3.0*z; + const auto x102 = -x101*x4 - x101*x6 + 2.0*(z*z*z); + const auto x103 = radial_eval_alpha*z; + const auto x104 = x37*z; + const auto x105 = x53*x8; + const auto x106 = x18*x41; + const auto x107 = 2.0*radial_eval_alpha; + const auto x108 = x107*x13; + const auto x109 = radial_eval_alpha + radial_eval_alpha_squared*x4; + const auto x110 = x108 + x109*x8; + const auto x111 = x109*x13; + const auto x112 = 12.0*radial_eval_alpha; + const auto x113 = x112*x4; + const auto x114 = x113 + x46; + const auto x115 = x107*x49 + x109*x21; + const auto x116 = x109*x35 + x53*(x6 + x80) + 24.0*x56*x57; + const auto x117 = -18.0*radial_eval; + const auto x118 = x109*x28; + const auto x119 = x107*x59 + x118; + const auto x120 = -x4; + const auto x121 = 8.0*x57; + const auto x122 = x109*x38 + x121*x63 + x53*(x120 + x18); + const auto x123 = x107*x64; + const auto x124 = x109*x41 + x123; + const auto x125 = x105 + x109*x42 + x121*x69; + const auto x126 = radial_eval_alpha*x3; + const auto x127 = radial_eval_alpha*x5; + const auto x128 = 6.0*radial_eval_alpha; + const auto x129 = x128*x6; + const auto x130 = radial_eval_alpha*x64; + const auto x131 = 24.0*radial_eval; + const auto x132 = x*x131; + const auto x133 = x132*y; + const auto x134 = 12.0*x57; + const auto x135 = 12.0*x86; + const auto x136 = radial_eval_alpha_squared*x; + const auto x137 = x136*y; + const auto x138 = -x128*x4 + x51; + const auto x139 = radial_eval_alpha*x55; + const auto x140 = radial_eval_alpha*x84; + const auto x141 = x*x93; + const auto x142 = x128*x18; + const auto x143 = -x142; + const auto x144 = x*x95*(radial_eval_alpha*x98 + radial_eval_alpha_squared*x99 + x143 + x51); + const auto x145 = 96.0*radial_eval*z; + const auto x146 = 12.0*x103; + const auto x147 = radial_eval_alpha*x17; + const auto x148 = 
4.0*radial_eval_alpha; + const auto x149 = x147*x64; + const auto x150 = x68*z; + const auto x151 = x107*x41; + const auto x152 = radial_eval_alpha + radial_eval_alpha_squared*x6; + const auto x153 = x151 + x152*x8; + const auto x154 = x123 + x13*x152; + const auto x155 = x107*x77 + x152*x21; + const auto x156 = x152*x28; + const auto x157 = x107*x81 + x156; + const auto x158 = x152*x35 + x53*(x27 + x4) + 24.0*x85*x86; + const auto x159 = x112*x6; + const auto x160 = x159 + x46; + const auto x161 = 8.0*x86; + const auto x162 = x152*x38 + x161*x88 - x53*(x18 - x6); + const auto x163 = x152*x42 + x161*x91 + x53*(x120 + x6); + const auto x164 = radial_eval_alpha_squared*y; + const auto x165 = radial_eval_alpha + radial_eval_alpha_squared*x18; + const auto x166 = x165*x8; + const auto x167 = x108 + x13*x165; + const auto x168 = 24.0*radial_eval_alpha*x18 + x165*x21; + const auto x169 = x107*x98 + x165*x28; + const auto x170 = x131 + x169; + const auto x171 = -48.0*radial_eval*(-2.0*x18 + x4 + x6) + 32.0*x102*x103 + x165*x35; + const auto x172 = x105 + 24.0*x147*x8 + x165*x38; + const auto x173 = x151 + x165*x41; + const auto x174 = x165*x42; + const auto x175 = -x159; + // Evaluate basis function - basis_eval[ipt + 0*npts] = sqrt_35*radial_eval*x*y*(x*x - y*y)/2; - basis_eval[ipt + 1*npts] = sqrt_70*radial_eval*y*z*(3*x*x - y*y)/4; - basis_eval[ipt + 2*npts] = sqrt_5*radial_eval*x*y*(-x*x - y*y + 6*z*z)/2; - basis_eval[ipt + 3*npts] = sqrt_10*radial_eval*y*z*(-3*x*x - 3*y*y + 4*z*z)/4; - basis_eval[ipt + 4*npts] = radial_eval*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; - basis_eval[ipt + 5*npts] = sqrt_10*radial_eval*x*z*(-3*x*x - 3*y*y + 4*z*z)/4; - basis_eval[ipt + 6*npts] = sqrt_5*radial_eval*(-x*x*x*x + 6*x*x*z*z + y*y*y*y - 6*y*y*z*z)/4; - basis_eval[ipt + 7*npts] = sqrt_70*radial_eval*x*z*(x*x - 3*y*y)/4; - basis_eval[ipt + 8*npts] = sqrt_35*radial_eval*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + basis_eval[ipt + 0*npts] = radial_eval*x2*x8; + basis_eval[ipt + 1*npts] = radial_eval*x11*x13; + basis_eval[ipt + 2*npts] = radial_eval*x16*x21; + basis_eval[ipt + 3*npts] = radial_eval*x24*x28; + basis_eval[ipt + 4*npts] = x29*x35; + basis_eval[ipt + 5*npts] = radial_eval*x28*x36; + basis_eval[ipt + 6*npts] = radial_eval*x37*x38; + basis_eval[ipt + 7*npts] = radial_eval*x39*x41; + basis_eval[ipt + 8*npts] = sqrt_35*x29*x42; // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = sqrt_35*y*(radial_eval*(3*x*x - y*y) + radial_eval_alpha*x*x*(x*x - y*y))/2; - basis_x_eval[ipt + 1*npts] = sqrt_70*x*y*z*(6*radial_eval + radial_eval_alpha*(3*x*x - y*y))/4; - basis_x_eval[ipt + 2*npts] = sqrt_5*y*(-radial_eval*(3*x*x + y*y - 6*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 6*z*z))/2; - basis_x_eval[ipt + 3*npts] = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; - basis_x_eval[ipt + 4*npts] = x*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; - basis_x_eval[ipt + 5*npts] = sqrt_10*z*(-radial_eval*(9*x*x + 3*y*y - 4*z*z) - radial_eval_alpha*x*x*(3*x*x + 3*y*y - 4*z*z))/4; - basis_x_eval[ipt + 6*npts] = sqrt_5*x*(-4*radial_eval*(x*x - 3*z*z) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - basis_x_eval[ipt + 7*npts] = sqrt_70*z*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y))/4; - basis_x_eval[ipt + 8*npts] = sqrt_35*x*(4*radial_eval*(x*x - 3*y*y) + radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + 
basis_x_eval[ipt + 0*npts] = x1*(radial_eval_alpha*x44 + x43); + basis_x_eval[ipt + 1*npts] = x45*x48; + basis_x_eval[ipt + 2*npts] = x15*(radial_eval*x49 + radial_eval_alpha*x50); + basis_x_eval[ipt + 3*npts] = x52; + basis_x_eval[ipt + 4*npts] = 0.125*x35*x57 + 0.125*x53*x56; + basis_x_eval[ipt + 5*npts] = x23*(radial_eval*x59 + radial_eval_alpha*x60); + basis_x_eval[ipt + 6*npts] = x37*(x38*x57 + x61*x63); + basis_x_eval[ipt + 7*npts] = x10*x67; + basis_x_eval[ipt + 8*npts] = x68*(x42*x57 + x61*x69); // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = sqrt_35*x*(-radial_eval*(-x*x + 3*y*y) + radial_eval_alpha*y*y*(x*x - y*y))/2; - basis_y_eval[ipt + 1*npts] = sqrt_70*z*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y))/4; - basis_y_eval[ipt + 2*npts] = sqrt_5*x*(-radial_eval*(x*x + 3*y*y - 6*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 6*z*z))/2; - basis_y_eval[ipt + 3*npts] = sqrt_10*z*(-radial_eval*(3*x*x + 9*y*y - 4*z*z) - radial_eval_alpha*y*y*(3*x*x + 3*y*y - 4*z*z))/4; - basis_y_eval[ipt + 4*npts] = y*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; - basis_y_eval[ipt + 5*npts] = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; - basis_y_eval[ipt + 6*npts] = sqrt_5*y*(4*radial_eval*(y*y - 3*z*z) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - basis_y_eval[ipt + 7*npts] = sqrt_70*x*y*z*(-6*radial_eval + radial_eval_alpha*(x*x - 3*y*y))/4; - basis_y_eval[ipt + 8*npts] = sqrt_35*y*(-4*radial_eval*(3*x*x - y*y) + radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + basis_y_eval[ipt + 0*npts] = x70*(radial_eval_alpha*x72 + x71); + basis_y_eval[ipt + 1*npts] = x10*x74; + basis_y_eval[ipt + 2*npts] = x75*(radial_eval*x77 + radial_eval_alpha*x78); + basis_y_eval[ipt + 3*npts] = x23*(radial_eval*x81 + radial_eval_alpha*x82); + basis_y_eval[ipt + 4*npts] = 0.125*x35*x86 + 0.125*x53*x85; + basis_y_eval[ipt + 5*npts] = x52; + basis_y_eval[ipt + 6*npts] = x37*(x38*x86 + x61*x88); + basis_y_eval[ipt + 7*npts] = x45*x90; + basis_y_eval[ipt + 8*npts] = x68*(x42*x86 + x61*x91); // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = sqrt_35*radial_eval_alpha*x*y*z*(x*x - y*y)/2; - basis_z_eval[ipt + 1*npts] = sqrt_70*y*(radial_eval + radial_eval_alpha*z*z)*(3*x*x - y*y)/4; - basis_z_eval[ipt + 2*npts] = sqrt_5*x*y*z*(12*radial_eval - radial_eval_alpha*(x*x + y*y - 6*z*z))/2; - basis_z_eval[ipt + 3*npts] = sqrt_10*y*(3*radial_eval*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; - basis_z_eval[ipt + 4*npts] = z*(-16*radial_eval*(3*x*x + 3*y*y - 2*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; - basis_z_eval[ipt + 5*npts] = sqrt_10*x*(3*radial_eval*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; - basis_z_eval[ipt + 6*npts] = sqrt_5*z*(12*radial_eval*(x*x - y*y) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - basis_z_eval[ipt + 7*npts] = sqrt_70*x*(radial_eval + radial_eval_alpha*z*z)*(x*x - 3*y*y)/4; - basis_z_eval[ipt + 8*npts] = sqrt_35*radial_eval_alpha*z*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + basis_z_eval[ipt + 0*npts] = x57*x8*x92; + basis_z_eval[ipt + 1*npts] = x93*(radial_eval_alpha*x94 + x43); + basis_z_eval[ipt + 2*npts] = x16*z*(radial_eval_alpha*x21 + x53); + basis_z_eval[ipt + 3*npts] = x100*x95; + basis_z_eval[ipt + 4*npts] = 2.0*radial_eval*x102 + 0.125*x103*x35; + 
basis_z_eval[ipt + 5*npts] = x*x100*x22; + basis_z_eval[ipt + 6*npts] = x104*(radial_eval_alpha*x38 + x105); + basis_z_eval[ipt + 7*npts] = x*x9*(radial_eval_alpha*x106 + x71); + basis_z_eval[ipt + 8*npts] = x103*x42*x68; // Evaluate second derivative of bfn wrt xx - basis_xx_eval[ipt + 0*npts] = sqrt_35*x*y*(6*radial_eval + 2*radial_eval_alpha*(3*x*x - y*y) + (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x - y*y))/2; - basis_xx_eval[ipt + 1*npts] = sqrt_70*y*z*(6*radial_eval + 12*radial_eval_alpha*x*x + (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(3*x*x - y*y))/4; - basis_xx_eval[ipt + 2*npts] = sqrt_5*x*y*(-6*radial_eval - 2*radial_eval_alpha*(3*x*x + y*y - 6*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x + y*y - 6*z*z))/2; - basis_xx_eval[ipt + 3*npts] = sqrt_10*y*z*(-6*radial_eval - 12*radial_eval_alpha*x*x - (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(3*x*x + 3*y*y - 4*z*z))/4; - basis_xx_eval[ipt + 4*npts] = 3*radial_eval*(3*x*x + y*y - 4*z*z)/2 + 3*radial_eval_alpha*x*x*(x*x + y*y - 4*z*z) + (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; - basis_xx_eval[ipt + 5*npts] = sqrt_10*x*z*(-18*radial_eval - 2*radial_eval_alpha*(9*x*x + 3*y*y - 4*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(3*x*x + 3*y*y - 4*z*z))/4; - basis_xx_eval[ipt + 6*npts] = sqrt_5*(-12*radial_eval*(x*x - z*z) - 8*radial_eval_alpha*x*x*(x*x - 3*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - basis_xx_eval[ipt + 7*npts] = sqrt_70*x*z*(6*radial_eval + 6*radial_eval_alpha*(x*x - y*y) + (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x - 3*y*y))/4; - basis_xx_eval[ipt + 8*npts] = sqrt_35*(12*radial_eval*(x*x - y*y) + 8*radial_eval_alpha*x*x*(x*x - 3*y*y) + (radial_eval_alpha + radial_eval_alpha_squared*x*x)*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + basis_xx_eval[ipt + 0*npts] = x2*(x110 + x46); + basis_xx_eval[ipt + 1*npts] = x11*(x111 + x114); + basis_xx_eval[ipt + 2*npts] = x16*(x115 + x51); + basis_xx_eval[ipt + 3*npts] = x24*(x109*x28 - x114); + basis_xx_eval[ipt + 4*npts] = 0.125*x116; + basis_xx_eval[ipt + 5*npts] = x36*(x117 + x119); + basis_xx_eval[ipt + 6*npts] = x122*x37; + basis_xx_eval[ipt + 7*npts] = x39*(x124 + x46); + basis_xx_eval[ipt + 8*npts] = x125*x68; // Evaluate second derivative of bfn wrt xy - basis_xy_eval[ipt + 0*npts] = sqrt_35*(3*radial_eval*x*x - 3*radial_eval*y*y + radial_eval_alpha*x*x*x*x - radial_eval_alpha*y*y*y*y + radial_eval_alpha_squared*x*x*x*x*y*y - radial_eval_alpha_squared*x*x*y*y*y*y)/2; - basis_xy_eval[ipt + 1*npts] = sqrt_70*x*z*(6*radial_eval + 3*radial_eval_alpha*x*x + 3*radial_eval_alpha*y*y + 3*radial_eval_alpha_squared*x*x*y*y - radial_eval_alpha_squared*y*y*y*y)/4; - basis_xy_eval[ipt + 2*npts] = sqrt_5*(-3*radial_eval*(x*x + y*y - 2*z*z) - radial_eval_alpha*x*x*(x*x + 3*y*y - 6*z*z) - radial_eval_alpha*y*y*(3*x*x + y*y - 6*z*z) - radial_eval_alpha_squared*x*x*y*y*(x*x + y*y - 6*z*z))/2; - basis_xy_eval[ipt + 3*npts] = sqrt_10*x*z*(-6*radial_eval - 6*radial_eval_alpha*y*y - radial_eval_alpha*(3*x*x + 9*y*y - 4*z*z) - radial_eval_alpha_squared*y*y*(3*x*x + 3*y*y - 4*z*z))/4; - basis_xy_eval[ipt + 4*npts] = x*y*(24*radial_eval + 24*radial_eval_alpha*(x*x + y*y - 4*z*z) + radial_eval_alpha_squared*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; - basis_xy_eval[ipt + 5*npts] = sqrt_10*y*z*(-6*radial_eval - 6*radial_eval_alpha*x*x - 
radial_eval_alpha*(9*x*x + 3*y*y - 4*z*z) - radial_eval_alpha_squared*x*x*(3*x*x + 3*y*y - 4*z*z))/4; - basis_xy_eval[ipt + 6*npts] = sqrt_5*x*y*(-4*radial_eval_alpha*x*x + 4*radial_eval_alpha*y*y - radial_eval_alpha_squared*x*x*x*x + 6*radial_eval_alpha_squared*x*x*z*z + radial_eval_alpha_squared*y*y*y*y - 6*radial_eval_alpha_squared*y*y*z*z)/4; - basis_xy_eval[ipt + 7*npts] = sqrt_70*y*z*(-6*radial_eval - 3*radial_eval_alpha*x*x - 3*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*x*x - 3*radial_eval_alpha_squared*x*x*y*y)/4; - basis_xy_eval[ipt + 8*npts] = sqrt_35*x*y*(-24*radial_eval - 8*radial_eval_alpha*x*x - 8*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*x*x - 6*radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*y*y*y*y)/8; + basis_xy_eval[ipt + 0*npts] = x0*(radial_eval_alpha_squared*x4*x6*x8 + x126*x41 + x127*x13 + x65); + basis_xy_eval[ipt + 1*npts] = x39*(radial_eval_alpha_squared*x73 + x129 + x130 + x46); + basis_xy_eval[ipt + 2*npts] = x14*(-radial_eval*(x12 + x76) + radial_eval_alpha_squared*x21*x4*x6 + x126*x77 + x127*x49); + basis_xy_eval[ipt + 3*npts] = x36*(radial_eval_alpha*x81 + radial_eval_alpha_squared*x82 - x129 + x51); + basis_xy_eval[ipt + 4*npts] = 0.125*x133 + 0.125*x134*x85 + 0.125*x135*x56 + 0.125*x137*x35; + basis_xy_eval[ipt + 5*npts] = x24*(radial_eval_alpha*x59 + radial_eval_alpha_squared*x60 + x138); + basis_xy_eval[ipt + 6*npts] = x37*(x137*x38 + x139*x88 + x140*x63); + basis_xy_eval[ipt + 7*npts] = x11*(radial_eval_alpha_squared*x66 + x130 + x138); + basis_xy_eval[ipt + 8*npts] = x68*(-x133 + x137*x42 + x139*x91 + x140*x69); // Evaluate second derivative of bfn wrt xz - basis_xz_eval[ipt + 0*npts] = sqrt_35*y*z*(radial_eval_alpha*(3*x*x - y*y) + radial_eval_alpha_squared*x*x*(x*x - y*y))/2; - basis_xz_eval[ipt + 1*npts] = sqrt_70*x*y*(6*radial_eval + 6*radial_eval_alpha*z*z + radial_eval_alpha*(3*x*x - y*y) + radial_eval_alpha_squared*z*z*(3*x*x - y*y))/4; - basis_xz_eval[ipt + 2*npts] = sqrt_5*y*z*(12*radial_eval + 12*radial_eval_alpha*x*x - radial_eval_alpha*(3*x*x + y*y - 6*z*z) - radial_eval_alpha_squared*x*x*(x*x + y*y - 6*z*z))/2; - basis_xz_eval[ipt + 3*npts] = sqrt_10*x*y*(-6*radial_eval - 6*radial_eval_alpha*z*z + 3*radial_eval_alpha*(-x*x - y*y + 4*z*z) - radial_eval_alpha_squared*z*z*(3*x*x + 3*y*y - 4*z*z))/4; - basis_xz_eval[ipt + 4*npts] = x*z*(-96*radial_eval - 36*radial_eval_alpha*x*x - 36*radial_eval_alpha*y*y - 16*radial_eval_alpha*z*z + 3*radial_eval_alpha_squared*x*x*x*x + 6*radial_eval_alpha_squared*x*x*y*y - 24*radial_eval_alpha_squared*x*x*z*z + 3*radial_eval_alpha_squared*y*y*y*y - 24*radial_eval_alpha_squared*y*y*z*z + 8*radial_eval_alpha_squared*z*z*z*z)/8; - basis_xz_eval[ipt + 5*npts] = sqrt_10*(-3*radial_eval*(3*x*x + y*y - 4*z*z) + 3*radial_eval_alpha*x*x*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(9*x*x + 3*y*y - 4*z*z) - radial_eval_alpha_squared*x*x*z*z*(3*x*x + 3*y*y - 4*z*z))/4; - basis_xz_eval[ipt + 6*npts] = sqrt_5*x*z*(24*radial_eval + 12*radial_eval_alpha*(x*x - y*y) - 4*radial_eval_alpha*(x*x - 3*z*z) - radial_eval_alpha_squared*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - basis_xz_eval[ipt + 7*npts] = sqrt_70*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y) + 3*radial_eval_alpha*z*z*(x*x - y*y) + radial_eval_alpha_squared*x*x*z*z*(x*x - 3*y*y))/4; - basis_xz_eval[ipt + 8*npts] = sqrt_35*x*z*(4*radial_eval_alpha*(x*x - 3*y*y) + radial_eval_alpha_squared*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + basis_xz_eval[ipt + 0*npts] = x92*(radial_eval_alpha_squared*x44 + x47); + 
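// [Editorial annotation -- not part of the generated patch; illustrative only.]
// The Hessian terms follow the same pattern one order higher. For phi = R(r^2) * P(x,y,z)
// with R = sum_i c_i exp(-a_i r^2):
//
//   d2(phi)/dx2 = radial_eval * d2P/dx2
//               + 2 * radial_eval_alpha * x * dP/dx
//               + (radial_eval_alpha + radial_eval_alpha_squared * x*x) * P
//
// where radial_eval_alpha         = -2 * sum_i a_i   * c_i * exp(-a_i r^2)
//   and radial_eval_alpha_squared =  4 * sum_i a_i^2 * c_i * exp(-a_i r^2),
// matching the post-loop scalings ("*= -2", "*= 4") and the recurring factor
// (radial_eval_alpha + radial_eval_alpha_squared*x*x) in the removed basis_xx_eval
// expressions above. The yy/zz and the mixed xy/xz/yz derivatives follow analogously
// from the same product rule.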
basis_xz_eval[ipt + 1*npts] = x141*(radial_eval_alpha_squared*x94 + x142 + x48); + basis_xz_eval[ipt + 2*npts] = x15*z*(radial_eval_alpha*x49 + radial_eval_alpha_squared*x50 + x113 + x53); + basis_xz_eval[ipt + 3*npts] = x144; + basis_xz_eval[ipt + 4*npts] = -0.125*x*x145 + 2.0*x102*x57 + 0.125*x136*x35*z + 0.125*x146*x56; + basis_xz_eval[ipt + 5*npts] = x22*(-radial_eval*(x58 + x97) + radial_eval_alpha_squared*x18*x28*x4 + x126*x98 + x147*x59); + basis_xz_eval[ipt + 6*npts] = x104*(x132 + x134*x8 + x136*x38 + x148*x63); + basis_xz_eval[ipt + 7*npts] = x9*(radial_eval_alpha_squared*x18*x4*x41 + x149 + x67); + basis_xz_eval[ipt + 8*npts] = x150*(x136*x42 + x148*x69); // Evaluate second derivative of bfn wrt yy - basis_yy_eval[ipt + 0*npts] = sqrt_35*x*y*(-6*radial_eval - 2*radial_eval_alpha*(-x*x + 3*y*y) + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x - y*y))/2; - basis_yy_eval[ipt + 1*npts] = sqrt_70*y*z*(-6*radial_eval - 6*radial_eval_alpha*(-x*x + y*y) + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(3*x*x - y*y))/4; - basis_yy_eval[ipt + 2*npts] = sqrt_5*x*y*(-6*radial_eval - 2*radial_eval_alpha*(x*x + 3*y*y - 6*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x + y*y - 6*z*z))/2; - basis_yy_eval[ipt + 3*npts] = sqrt_10*y*z*(-18*radial_eval - 2*radial_eval_alpha*(3*x*x + 9*y*y - 4*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(3*x*x + 3*y*y - 4*z*z))/4; - basis_yy_eval[ipt + 4*npts] = 3*radial_eval*(x*x + 3*y*y - 4*z*z)/2 + 3*radial_eval_alpha*y*y*(x*x + y*y - 4*z*z) + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; - basis_yy_eval[ipt + 5*npts] = sqrt_10*x*z*(-6*radial_eval - 12*radial_eval_alpha*y*y - (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(3*x*x + 3*y*y - 4*z*z))/4; - basis_yy_eval[ipt + 6*npts] = sqrt_5*(12*radial_eval*(y*y - z*z) + 8*radial_eval_alpha*y*y*(y*y - 3*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - basis_yy_eval[ipt + 7*npts] = sqrt_70*x*z*(-6*radial_eval - 12*radial_eval_alpha*y*y + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x - 3*y*y))/4; - basis_yy_eval[ipt + 8*npts] = sqrt_35*(-12*radial_eval*(x*x - y*y) - 8*radial_eval_alpha*y*y*(3*x*x - y*y) + (radial_eval_alpha + radial_eval_alpha_squared*y*y)*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + basis_yy_eval[ipt + 0*npts] = x2*(x153 + x51); + basis_yy_eval[ipt + 1*npts] = x11*(x154 + x51); + basis_yy_eval[ipt + 2*npts] = x16*(x155 + x51); + basis_yy_eval[ipt + 3*npts] = x24*(x117 + x157); + basis_yy_eval[ipt + 4*npts] = 0.125*x158; + basis_yy_eval[ipt + 5*npts] = x36*(x152*x28 - x160); + basis_yy_eval[ipt + 6*npts] = x162*x37; + basis_yy_eval[ipt + 7*npts] = x39*(x152*x41 - x160); + basis_yy_eval[ipt + 8*npts] = x163*x68; // Evaluate second derivative of bfn wrt yz - basis_yz_eval[ipt + 0*npts] = sqrt_35*x*z*(-radial_eval_alpha*(-x*x + 3*y*y) + radial_eval_alpha_squared*y*y*(x*x - y*y))/2; - basis_yz_eval[ipt + 1*npts] = sqrt_70*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y) - 3*radial_eval_alpha*z*z*(-x*x + y*y) + radial_eval_alpha_squared*y*y*z*z*(3*x*x - y*y))/4; - basis_yz_eval[ipt + 2*npts] = sqrt_5*x*z*(12*radial_eval + 12*radial_eval_alpha*y*y - radial_eval_alpha*(x*x + 3*y*y - 6*z*z) - radial_eval_alpha_squared*y*y*(x*x + y*y - 6*z*z))/2; - basis_yz_eval[ipt + 3*npts] = sqrt_10*(-3*radial_eval*(x*x + 3*y*y - 4*z*z) + 3*radial_eval_alpha*y*y*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x 
+ 9*y*y - 4*z*z) - radial_eval_alpha_squared*y*y*z*z*(3*x*x + 3*y*y - 4*z*z))/4; - basis_yz_eval[ipt + 4*npts] = y*z*(-96*radial_eval - 36*radial_eval_alpha*x*x - 36*radial_eval_alpha*y*y - 16*radial_eval_alpha*z*z + 3*radial_eval_alpha_squared*x*x*x*x + 6*radial_eval_alpha_squared*x*x*y*y - 24*radial_eval_alpha_squared*x*x*z*z + 3*radial_eval_alpha_squared*y*y*y*y - 24*radial_eval_alpha_squared*y*y*z*z + 8*radial_eval_alpha_squared*z*z*z*z)/8; - basis_yz_eval[ipt + 5*npts] = sqrt_10*x*y*(-6*radial_eval - 6*radial_eval_alpha*z*z + 3*radial_eval_alpha*(-x*x - y*y + 4*z*z) - radial_eval_alpha_squared*z*z*(3*x*x + 3*y*y - 4*z*z))/4; - basis_yz_eval[ipt + 6*npts] = sqrt_5*y*z*(-24*radial_eval + 12*radial_eval_alpha*(x*x - y*y) + 4*radial_eval_alpha*(y*y - 3*z*z) - radial_eval_alpha_squared*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - basis_yz_eval[ipt + 7*npts] = sqrt_70*x*y*(-6*radial_eval - 6*radial_eval_alpha*z*z + radial_eval_alpha*(x*x - 3*y*y) + radial_eval_alpha_squared*z*z*(x*x - 3*y*y))/4; - basis_yz_eval[ipt + 8*npts] = sqrt_35*y*z*(-4*radial_eval_alpha*(3*x*x - y*y) + radial_eval_alpha_squared*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + basis_yz_eval[ipt + 0*npts] = x70*z*(radial_eval_alpha_squared*x72 + x89); + basis_yz_eval[ipt + 1*npts] = x9*(radial_eval_alpha_squared*x13*x18*x6 + x149 + x74); + basis_yz_eval[ipt + 2*npts] = x75*z*(radial_eval_alpha*x77 + radial_eval_alpha_squared*x78 + x159 + x53); + basis_yz_eval[ipt + 3*npts] = x22*(-radial_eval*(x12 + x79 + x96) + radial_eval_alpha_squared*x18*x28*x6 + x127*x98 + x147*x81); + basis_yz_eval[ipt + 4*npts] = 2.0*x102*x86 - 0.125*x145*y + 0.125*x146*x85 + 0.125*x164*x35*z; + basis_yz_eval[ipt + 5*npts] = x144; + basis_yz_eval[ipt + 6*npts] = x104*(-x131*y + x135*x8 + x148*x88 + x164*x38); + basis_yz_eval[ipt + 7*npts] = x141*(radial_eval_alpha_squared*x106 + x143 + x90); + basis_yz_eval[ipt + 8*npts] = x150*(x148*x91 + x164*x42); // Evaluate second derivative of bfn wrt zz - basis_zz_eval[ipt + 0*npts] = sqrt_35*x*y*(radial_eval_alpha + radial_eval_alpha_squared*z*z)*(x*x - y*y)/2; - basis_zz_eval[ipt + 1*npts] = sqrt_70*y*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z)*(3*x*x - y*y)/4; - basis_zz_eval[ipt + 2*npts] = sqrt_5*x*y*(12*radial_eval + 24*radial_eval_alpha*z*z - (radial_eval_alpha + radial_eval_alpha_squared*z*z)*(x*x + y*y - 6*z*z))/2; - basis_zz_eval[ipt + 3*npts] = sqrt_10*y*z*(24*radial_eval + 6*radial_eval_alpha*(-x*x - y*y + 4*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*z*z)*(3*x*x + 3*y*y - 4*z*z))/4; - basis_zz_eval[ipt + 4*npts] = -6*radial_eval*(x*x + y*y - 2*z*z) - 4*radial_eval_alpha*z*z*(3*x*x + 3*y*y - 2*z*z) + (radial_eval_alpha + radial_eval_alpha_squared*z*z)*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; - basis_zz_eval[ipt + 5*npts] = sqrt_10*x*z*(24*radial_eval + 6*radial_eval_alpha*(-x*x - y*y + 4*z*z) - (radial_eval_alpha + radial_eval_alpha_squared*z*z)*(3*x*x + 3*y*y - 4*z*z))/4; - basis_zz_eval[ipt + 6*npts] = sqrt_5*(12*radial_eval*(x*x - y*y) + 24*radial_eval_alpha*z*z*(x*x - y*y) - (radial_eval_alpha + radial_eval_alpha_squared*z*z)*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - basis_zz_eval[ipt + 7*npts] = sqrt_70*x*z*(3*radial_eval_alpha + radial_eval_alpha_squared*z*z)*(x*x - 3*y*y)/4; - basis_zz_eval[ipt + 8*npts] = sqrt_35*(radial_eval_alpha + radial_eval_alpha_squared*z*z)*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + basis_zz_eval[ipt + 0*npts] = x166*x2; + basis_zz_eval[ipt + 1*npts] = x11*x167; + basis_zz_eval[ipt + 2*npts] = x16*(x168 + 
x53); + basis_zz_eval[ipt + 3*npts] = x170*x24; + basis_zz_eval[ipt + 4*npts] = 0.125*x171; + basis_zz_eval[ipt + 5*npts] = x170*x36; + basis_zz_eval[ipt + 6*npts] = x172*x37; + basis_zz_eval[ipt + 7*npts] = x173*x39; + basis_zz_eval[ipt + 8*npts] = x174*x68; + + @@ -236,25 +418,25 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = sqrt_35*radial_eval*x*y*(x*x - y*y)/2; - ang_eval_1 = sqrt_70*radial_eval*y*z*(3*x*x - y*y)/4; - ang_eval_2 = sqrt_5*radial_eval*x*y*(-x*x - y*y + 6*z*z)/2; - ang_eval_3 = sqrt_10*radial_eval*y*z*(-3*x*x - 3*y*y + 4*z*z)/4; + ang_eval_0 = radial_eval*x2*x8; + ang_eval_1 = radial_eval*x11*x13; + ang_eval_2 = radial_eval*x16*x21; + ang_eval_3 = radial_eval*x24*x28; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = radial_eval*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; - ang_eval_1 = sqrt_10*radial_eval*x*z*(-3*x*x - 3*y*y + 4*z*z)/4; - ang_eval_2 = sqrt_5*radial_eval*(-x*x*x*x + 6*x*x*z*z + y*y*y*y - 6*y*y*z*z)/4; - ang_eval_3 = sqrt_70*radial_eval*x*z*(x*x - 3*y*y)/4; + ang_eval_0 = x29*x35; + ang_eval_1 = radial_eval*x28*x36; + ang_eval_2 = radial_eval*x37*x38; + ang_eval_3 = radial_eval*x39*x41; basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; basis_eval[ipt + 6*npts] = ang_eval_2; basis_eval[ipt + 7*npts] = ang_eval_3; - ang_eval_0 = sqrt_35*radial_eval*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + ang_eval_0 = sqrt_35*x29*x42; basis_eval[ipt + 8*npts] = ang_eval_0; @@ -263,18 +445,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; - dang_eval_x_0 = sqrt_35*y*(radial_eval*(3*x*x - y*y) + radial_eval_alpha*x*x*(x*x - y*y))/2; - dang_eval_y_0 = sqrt_35*x*(-radial_eval*(-x*x + 3*y*y) + radial_eval_alpha*y*y*(x*x - y*y))/2; - dang_eval_z_0 = sqrt_35*radial_eval_alpha*x*y*z*(x*x - y*y)/2; - dang_eval_x_1 = sqrt_70*x*y*z*(6*radial_eval + radial_eval_alpha*(3*x*x - y*y))/4; - dang_eval_y_1 = sqrt_70*z*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y))/4; - dang_eval_z_1 = sqrt_70*y*(radial_eval + radial_eval_alpha*z*z)*(3*x*x - y*y)/4; - dang_eval_x_2 = sqrt_5*y*(-radial_eval*(3*x*x + y*y - 6*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 6*z*z))/2; - dang_eval_y_2 = sqrt_5*x*(-radial_eval*(x*x + 3*y*y - 6*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 6*z*z))/2; - dang_eval_z_2 = sqrt_5*x*y*z*(12*radial_eval - radial_eval_alpha*(x*x + y*y - 6*z*z))/2; - dang_eval_x_3 = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; - dang_eval_y_3 = sqrt_10*z*(-radial_eval*(3*x*x + 9*y*y - 4*z*z) - radial_eval_alpha*y*y*(3*x*x + 3*y*y - 4*z*z))/4; - dang_eval_z_3 = sqrt_10*y*(3*radial_eval*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; + dang_eval_x_0 = x1*(radial_eval_alpha*x44 + x43); + dang_eval_y_0 = x70*(radial_eval_alpha*x72 + x71); + dang_eval_z_0 = x57*x8*x92; + dang_eval_x_1 = x45*x48; + dang_eval_y_1 = x10*x74; + dang_eval_z_1 = x93*(radial_eval_alpha*x94 + x43); + dang_eval_x_2 = x15*(radial_eval*x49 + radial_eval_alpha*x50); + dang_eval_y_2 = x75*(radial_eval*x77 + radial_eval_alpha*x78); + dang_eval_z_2 = x16*z*(radial_eval_alpha*x21 + x53); + dang_eval_x_3 = x52; + dang_eval_y_3 = x23*(radial_eval*x81 + radial_eval_alpha*x82); + 
dang_eval_z_3 = x100*x95; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; @@ -288,18 +470,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = x*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; - dang_eval_y_0 = y*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; - dang_eval_z_0 = z*(-16*radial_eval*(3*x*x + 3*y*y - 2*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; - dang_eval_x_1 = sqrt_10*z*(-radial_eval*(9*x*x + 3*y*y - 4*z*z) - radial_eval_alpha*x*x*(3*x*x + 3*y*y - 4*z*z))/4; - dang_eval_y_1 = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; - dang_eval_z_1 = sqrt_10*x*(3*radial_eval*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; - dang_eval_x_2 = sqrt_5*x*(-4*radial_eval*(x*x - 3*z*z) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - dang_eval_y_2 = sqrt_5*y*(4*radial_eval*(y*y - 3*z*z) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - dang_eval_z_2 = sqrt_5*z*(12*radial_eval*(x*x - y*y) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - dang_eval_x_3 = sqrt_70*z*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y))/4; - dang_eval_y_3 = sqrt_70*x*y*z*(-6*radial_eval + radial_eval_alpha*(x*x - 3*y*y))/4; - dang_eval_z_3 = sqrt_70*x*(radial_eval + radial_eval_alpha*z*z)*(x*x - 3*y*y)/4; + dang_eval_x_0 = 0.125*x35*x57 + 0.125*x53*x56; + dang_eval_y_0 = 0.125*x35*x86 + 0.125*x53*x85; + dang_eval_z_0 = 2.0*radial_eval*x102 + 0.125*x103*x35; + dang_eval_x_1 = x23*(radial_eval*x59 + radial_eval_alpha*x60); + dang_eval_y_1 = x52; + dang_eval_z_1 = x*x100*x22; + dang_eval_x_2 = x37*(x38*x57 + x61*x63); + dang_eval_y_2 = x37*(x38*x86 + x61*x88); + dang_eval_z_2 = x104*(radial_eval_alpha*x38 + x105); + dang_eval_x_3 = x10*x67; + dang_eval_y_3 = x45*x90; + dang_eval_z_3 = x*x9*(radial_eval_alpha*x106 + x71); basis_x_eval[ipt + 4*npts] = dang_eval_x_0; basis_y_eval[ipt + 4*npts] = dang_eval_y_0; basis_z_eval[ipt + 4*npts] = dang_eval_z_0; @@ -313,9 +495,9 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 7*npts] = dang_eval_y_3; basis_z_eval[ipt + 7*npts] = dang_eval_z_3; - dang_eval_x_0 = sqrt_35*x*(4*radial_eval*(x*x - 3*y*y) + radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; - dang_eval_y_0 = sqrt_35*y*(-4*radial_eval*(3*x*x - y*y) + radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; - dang_eval_z_0 = sqrt_35*radial_eval_alpha*z*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + dang_eval_x_0 = x68*(x42*x57 + x61*x69); + dang_eval_y_0 = x68*(x42*x86 + x61*x91); + dang_eval_z_0 = x103*x42*x68; basis_x_eval[ipt + 8*npts] = dang_eval_x_0; basis_y_eval[ipt + 8*npts] = dang_eval_y_0; basis_z_eval[ipt + 8*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_lapgrad.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_lapgrad.hpp new file mode 100644 index 00000000..b895836c --- /dev/null +++ 
b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_lapgrad.hpp @@ -0,0 +1,663 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#pragma once +#include "collocation_device_constants.hpp" +#include "device/xc_device_task.hpp" +#include "device_specific/cuda_device_constants.hpp" +#include "device/common/shell_to_task.hpp" +#include + +namespace GauXC { + + +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_spherical_lapgrad_4( + uint32_t nshell, + ShellToTaskDevice* __restrict__ shell_to_task, + XCDeviceTask* __restrict__ device_tasks +) { + + + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; + double* my_alpha = alpha[threadIdx.x/32]; + double* my_coeff = coeff[threadIdx.x/32]; + + for( auto ish = blockIdx.z; ish < nshell; ish += gridDim.z ) { + const uint32_t ntasks = shell_to_task[ish].ntask; + const auto shell = shell_to_task[ish].shell_device; + const auto task_idx = shell_to_task[ish].task_idx_device; + const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device; + + + // Load Shell Data into registers / SM + const uint32_t nprim = shell->nprim(); + const double3 O = *reinterpret_cast(shell->O_data()); + + const int global_warp_id = (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size; + const int nwarp_global = max((blockDim.x*gridDim.x) / cuda::warp_size,1); + + // Read in coeffs/exps into SM on first warp + { + auto* coeff_gm = shell->coeff_data(); + auto* alpha_gm = shell->alpha_data(); + static_assert( detail::shell_nprim_max == cuda::warp_size ); + const int warp_rank = threadIdx.x % cuda::warp_size; + my_alpha[warp_rank] = alpha_gm[warp_rank]; + my_coeff[warp_rank] = coeff_gm[warp_rank]; + } + + // Loop over tasks assigned to shells + // Place each task on a different warp + schedule across blocks + for( int itask = global_warp_id; itask < ntasks; itask += nwarp_global ) { + + const auto* task = device_tasks + task_idx[itask]; + const auto* __restrict__ points_x = task->points_x; + const auto* __restrict__ points_y = task->points_y; + const auto* __restrict__ points_z = task->points_z; + const uint32_t npts = task->npts; + const size_t shoff = task_shell_offs[itask] * npts; + + auto* __restrict__ basis_eval = task->bf + shoff; + auto* __restrict__ basis_x_eval = task->dbfx + shoff; + auto* __restrict__ basis_y_eval = task->dbfy + shoff; + auto* __restrict__ basis_z_eval = task->dbfz + shoff; + auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; + auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; + auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; + auto* __restrict__ basis_yy_eval = task->d2bfyy + shoff; + auto* __restrict__ basis_yz_eval = task->d2bfyz + shoff; + auto* __restrict__ basis_zz_eval = task->d2bfzz + shoff; + auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; + auto* __restrict__ basis_lapl_x_eval = task->d3bflapl_x + shoff; + auto* __restrict__ basis_lapl_y_eval = task->d3bflapl_y + shoff; + auto* __restrict__ basis_lapl_z_eval = task->d3bflapl_z + shoff; + + // Loop over points in task + // Assign each point to separate thread within the warp + #pragma unroll 1 + for( int ipt = 
threadIdx.x % cuda::warp_size; ipt < npts; ipt += cuda::warp_size ) { + //const double3 point = points[ipt]; + double3 point; + point.x = points_x[ipt]; + point.y = points_y[ipt]; + point.z = points_z[ipt]; + + + const auto x = point.x - O.x; + const auto y = point.y - O.y; + const auto z = point.z - O.z; + const auto rsq = x*x + y*y + z*z; + + // Evaluate radial part of bfn + double radial_eval = 0.; + double radial_eval_alpha = 0.; + double radial_eval_alpha_squared = 0.; + double radial_eval_alpha_cubed = 0.; + + #pragma unroll 1 + for( uint32_t i = 0; i < nprim; ++i ) { + const auto a = my_alpha[i]; + const auto e = my_coeff[i] * std::exp( - a * rsq ); + + radial_eval += e; + radial_eval_alpha += a * e; + radial_eval_alpha_squared += a * a * e; + radial_eval_alpha_cubed += a * a * a * e; + } + + radial_eval_alpha *= -2; + radial_eval_alpha_squared *= 4; + radial_eval_alpha_cubed *= -8; + + // Common Subexpressions + const auto x0 = 0.5*sqrt_35; + const auto x1 = x0*y; + const auto x2 = x*x1; + const auto x3 = x*x; + const auto x4 = x3; + const auto x5 = y*y; + const auto x6 = x5; + const auto x7 = -x6; + const auto x8 = x4 + x7; + const auto x9 = 0.25*sqrt_70; + const auto x10 = x9*z; + const auto x11 = x10*y; + const auto x12 = 3.0*x4; + const auto x13 = x12 + x7; + const auto x14 = 0.5*sqrt_5; + const auto x15 = x14*y; + const auto x16 = x*x15; + const auto x17 = z*z; + const auto x18 = x17; + const auto x19 = -6.0*x18; + const auto x20 = x19 + x6; + const auto x21 = -x20 - x4; + const auto x22 = 0.25*sqrt_10; + const auto x23 = x22*z; + const auto x24 = x23*y; + const auto x25 = -4.0*x18; + const auto x26 = 3.0*x6; + const auto x27 = x25 + x26; + const auto x28 = -x12 - x27; + const auto x29 = 0.125*radial_eval; + const auto x30 = x*x*x*x; + const auto x31 = y*y*y*y; + const auto x32 = x4*x6; + const auto x33 = 6.0*x32; + const auto x34 = x18*x4; + const auto x35 = x18*x6; + const auto x36 = 3.0*x30 + 3.0*x31 + x33 - 24.0*x34 - 24.0*x35 + 8.0*(z*z*z*z); + const auto x37 = x*x23; + const auto x38 = 0.25*sqrt_5; + const auto x39 = -x30 + x31 + 6.0*x34 - 6.0*x35; + const auto x40 = x*x10; + const auto x41 = -x26; + const auto x42 = x4 + x41; + const auto x43 = x30 + x31 - x33; + const auto x44 = radial_eval*x13; + const auto x45 = x4*x8; + const auto x46 = x*x11; + const auto x47 = 6.0*radial_eval; + const auto x48 = radial_eval_alpha*x13; + const auto x49 = x47 + x48; + const auto x50 = -x12 - x20; + const auto x51 = x21*x4; + const auto x52 = -x47; + const auto x53 = x*x24*(radial_eval_alpha*x28 + x52); + const auto x54 = 12.0*radial_eval; + const auto x55 = x*x*x; + const auto x56 = 4.0*x; + const auto x57 = x*x6 - x18*x56 + x55; + const auto x58 = radial_eval_alpha*x; + const auto x59 = 9.0*x4; + const auto x60 = -x27 - x59; + const auto x61 = x28*x4; + const auto x62 = 4.0*radial_eval; + const auto x63 = 3.0*x; + const auto x64 = x18*x63 - x55; + const auto x65 = x12 + x41; + const auto x66 = radial_eval*x65; + const auto x67 = x4*x42; + const auto x68 = radial_eval_alpha*x67 + x66; + const auto x69 = 0.125*sqrt_35; + const auto x70 = x55 - x6*x63; + const auto x71 = x*x0; + const auto x72 = radial_eval*x42; + const auto x73 = x6*x8; + const auto x74 = x13*x6; + const auto x75 = radial_eval_alpha*x74 + x66; + const auto x76 = x*x14; + const auto x77 = x19 + x26; + const auto x78 = -x4 - x77; + const auto x79 = x21*x6; + const auto x80 = 9.0*x6; + const auto x81 = x12 + x25; + const auto x82 = -x80 - x81; + const auto x83 = x28*x6; + const auto x84 = y*y*y; + const auto x85 = 
4.0*y; + const auto x86 = -x18*x85 + x4*y + x84; + const auto x87 = radial_eval_alpha*y; + const auto x88 = 3.0*y; + const auto x89 = -x18*x88 + x84; + const auto x90 = radial_eval_alpha*x42; + const auto x91 = x52 + x90; + const auto x92 = -x4*x88 + x84; + const auto x93 = x1*z; + const auto x94 = x9*y; + const auto x95 = x13*x18; + const auto x96 = x22*y; + const auto x97 = -12.0*x18; + const auto x98 = x26 + x97; + const auto x99 = -x12 - x98; + const auto x100 = x18*x28; + const auto x101 = radial_eval*x99 + radial_eval_alpha*x100; + const auto x102 = z*z*z; + const auto x103 = 3.0*z; + const auto x104 = 2.0*x102 - x103*x4 - x103*x6; + const auto x105 = radial_eval_alpha*z; + const auto x106 = x*x22; + const auto x107 = x38*z; + const auto x108 = x54*x8; + const auto x109 = x*x9; + const auto x110 = x18*x42; + const auto x111 = 2.0*radial_eval_alpha; + const auto x112 = x111*x13; + const auto x113 = radial_eval_alpha + radial_eval_alpha_squared*x4; + const auto x114 = x113*x8; + const auto x115 = x112 + x114; + const auto x116 = x113*x13; + const auto x117 = 12.0*radial_eval_alpha; + const auto x118 = x117*x4; + const auto x119 = x118 + x47; + const auto x120 = x111*x50 + x113*x21; + const auto x121 = x6 + x81; + const auto x122 = x113*x36 + x121*x54 + 24.0*x57*x58; + const auto x123 = -18.0*radial_eval; + const auto x124 = x113*x28; + const auto x125 = x111*x60 + x124; + const auto x126 = -x4; + const auto x127 = x126 + x18; + const auto x128 = 8.0*x58; + const auto x129 = x113*x39 + x127*x54 + x128*x64; + const auto x130 = x111*x65; + const auto x131 = x113*x42 + x130; + const auto x132 = x108 + x113*x43 + x128*x70; + const auto x133 = radial_eval_alpha*x3; + const auto x134 = radial_eval_alpha*x5; + const auto x135 = 6.0*radial_eval_alpha; + const auto x136 = x135*x6; + const auto x137 = radial_eval_alpha*x65; + const auto x138 = -x12 - x77; + const auto x139 = 24.0*radial_eval; + const auto x140 = x*x139; + const auto x141 = x140*y; + const auto x142 = 12.0*x58; + const auto x143 = 12.0*x87; + const auto x144 = radial_eval_alpha_squared*x; + const auto x145 = x144*y; + const auto x146 = -x135*x4 + x52; + const auto x147 = radial_eval_alpha*x56; + const auto x148 = radial_eval_alpha*x85; + const auto x149 = x*x94; + const auto x150 = x135*x18; + const auto x151 = -x150; + const auto x152 = x*x96*(radial_eval_alpha*x99 + radial_eval_alpha_squared*x100 + x151 + x52); + const auto x153 = 96.0*radial_eval*z; + const auto x154 = 12.0*x105; + const auto x155 = x144*z; + const auto x156 = -x59 - x98; + const auto x157 = radial_eval_alpha*x17; + const auto x158 = x142*x8; + const auto x159 = 4.0*radial_eval_alpha; + const auto x160 = x157*x65; + const auto x161 = x69*z; + const auto x162 = x111*x42; + const auto x163 = radial_eval_alpha + radial_eval_alpha_squared*x6; + const auto x164 = x163*x8; + const auto x165 = x162 + x164; + const auto x166 = x13*x163 + x130; + const auto x167 = x111*x78 + x163*x21; + const auto x168 = x163*x28; + const auto x169 = x111*x82 + x168; + const auto x170 = x27 + x4; + const auto x171 = x163*x36 + x170*x54 + 24.0*x86*x87; + const auto x172 = x117*x6; + const auto x173 = x172 + x47; + const auto x174 = -x18 + x6; + const auto x175 = 8.0*x87; + const auto x176 = x163*x39 + x174*x54 + x175*x89; + const auto x177 = x126 + x6; + const auto x178 = x163*x43 + x175*x92 + x177*x54; + const auto x179 = -x12 - x80 - x97; + const auto x180 = radial_eval_alpha_squared*y; + const auto x181 = x180*z; + const auto x182 = x143*x8; + const auto x183 = radial_eval_alpha + 
radial_eval_alpha_squared*x18; + const auto x184 = x183*x8; + const auto x185 = x13*x183; + const auto x186 = x112 + x185; + const auto x187 = 24.0*radial_eval_alpha*x18; + const auto x188 = x183*x21 + x187; + const auto x189 = x111*x99 + x183*x28; + const auto x190 = x139 + x189; + const auto x191 = 2.0*x18 - x4 - x6; + const auto x192 = 48.0*radial_eval*x191 + 32.0*x104*x105 + x183*x36; + const auto x193 = x108 + 24.0*x157*x8 + x183*x39; + const auto x194 = x183*x42; + const auto x195 = x162 + x194; + const auto x196 = x183*x43; + const auto x197 = x118 + x166; + const auto x198 = x116 + x197; + const auto x199 = -x118; + const auto x200 = -x172; + const auto x201 = x163*x42; + const auto x202 = x131 + x200; + const auto x203 = x201 + x202; + const auto x204 = radial_eval_alpha_cubed*x55 + radial_eval_alpha_squared*x63; + const auto x205 = radial_eval_alpha_cubed*x6 + radial_eval_alpha_squared; + const auto x206 = x205*x8; + const auto x207 = radial_eval_alpha_cubed*x18 + radial_eval_alpha_squared; + const auto x208 = x207*x8; + const auto x209 = 2.0*radial_eval_alpha_squared; + const auto x210 = x209*x3; + const auto x211 = 36.0*x58; + const auto x212 = 18.0*x*x113; + const auto x213 = 6.0*x; + const auto x214 = x163*x213; + const auto x215 = x183*x213; + const auto x216 = 2.0*x144; + const auto x217 = x13*x205; + const auto x218 = x13*x207; + const auto x219 = x205*x21; + const auto x220 = x207*x21; + const auto x221 = 24.0*radial_eval_alpha_squared; + const auto x222 = x111*x138 + x187; + const auto x223 = x205*x28; + const auto x224 = x207*x28; + const auto x225 = x204*x28; + const auto x226 = 48.0*x58; + const auto x227 = x226*x6; + const auto x228 = 24.0*x145; + const auto x229 = x205*x36; + const auto x230 = x207*x36; + const auto x231 = 36.0*radial_eval_alpha; + const auto x232 = x111*x156; + const auto x233 = 12.0*radial_eval_alpha_squared; + const auto x234 = x233*x32; + const auto x235 = -x234; + const auto x236 = x200 + x235; + const auto x237 = 8.0*x145; + const auto x238 = 24.0*x17; + const auto x239 = x205*x39; + const auto x240 = x207*x39; + const auto x241 = x163*x65; + const auto x242 = x113*x65; + const auto x243 = x205*x42; + const auto x244 = x207*x42; + const auto x245 = x118 + x130 + x183*x65; + const auto x246 = x205*x43; + const auto x247 = x207*x43; + const auto x248 = radial_eval_alpha_cubed*x84 + radial_eval_alpha_squared*x88; + const auto x249 = radial_eval_alpha_cubed*x4 + radial_eval_alpha_squared; + const auto x250 = x249*x8; + const auto x251 = x209*x5; + const auto x252 = x13*x249; + const auto x253 = x21*x249; + const auto x254 = x248*x28; + const auto x255 = x249*x28; + const auto x256 = x111*x179 + x199; + const auto x257 = 48.0*x87; + const auto x258 = x257*x4; + const auto x259 = 36.0*x87; + const auto x260 = x249*x36; + const auto x261 = 2.0*x180; + const auto x262 = 6.0*y; + const auto x263 = -x113*x262 - 18.0*x163*y - x183*x262 - x259; + const auto x264 = x249*x39; + const auto x265 = x249*x42; + const auto x266 = x249*x43; + const auto x267 = x209*z; + const auto x268 = radial_eval_alpha_cubed*x102 + radial_eval_alpha_squared*x103; + const auto x269 = x17*x209; + const auto x270 = x269*x65; + const auto x271 = x233*x34; + const auto x272 = 12.0*z; + const auto x273 = 36.0*z; + const auto x274 = 48.0*radial_eval_alpha*x18 + x113*x99 + x163*x99 + x17*x223 + x17*x255 + 3.0*x183*x99 + x268*x28*z; + const auto x275 = 192.0*x105; + const auto x276 = -x233*x35; + const auto x277 = 48.0*x105; + const auto x278 = 8.0*x181; + const auto x279 = 8.0*x155; + 
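The x0..x279 temporaries above are the SymPy common-subexpression output emitted by perform_cse_and_cleanup in the generator script later in this patch; the evaluations that follow are simply the product rule applied to phi = A(x,y,z)*R(r), where A is the angular polynomial and R the contracted radial part. A minimal host-side sketch of that structure, with hypothetical helper names, independent of (and not part of) the generated kernel:

#include <cmath>
#include <cstddef>

// Radial contractions accumulated exactly as in the kernel's primitive loop:
//   val      =      sum_i c_i exp(-a_i r^2)
//   alpha    = -2 * sum_i a_i   c_i exp(-a_i r^2)   (so dR/dx = x * alpha)
//   alpha_sq =  4 * sum_i a_i^2 c_i exp(-a_i r^2)
//   alpha_cb = -8 * sum_i a_i^3 c_i exp(-a_i r^2)
struct RadialFactors { double val, alpha, alpha_sq, alpha_cb; };

RadialFactors contract_radial(const double* alpha, const double* coeff,
                              std::size_t nprim, double rsq) {
  RadialFactors r{0.0, 0.0, 0.0, 0.0};
  for (std::size_t i = 0; i < nprim; ++i) {
    const double e = coeff[i] * std::exp(-alpha[i] * rsq);
    r.val      += e;
    r.alpha    += alpha[i] * e;
    r.alpha_sq += alpha[i] * alpha[i] * e;
    r.alpha_cb += alpha[i] * alpha[i] * alpha[i] * e;
  }
  r.alpha *= -2.0; r.alpha_sq *= 4.0; r.alpha_cb *= -8.0;
  return r;
}

// Product rule for one basis function phi = A * R:
//   dphi/dx = A_x * R + A * x * R_alpha
// which is the pattern behind terms such as x1*(radial_eval_alpha*x45 + x44)
// below, once the CSE temporaries are expanded.
double dphi_dx(double A, double A_x, double x, const RadialFactors& r) {
  return A_x * r.val + A * x * r.alpha;
}

The same pattern extends to the second and third derivatives used for the Laplacian gradient (e.g. d2phi/dx2 = A_xx*R + 2*A_x*x*R_alpha + A*(R_alpha + x*x*R_alpha_sq)), which is exactly the bf_xx / bf_xxx construction in the generator script changes shown further down in this patch.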
+ + // Evaluate basis function + basis_eval[ipt + 0*npts] = radial_eval*x2*x8; + basis_eval[ipt + 1*npts] = radial_eval*x11*x13; + basis_eval[ipt + 2*npts] = radial_eval*x16*x21; + basis_eval[ipt + 3*npts] = radial_eval*x24*x28; + basis_eval[ipt + 4*npts] = x29*x36; + basis_eval[ipt + 5*npts] = radial_eval*x28*x37; + basis_eval[ipt + 6*npts] = radial_eval*x38*x39; + basis_eval[ipt + 7*npts] = radial_eval*x40*x42; + basis_eval[ipt + 8*npts] = sqrt_35*x29*x43; + + + + // Evaluate first derivative of bfn wrt x + basis_x_eval[ipt + 0*npts] = x1*(radial_eval_alpha*x45 + x44); + basis_x_eval[ipt + 1*npts] = x46*x49; + basis_x_eval[ipt + 2*npts] = x15*(radial_eval*x50 + radial_eval_alpha*x51); + basis_x_eval[ipt + 3*npts] = x53; + basis_x_eval[ipt + 4*npts] = 0.125*x36*x58 + 0.125*x54*x57; + basis_x_eval[ipt + 5*npts] = x23*(radial_eval*x60 + radial_eval_alpha*x61); + basis_x_eval[ipt + 6*npts] = x38*(x39*x58 + x62*x64); + basis_x_eval[ipt + 7*npts] = x10*x68; + basis_x_eval[ipt + 8*npts] = x69*(x43*x58 + x62*x70); + + // Evaluate first derivative of bfn wrt y + basis_y_eval[ipt + 0*npts] = x71*(radial_eval_alpha*x73 + x72); + basis_y_eval[ipt + 1*npts] = x10*x75; + basis_y_eval[ipt + 2*npts] = x76*(radial_eval*x78 + radial_eval_alpha*x79); + basis_y_eval[ipt + 3*npts] = x23*(radial_eval*x82 + radial_eval_alpha*x83); + basis_y_eval[ipt + 4*npts] = 0.125*x36*x87 + 0.125*x54*x86; + basis_y_eval[ipt + 5*npts] = x53; + basis_y_eval[ipt + 6*npts] = x38*(x39*x87 + x62*x89); + basis_y_eval[ipt + 7*npts] = x46*x91; + basis_y_eval[ipt + 8*npts] = x69*(x43*x87 + x62*x92); + + // Evaluate first derivative of bfn wrt z + basis_z_eval[ipt + 0*npts] = x58*x8*x93; + basis_z_eval[ipt + 1*npts] = x94*(radial_eval_alpha*x95 + x44); + basis_z_eval[ipt + 2*npts] = x16*z*(radial_eval_alpha*x21 + x54); + basis_z_eval[ipt + 3*npts] = x101*x96; + basis_z_eval[ipt + 4*npts] = 2.0*radial_eval*x104 + 0.125*x105*x36; + basis_z_eval[ipt + 5*npts] = x101*x106; + basis_z_eval[ipt + 6*npts] = x107*(radial_eval_alpha*x39 + x108); + basis_z_eval[ipt + 7*npts] = x109*(radial_eval_alpha*x110 + x72); + basis_z_eval[ipt + 8*npts] = x105*x43*x69; + + // Evaluate second derivative of bfn wrt xx + basis_xx_eval[ipt + 0*npts] = x2*(x115 + x47); + basis_xx_eval[ipt + 1*npts] = x11*(x116 + x119); + basis_xx_eval[ipt + 2*npts] = x16*(x120 + x52); + basis_xx_eval[ipt + 3*npts] = x24*(x113*x28 - x119); + basis_xx_eval[ipt + 4*npts] = 0.125*x122; + basis_xx_eval[ipt + 5*npts] = x37*(x123 + x125); + basis_xx_eval[ipt + 6*npts] = x129*x38; + basis_xx_eval[ipt + 7*npts] = x40*(x131 + x47); + basis_xx_eval[ipt + 8*npts] = x132*x69; + + // Evaluate second derivative of bfn wrt xy + basis_xy_eval[ipt + 0*npts] = x0*(radial_eval_alpha_squared*x4*x6*x8 + x13*x134 + x133*x42 + x66); + basis_xy_eval[ipt + 1*npts] = x40*(radial_eval_alpha_squared*x74 + x136 + x137 + x47); + basis_xy_eval[ipt + 2*npts] = x14*(radial_eval*x138 + radial_eval_alpha_squared*x21*x4*x6 + x133*x78 + x134*x50); + basis_xy_eval[ipt + 3*npts] = x37*(radial_eval_alpha*x82 + radial_eval_alpha_squared*x83 - x136 + x52); + basis_xy_eval[ipt + 4*npts] = 0.125*x141 + 0.125*x142*x86 + 0.125*x143*x57 + 0.125*x145*x36; + basis_xy_eval[ipt + 5*npts] = x24*(radial_eval_alpha*x60 + radial_eval_alpha_squared*x61 + x146); + basis_xy_eval[ipt + 6*npts] = x38*(x145*x39 + x147*x89 + x148*x64); + basis_xy_eval[ipt + 7*npts] = x11*(radial_eval_alpha_squared*x67 + x137 + x146); + basis_xy_eval[ipt + 8*npts] = x69*(-x141 + x145*x43 + x147*x92 + x148*x70); + + // Evaluate second derivative of bfn wrt xz 
+ basis_xz_eval[ipt + 0*npts] = x93*(radial_eval_alpha_squared*x45 + x48); + basis_xz_eval[ipt + 1*npts] = x149*(radial_eval_alpha_squared*x95 + x150 + x49); + basis_xz_eval[ipt + 2*npts] = x15*z*(radial_eval_alpha*x50 + radial_eval_alpha_squared*x51 + x118 + x54); + basis_xz_eval[ipt + 3*npts] = x152; + basis_xz_eval[ipt + 4*npts] = -0.125*x*x153 + 2.0*x104*x58 + 0.125*x154*x57 + 0.125*x155*x36; + basis_xz_eval[ipt + 5*npts] = x22*(radial_eval*x156 + radial_eval_alpha_squared*x18*x28*x4 + x133*x99 + x157*x60); + basis_xz_eval[ipt + 6*npts] = x107*(x140 + x144*x39 + x158 + x159*x64); + basis_xz_eval[ipt + 7*npts] = x9*(radial_eval_alpha_squared*x18*x4*x42 + x160 + x68); + basis_xz_eval[ipt + 8*npts] = x161*(x144*x43 + x159*x70); + + // Evaluate second derivative of bfn wrt yy + basis_yy_eval[ipt + 0*npts] = x2*(x165 + x52); + basis_yy_eval[ipt + 1*npts] = x11*(x166 + x52); + basis_yy_eval[ipt + 2*npts] = x16*(x167 + x52); + basis_yy_eval[ipt + 3*npts] = x24*(x123 + x169); + basis_yy_eval[ipt + 4*npts] = 0.125*x171; + basis_yy_eval[ipt + 5*npts] = x37*(x163*x28 - x173); + basis_yy_eval[ipt + 6*npts] = x176*x38; + basis_yy_eval[ipt + 7*npts] = x40*(x163*x42 - x173); + basis_yy_eval[ipt + 8*npts] = x178*x69; + + // Evaluate second derivative of bfn wrt yz + basis_yz_eval[ipt + 0*npts] = x71*z*(radial_eval_alpha_squared*x73 + x90); + basis_yz_eval[ipt + 1*npts] = x9*(radial_eval_alpha_squared*x13*x18*x6 + x160 + x75); + basis_yz_eval[ipt + 2*npts] = x76*z*(radial_eval_alpha*x78 + radial_eval_alpha_squared*x79 + x172 + x54); + basis_yz_eval[ipt + 3*npts] = x22*(radial_eval*x179 + radial_eval_alpha_squared*x18*x28*x6 + x134*x99 + x157*x82); + basis_yz_eval[ipt + 4*npts] = 2.0*x104*x87 - 0.125*x153*y + 0.125*x154*x86 + 0.125*x181*x36; + basis_yz_eval[ipt + 5*npts] = x152; + basis_yz_eval[ipt + 6*npts] = x107*(-x139*y + x159*x89 + x180*x39 + x182); + basis_yz_eval[ipt + 7*npts] = x149*(radial_eval_alpha_squared*x110 + x151 + x91); + basis_yz_eval[ipt + 8*npts] = x161*(x159*x92 + x180*x43); + + // Evaluate second derivative of bfn wrt zz + basis_zz_eval[ipt + 0*npts] = x184*x2; + basis_zz_eval[ipt + 1*npts] = x11*x186; + basis_zz_eval[ipt + 2*npts] = x16*(x188 + x54); + basis_zz_eval[ipt + 3*npts] = x190*x24; + basis_zz_eval[ipt + 4*npts] = 0.125*x192; + basis_zz_eval[ipt + 5*npts] = x190*x37; + basis_zz_eval[ipt + 6*npts] = x193*x38; + basis_zz_eval[ipt + 7*npts] = x195*x40; + basis_zz_eval[ipt + 8*npts] = x196*x69; + + // Evaluate Laplacian of bfn + basis_lapl_eval[ipt + 0*npts] = x2*(x115 + x165 + x184); + basis_lapl_eval[ipt + 1*npts] = x11*(x186 + x198); + basis_lapl_eval[ipt + 2*npts] = x16*(x120 + x167 + x188); + basis_lapl_eval[ipt + 3*npts] = x24*(x124 + x169 + x189 + x199); + basis_lapl_eval[ipt + 4*npts] = 0.125*x122 + 0.125*x171 + 0.125*x192; + basis_lapl_eval[ipt + 5*npts] = x37*(x125 + x168 + x189 + x200); + basis_lapl_eval[ipt + 6*npts] = x38*(x129 + x176 + x193); + basis_lapl_eval[ipt + 7*npts] = x40*(x195 + x203); + basis_lapl_eval[ipt + 8*npts] = x69*(x132 + x178 + x196); + + // Evaluate Laplacian gradient of bfn (dx) + basis_lapl_x_eval[ipt + 0*npts] = x1*(x*x204*x8 + 3.0*x116 + x185 + x197 + x206*x3 + x208*x3 + x210*x42); + basis_lapl_x_eval[ipt + 1*npts] = x11*(x*x217 + x*x218 + x13*x204 + x13*x216 + x211 + x212 + x214 + x215 + x216*x65); + basis_lapl_x_eval[ipt + 2*npts] = x15*(x*x204*x21 + 3.0*x113*x50 + x163*x50 + x183*x50 + x199 + x210*x78 + x219*x3 + x220*x3 + x221*x34 + x222); + basis_lapl_x_eval[ipt + 3*npts] = x24*(x*x223 + x*x224 - x211 - x212 - x214 - x215 + x216*x82 
+ x216*x99 + x225); + basis_lapl_x_eval[ipt + 4*npts] = 0.125*x*x229 + 0.125*x*x230 + 4.0*x104*x155 + 4.5*x113*x57 + 0.125*x121*x211 + 0.125*x142*x170 + 1.5*x163*x57 - 24.0*x18*x58 + 1.5*x183*x57 + 0.125*x191*x226 + 0.125*x204*x36 + 0.125*x227 + 0.125*x228*x86; + basis_lapl_x_eval[ipt + 5*npts] = x23*(x*x225 + 3.0*x113*x60 + x163*x60 + x183*x60 + x210*x99 + x223*x3 + x224*x3 - x231*x4 + x232 + x236); + basis_lapl_x_eval[ipt + 6*npts] = x38*(x*x239 + x*x240 + 12.0*x113*x64 + x127*x211 + x142*x174 + x144*x238*x8 + x158 + 4.0*x163*x64 + x18*x226 + 4.0*x183*x64 + x204*x39 + x237*x89); + basis_lapl_x_eval[ipt + 7*npts] = x10*(x*x204*x42 + x209*x67 + x236 + x241 + 3.0*x242 + x243*x3 + x244*x3 + x245); + basis_lapl_x_eval[ipt + 8*npts] = x69*(x*x246 + x*x247 + 12.0*x113*x70 + x142*x177 + 4.0*x163*x70 + 4.0*x183*x70 + x204*x43 + x211*x8 - x227 + x237*x92); + // Evaluate Laplacian gradient of bfn (dy) + basis_lapl_y_eval[ipt + 0*npts] = x71*(x13*x251 + x194 + 3.0*x201 + x202 + x208*x5 + x248*x8*y + x250*x5); + basis_lapl_y_eval[ipt + 1*npts] = x10*(x13*x248*y + x200 + x209*x74 + x218*x5 + x234 + 3.0*x241 + x242 + x245 + x252*x5); + basis_lapl_y_eval[ipt + 2*npts] = x76*(x113*x78 + 3.0*x163*x78 + x183*x78 + x200 + x21*x248*y + x220*x5 + x221*x35 + x222 + x251*x50 + x253*x5); + basis_lapl_y_eval[ipt + 3*npts] = x23*(x113*x82 + 3.0*x163*x82 + x183*x82 + x224*x5 - x231*x6 + x235 + x251*x99 + x254*y + x255*x5 + x256); + basis_lapl_y_eval[ipt + 4*npts] = 4.0*x104*x181 + 1.5*x113*x86 + 0.125*x121*x143 + 4.5*x163*x86 + 0.125*x170*x259 - 24.0*x18*x87 + 1.5*x183*x86 + 0.125*x191*x257 + 0.125*x228*x57 + 0.125*x230*y + 0.125*x248*x36 + 0.125*x258 + 0.125*x260*y; + basis_lapl_y_eval[ipt + 5*npts] = x37*(x224*y + x254 + x255*y + x261*x60 + x261*x99 + x263); + basis_lapl_y_eval[ipt + 6*npts] = x38*(4.0*x113*x89 + x127*x143 + 12.0*x163*x89 + x174*x259 - x18*x257 + x180*x238*x8 + x182 + 4.0*x183*x89 + x237*x64 + x240*y + x248*x39 + x264*y); + basis_lapl_y_eval[ipt + 7*npts] = x40*(x244*y + x248*x42 + x261*x42 + x261*x65 + x263 + x265*y); + basis_lapl_y_eval[ipt + 8*npts] = x69*(4.0*x113*x92 + 12.0*x163*x92 + x177*x259 + x182 + 4.0*x183*x92 + x237*x70 + x247*y + x248*x43 - x258 + x266*y); + // Evaluate Laplacian gradient of bfn (dz) + basis_lapl_z_eval[ipt + 0*npts] = x2*(x13*x267 + x206*z + x250*z + x267*x42 + x268*x8); + basis_lapl_z_eval[ipt + 1*npts] = x94*(x13*x268*z + x17*x217 + x17*x252 + 3.0*x185 + x198 + x270 + x271); + basis_lapl_z_eval[ipt + 2*npts] = x16*(72.0*x105 + x113*x272 + x163*x272 + x183*x273 + x21*x268 + x219*z + x253*z + x267*x50 + x267*x78); + basis_lapl_z_eval[ipt + 3*npts] = x96*(x256 + x269*x82 - x271 + x274); + basis_lapl_z_eval[ipt + 4*npts] = 2.0*x104*x113 + 2.0*x104*x163 + 6.0*x104*x183 + 18.0*x105*x191 + 0.125*x121*x154 + 0.125*x154*x170 + 3.0*x155*x57 + 3.0*x181*x86 + 0.125*x229*z + 0.125*x260*z + 0.125*x268*x36 - 0.125*x275*x4 - 0.125*x275*x6; + basis_lapl_z_eval[ipt + 5*npts] = x106*(x200 + x232 + x269*x60 + x274 + x276); + basis_lapl_z_eval[ipt + 6*npts] = x38*(36.0*x105*x8 + x114*x272 + x127*x154 + x154*x174 + x164*x272 + x184*x273 + x239*z + x264*z + x268*x39 + x277*x4 - x277*x6 + x278*x89 + x279*x64); + basis_lapl_z_eval[ipt + 7*npts] = x109*(x17*x243 + x17*x265 + 3.0*x194 + x203 + x268*x42*z + x270 + x276); + basis_lapl_z_eval[ipt + 8*npts] = x69*(x154*x177 + x154*x8 + x246*z + x266*z + x268*x43 + x278*x92 + x279*x70); + + + + +#if 0 + // Evaluate the angular part of bfn + + + + double ang_eval_0; + double ang_eval_1; + double ang_eval_2; + double ang_eval_3; + + + ang_eval_0 = 
radial_eval*x2*x8; + ang_eval_1 = radial_eval*x11*x13; + ang_eval_2 = radial_eval*x16*x21; + ang_eval_3 = radial_eval*x24*x28; + basis_eval[ipt + 0*npts] = ang_eval_0; + basis_eval[ipt + 1*npts] = ang_eval_1; + basis_eval[ipt + 2*npts] = ang_eval_2; + basis_eval[ipt + 3*npts] = ang_eval_3; + + ang_eval_0 = x29*x36; + ang_eval_1 = radial_eval*x28*x37; + ang_eval_2 = radial_eval*x38*x39; + ang_eval_3 = radial_eval*x40*x42; + basis_eval[ipt + 4*npts] = ang_eval_0; + basis_eval[ipt + 5*npts] = ang_eval_1; + basis_eval[ipt + 6*npts] = ang_eval_2; + basis_eval[ipt + 7*npts] = ang_eval_3; + + ang_eval_0 = sqrt_35*x29*x43; + basis_eval[ipt + 8*npts] = ang_eval_0; + + + double dang_eval_x_0, dang_eval_y_0, dang_eval_z_0; + double dang_eval_x_1, dang_eval_y_1, dang_eval_z_1; + double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; + double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; + + dang_eval_x_0 = x1*(radial_eval_alpha*x45 + x44); + dang_eval_y_0 = x71*(radial_eval_alpha*x73 + x72); + dang_eval_z_0 = x58*x8*x93; + dang_eval_x_1 = x46*x49; + dang_eval_y_1 = x10*x75; + dang_eval_z_1 = x94*(radial_eval_alpha*x95 + x44); + dang_eval_x_2 = x15*(radial_eval*x50 + radial_eval_alpha*x51); + dang_eval_y_2 = x76*(radial_eval*x78 + radial_eval_alpha*x79); + dang_eval_z_2 = x16*z*(radial_eval_alpha*x21 + x54); + dang_eval_x_3 = x53; + dang_eval_y_3 = x23*(radial_eval*x82 + radial_eval_alpha*x83); + dang_eval_z_3 = x101*x96; + basis_x_eval[ipt + 0*npts] = dang_eval_x_0; + basis_y_eval[ipt + 0*npts] = dang_eval_y_0; + basis_z_eval[ipt + 0*npts] = dang_eval_z_0; + basis_x_eval[ipt + 1*npts] = dang_eval_x_1; + basis_y_eval[ipt + 1*npts] = dang_eval_y_1; + basis_z_eval[ipt + 1*npts] = dang_eval_z_1; + basis_x_eval[ipt + 2*npts] = dang_eval_x_2; + basis_y_eval[ipt + 2*npts] = dang_eval_y_2; + basis_z_eval[ipt + 2*npts] = dang_eval_z_2; + basis_x_eval[ipt + 3*npts] = dang_eval_x_3; + basis_y_eval[ipt + 3*npts] = dang_eval_y_3; + basis_z_eval[ipt + 3*npts] = dang_eval_z_3; + + dang_eval_x_0 = 0.125*x36*x58 + 0.125*x54*x57; + dang_eval_y_0 = 0.125*x36*x87 + 0.125*x54*x86; + dang_eval_z_0 = 2.0*radial_eval*x104 + 0.125*x105*x36; + dang_eval_x_1 = x23*(radial_eval*x60 + radial_eval_alpha*x61); + dang_eval_y_1 = x53; + dang_eval_z_1 = x101*x106; + dang_eval_x_2 = x38*(x39*x58 + x62*x64); + dang_eval_y_2 = x38*(x39*x87 + x62*x89); + dang_eval_z_2 = x107*(radial_eval_alpha*x39 + x108); + dang_eval_x_3 = x10*x68; + dang_eval_y_3 = x46*x91; + dang_eval_z_3 = x109*(radial_eval_alpha*x110 + x72); + basis_x_eval[ipt + 4*npts] = dang_eval_x_0; + basis_y_eval[ipt + 4*npts] = dang_eval_y_0; + basis_z_eval[ipt + 4*npts] = dang_eval_z_0; + basis_x_eval[ipt + 5*npts] = dang_eval_x_1; + basis_y_eval[ipt + 5*npts] = dang_eval_y_1; + basis_z_eval[ipt + 5*npts] = dang_eval_z_1; + basis_x_eval[ipt + 6*npts] = dang_eval_x_2; + basis_y_eval[ipt + 6*npts] = dang_eval_y_2; + basis_z_eval[ipt + 6*npts] = dang_eval_z_2; + basis_x_eval[ipt + 7*npts] = dang_eval_x_3; + basis_y_eval[ipt + 7*npts] = dang_eval_y_3; + basis_z_eval[ipt + 7*npts] = dang_eval_z_3; + + dang_eval_x_0 = x69*(x43*x58 + x62*x70); + dang_eval_y_0 = x69*(x43*x87 + x62*x92); + dang_eval_z_0 = x105*x43*x69; + basis_x_eval[ipt + 8*npts] = dang_eval_x_0; + basis_y_eval[ipt + 8*npts] = dang_eval_y_0; + basis_z_eval[ipt + 8*npts] = dang_eval_z_0; + +#endif + } // Loop over points within task + } // Loop over tasks + + } // Loop over shells +} // end kernel + +} // namespace GauXC diff --git 
a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_laplacian.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_laplacian.hpp index 6f129915..f5b3c77c 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_laplacian.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/collocation_shell_to_task_kernels_spherical_l4_laplacian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -15,15 +19,15 @@ namespace GauXC { -__global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel_spherical_laplacian_4( +__global__ __launch_bounds__(128,2) void collocation_device_shell_to_task_kernel_spherical_laplacian_4( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks ) { - __shared__ double alpha[16][detail::shell_nprim_max + 1]; - __shared__ double coeff[16][detail::shell_nprim_max + 1]; + __shared__ double alpha[4][detail::shell_nprim_max + 1]; + __shared__ double coeff[4][detail::shell_nprim_max + 1]; double* my_alpha = alpha[threadIdx.x/32]; double* my_coeff = coeff[threadIdx.x/32]; @@ -66,7 +70,6 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel auto* __restrict__ basis_x_eval = task->dbfx + shoff; auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; - auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; // Loop over points in task @@ -103,64 +106,243 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel radial_eval_alpha *= -2; radial_eval_alpha_squared *= 4; - + // Common Subexpressions + const auto x0 = 0.5*sqrt_35; + const auto x1 = x0*y; + const auto x2 = x*x1; + const auto x3 = x*x; + const auto x4 = x3; + const auto x5 = y*y; + const auto x6 = x5; + const auto x7 = -x6; + const auto x8 = x4 + x7; + const auto x9 = 0.25*sqrt_70; + const auto x10 = x9*z; + const auto x11 = x10*y; + const auto x12 = 3.0*x4; + const auto x13 = x12 + x7; + const auto x14 = 0.5*sqrt_5; + const auto x15 = x14*y; + const auto x16 = x*x15; + const auto x17 = z*z; + const auto x18 = x17; + const auto x19 = -6.0*x18; + const auto x20 = x19 + x6; + const auto x21 = -x20 - x4; + const auto x22 = 0.25*sqrt_10; + const auto x23 = x22*z; + const auto x24 = x23*y; + const auto x25 = -4.0*x18; + const auto x26 = 3.0*x6; + const auto x27 = x25 + x26; + const auto x28 = -x12 - x27; + const auto x29 = 0.125*radial_eval; + const auto x30 = x*x*x*x; + const auto x31 = y*y*y*y; + const auto x32 = 6.0*x4*x6; + const auto x33 = x18*x4; + const auto x34 = x18*x6; + const auto x35 = 3.0*x30 + 3.0*x31 + x32 - 24.0*x33 - 24.0*x34 + 8.0*(z*z*z*z); + const auto x36 = x*x23; + const auto x37 = 0.25*sqrt_5; + const auto x38 = -x30 + x31 + 6.0*x33 - 6.0*x34; + const auto x39 = x*x10; + const auto x40 = -x26; + const auto x41 = x4 + x40; + const auto x42 = x30 + x31 - x32; + const auto x43 = radial_eval*x13; + const auto x44 = x4*x8; + const auto x45 = x*x11; 
+ const auto x46 = 6.0*radial_eval; + const auto x47 = radial_eval_alpha*x13; + const auto x48 = x46 + x47; + const auto x49 = -x12 - x20; + const auto x50 = x21*x4; + const auto x51 = -x46; + const auto x52 = x*x24*(radial_eval_alpha*x28 + x51); + const auto x53 = 12.0*radial_eval; + const auto x54 = x*x*x; + const auto x55 = 4.0*x; + const auto x56 = x*x6 - x18*x55 + x54; + const auto x57 = radial_eval_alpha*x; + const auto x58 = 9.0*x4; + const auto x59 = -x27 - x58; + const auto x60 = x28*x4; + const auto x61 = 4.0*radial_eval; + const auto x62 = 3.0*x; + const auto x63 = x18*x62 - x54; + const auto x64 = x12 + x40; + const auto x65 = radial_eval*x64; + const auto x66 = x4*x41; + const auto x67 = radial_eval_alpha*x66 + x65; + const auto x68 = 0.125*sqrt_35; + const auto x69 = x54 - x6*x62; + const auto x70 = x*x0; + const auto x71 = radial_eval*x41; + const auto x72 = x6*x8; + const auto x73 = x13*x6; + const auto x74 = radial_eval_alpha*x73 + x65; + const auto x75 = x*x14; + const auto x76 = x19 + x26; + const auto x77 = -x4 - x76; + const auto x78 = x21*x6; + const auto x79 = 9.0*x6; + const auto x80 = x12 + x25; + const auto x81 = -x79 - x80; + const auto x82 = x28*x6; + const auto x83 = y*y*y; + const auto x84 = 4.0*y; + const auto x85 = -x18*x84 + x4*y + x83; + const auto x86 = radial_eval_alpha*y; + const auto x87 = 3.0*y; + const auto x88 = -x18*x87 + x83; + const auto x89 = radial_eval_alpha*x41; + const auto x90 = x51 + x89; + const auto x91 = -x4*x87 + x83; + const auto x92 = x1*z; + const auto x93 = x9*y; + const auto x94 = x13*x18; + const auto x95 = x22*y; + const auto x96 = -12.0*x18; + const auto x97 = x26 + x96; + const auto x98 = -x12 - x97; + const auto x99 = x18*x28; + const auto x100 = radial_eval*x98 + radial_eval_alpha*x99; + const auto x101 = 3.0*z; + const auto x102 = -x101*x4 - x101*x6 + 2.0*(z*z*z); + const auto x103 = radial_eval_alpha*z; + const auto x104 = x37*z; + const auto x105 = x53*x8; + const auto x106 = x18*x41; + const auto x107 = 2.0*radial_eval_alpha; + const auto x108 = x107*x13; + const auto x109 = radial_eval_alpha + radial_eval_alpha_squared*x4; + const auto x110 = x108 + x109*x8; + const auto x111 = x109*x13; + const auto x112 = 12.0*radial_eval_alpha; + const auto x113 = x112*x4; + const auto x114 = x113 + x46; + const auto x115 = x107*x49 + x109*x21; + const auto x116 = x109*x35 + x53*(x6 + x80) + 24.0*x56*x57; + const auto x117 = -18.0*radial_eval; + const auto x118 = x109*x28; + const auto x119 = x107*x59 + x118; + const auto x120 = -x4; + const auto x121 = 8.0*x57; + const auto x122 = x109*x38 + x121*x63 + x53*(x120 + x18); + const auto x123 = x107*x64; + const auto x124 = x109*x41 + x123; + const auto x125 = x105 + x109*x42 + x121*x69; + const auto x126 = radial_eval_alpha*x3; + const auto x127 = radial_eval_alpha*x5; + const auto x128 = 6.0*radial_eval_alpha; + const auto x129 = x128*x6; + const auto x130 = radial_eval_alpha*x64; + const auto x131 = 24.0*radial_eval; + const auto x132 = x*x131; + const auto x133 = x132*y; + const auto x134 = 12.0*x57; + const auto x135 = 12.0*x86; + const auto x136 = radial_eval_alpha_squared*x; + const auto x137 = x136*y; + const auto x138 = -x128*x4 + x51; + const auto x139 = radial_eval_alpha*x55; + const auto x140 = radial_eval_alpha*x84; + const auto x141 = x*x93; + const auto x142 = x128*x18; + const auto x143 = -x142; + const auto x144 = x*x95*(radial_eval_alpha*x98 + radial_eval_alpha_squared*x99 + x143 + x51); + const auto x145 = 96.0*radial_eval*z; + const auto x146 = 12.0*x103; + const auto 
x147 = radial_eval_alpha*x17; + const auto x148 = 4.0*radial_eval_alpha; + const auto x149 = x147*x64; + const auto x150 = x68*z; + const auto x151 = x107*x41; + const auto x152 = radial_eval_alpha + radial_eval_alpha_squared*x6; + const auto x153 = x151 + x152*x8; + const auto x154 = x123 + x13*x152; + const auto x155 = x107*x77 + x152*x21; + const auto x156 = x152*x28; + const auto x157 = x107*x81 + x156; + const auto x158 = x152*x35 + x53*(x27 + x4) + 24.0*x85*x86; + const auto x159 = x112*x6; + const auto x160 = x159 + x46; + const auto x161 = 8.0*x86; + const auto x162 = x152*x38 + x161*x88 - x53*(x18 - x6); + const auto x163 = x152*x42 + x161*x91 + x53*(x120 + x6); + const auto x164 = radial_eval_alpha_squared*y; + const auto x165 = radial_eval_alpha + radial_eval_alpha_squared*x18; + const auto x166 = x165*x8; + const auto x167 = x108 + x13*x165; + const auto x168 = 24.0*radial_eval_alpha*x18 + x165*x21; + const auto x169 = x107*x98 + x165*x28; + const auto x170 = x131 + x169; + const auto x171 = -48.0*radial_eval*(-2.0*x18 + x4 + x6) + 32.0*x102*x103 + x165*x35; + const auto x172 = x105 + 24.0*x147*x8 + x165*x38; + const auto x173 = x151 + x165*x41; + const auto x174 = x165*x42; + const auto x175 = -x159; + // Evaluate basis function - basis_eval[ipt + 0*npts] = sqrt_35*radial_eval*x*y*(x*x - y*y)/2; - basis_eval[ipt + 1*npts] = sqrt_70*radial_eval*y*z*(3*x*x - y*y)/4; - basis_eval[ipt + 2*npts] = sqrt_5*radial_eval*x*y*(-x*x - y*y + 6*z*z)/2; - basis_eval[ipt + 3*npts] = sqrt_10*radial_eval*y*z*(-3*x*x - 3*y*y + 4*z*z)/4; - basis_eval[ipt + 4*npts] = radial_eval*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; - basis_eval[ipt + 5*npts] = sqrt_10*radial_eval*x*z*(-3*x*x - 3*y*y + 4*z*z)/4; - basis_eval[ipt + 6*npts] = sqrt_5*radial_eval*(-x*x*x*x + 6*x*x*z*z + y*y*y*y - 6*y*y*z*z)/4; - basis_eval[ipt + 7*npts] = sqrt_70*radial_eval*x*z*(x*x - 3*y*y)/4; - basis_eval[ipt + 8*npts] = sqrt_35*radial_eval*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + basis_eval[ipt + 0*npts] = radial_eval*x2*x8; + basis_eval[ipt + 1*npts] = radial_eval*x11*x13; + basis_eval[ipt + 2*npts] = radial_eval*x16*x21; + basis_eval[ipt + 3*npts] = radial_eval*x24*x28; + basis_eval[ipt + 4*npts] = x29*x35; + basis_eval[ipt + 5*npts] = radial_eval*x28*x36; + basis_eval[ipt + 6*npts] = radial_eval*x37*x38; + basis_eval[ipt + 7*npts] = radial_eval*x39*x41; + basis_eval[ipt + 8*npts] = sqrt_35*x29*x42; // Evaluate first derivative of bfn wrt x - basis_x_eval[ipt + 0*npts] = sqrt_35*y*(radial_eval*(3*x*x - y*y) + radial_eval_alpha*x*x*(x*x - y*y))/2; - basis_x_eval[ipt + 1*npts] = sqrt_70*x*y*z*(6*radial_eval + radial_eval_alpha*(3*x*x - y*y))/4; - basis_x_eval[ipt + 2*npts] = sqrt_5*y*(-radial_eval*(3*x*x + y*y - 6*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 6*z*z))/2; - basis_x_eval[ipt + 3*npts] = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; - basis_x_eval[ipt + 4*npts] = x*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; - basis_x_eval[ipt + 5*npts] = sqrt_10*z*(-radial_eval*(9*x*x + 3*y*y - 4*z*z) - radial_eval_alpha*x*x*(3*x*x + 3*y*y - 4*z*z))/4; - basis_x_eval[ipt + 6*npts] = sqrt_5*x*(-4*radial_eval*(x*x - 3*z*z) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - basis_x_eval[ipt + 7*npts] = sqrt_70*z*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y))/4; - basis_x_eval[ipt + 8*npts] = sqrt_35*x*(4*radial_eval*(x*x - 3*y*y) + 
radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + basis_x_eval[ipt + 0*npts] = x1*(radial_eval_alpha*x44 + x43); + basis_x_eval[ipt + 1*npts] = x45*x48; + basis_x_eval[ipt + 2*npts] = x15*(radial_eval*x49 + radial_eval_alpha*x50); + basis_x_eval[ipt + 3*npts] = x52; + basis_x_eval[ipt + 4*npts] = 0.125*x35*x57 + 0.125*x53*x56; + basis_x_eval[ipt + 5*npts] = x23*(radial_eval*x59 + radial_eval_alpha*x60); + basis_x_eval[ipt + 6*npts] = x37*(x38*x57 + x61*x63); + basis_x_eval[ipt + 7*npts] = x10*x67; + basis_x_eval[ipt + 8*npts] = x68*(x42*x57 + x61*x69); // Evaluate first derivative of bfn wrt y - basis_y_eval[ipt + 0*npts] = sqrt_35*x*(-radial_eval*(-x*x + 3*y*y) + radial_eval_alpha*y*y*(x*x - y*y))/2; - basis_y_eval[ipt + 1*npts] = sqrt_70*z*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y))/4; - basis_y_eval[ipt + 2*npts] = sqrt_5*x*(-radial_eval*(x*x + 3*y*y - 6*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 6*z*z))/2; - basis_y_eval[ipt + 3*npts] = sqrt_10*z*(-radial_eval*(3*x*x + 9*y*y - 4*z*z) - radial_eval_alpha*y*y*(3*x*x + 3*y*y - 4*z*z))/4; - basis_y_eval[ipt + 4*npts] = y*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; - basis_y_eval[ipt + 5*npts] = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; - basis_y_eval[ipt + 6*npts] = sqrt_5*y*(4*radial_eval*(y*y - 3*z*z) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - basis_y_eval[ipt + 7*npts] = sqrt_70*x*y*z*(-6*radial_eval + radial_eval_alpha*(x*x - 3*y*y))/4; - basis_y_eval[ipt + 8*npts] = sqrt_35*y*(-4*radial_eval*(3*x*x - y*y) + radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; + basis_y_eval[ipt + 0*npts] = x70*(radial_eval_alpha*x72 + x71); + basis_y_eval[ipt + 1*npts] = x10*x74; + basis_y_eval[ipt + 2*npts] = x75*(radial_eval*x77 + radial_eval_alpha*x78); + basis_y_eval[ipt + 3*npts] = x23*(radial_eval*x81 + radial_eval_alpha*x82); + basis_y_eval[ipt + 4*npts] = 0.125*x35*x86 + 0.125*x53*x85; + basis_y_eval[ipt + 5*npts] = x52; + basis_y_eval[ipt + 6*npts] = x37*(x38*x86 + x61*x88); + basis_y_eval[ipt + 7*npts] = x45*x90; + basis_y_eval[ipt + 8*npts] = x68*(x42*x86 + x61*x91); // Evaluate first derivative of bfn wrt z - basis_z_eval[ipt + 0*npts] = sqrt_35*radial_eval_alpha*x*y*z*(x*x - y*y)/2; - basis_z_eval[ipt + 1*npts] = sqrt_70*y*(radial_eval + radial_eval_alpha*z*z)*(3*x*x - y*y)/4; - basis_z_eval[ipt + 2*npts] = sqrt_5*x*y*z*(12*radial_eval - radial_eval_alpha*(x*x + y*y - 6*z*z))/2; - basis_z_eval[ipt + 3*npts] = sqrt_10*y*(3*radial_eval*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; - basis_z_eval[ipt + 4*npts] = z*(-16*radial_eval*(3*x*x + 3*y*y - 2*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; - basis_z_eval[ipt + 5*npts] = sqrt_10*x*(3*radial_eval*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; - basis_z_eval[ipt + 6*npts] = sqrt_5*z*(12*radial_eval*(x*x - y*y) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - basis_z_eval[ipt + 7*npts] = sqrt_70*x*(radial_eval + radial_eval_alpha*z*z)*(x*x - 3*y*y)/4; - basis_z_eval[ipt + 8*npts] = sqrt_35*radial_eval_alpha*z*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + basis_z_eval[ipt + 0*npts] = x57*x8*x92; + basis_z_eval[ipt + 1*npts] = x93*(radial_eval_alpha*x94 + x43); + basis_z_eval[ipt + 2*npts] = x16*z*(radial_eval_alpha*x21 + x53); + basis_z_eval[ipt + 3*npts] = x100*x95; + basis_z_eval[ipt + 
4*npts] = 2.0*radial_eval*x102 + 0.125*x103*x35; + basis_z_eval[ipt + 5*npts] = x*x100*x22; + basis_z_eval[ipt + 6*npts] = x104*(radial_eval_alpha*x38 + x105); + basis_z_eval[ipt + 7*npts] = x*x9*(radial_eval_alpha*x106 + x71); + basis_z_eval[ipt + 8*npts] = x103*x42*x68; + // Evaluate Laplacian of bfn - basis_lapl_eval[ipt + 0*npts] = sqrt_35*x*y*(11*radial_eval_alpha*x*x - 11*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*x*x + radial_eval_alpha_squared*x*x*z*z - radial_eval_alpha_squared*y*y*y*y - radial_eval_alpha_squared*y*y*z*z)/2; - basis_lapl_eval[ipt + 1*npts] = sqrt_70*y*z*(33*radial_eval_alpha*x*x - 11*radial_eval_alpha*y*y + 3*radial_eval_alpha_squared*x*x*x*x + 2*radial_eval_alpha_squared*x*x*y*y + 3*radial_eval_alpha_squared*x*x*z*z - radial_eval_alpha_squared*y*y*y*y - radial_eval_alpha_squared*y*y*z*z)/4; - basis_lapl_eval[ipt + 2*npts] = sqrt_5*x*y*(-11*radial_eval_alpha*x*x - 11*radial_eval_alpha*y*y + 66*radial_eval_alpha*z*z - radial_eval_alpha_squared*x*x*x*x - 2*radial_eval_alpha_squared*x*x*y*y + 5*radial_eval_alpha_squared*x*x*z*z - radial_eval_alpha_squared*y*y*y*y + 5*radial_eval_alpha_squared*y*y*z*z + 6*radial_eval_alpha_squared*z*z*z*z)/2; - basis_lapl_eval[ipt + 3*npts] = sqrt_10*y*z*(-33*radial_eval_alpha*x*x - 33*radial_eval_alpha*y*y + 44*radial_eval_alpha*z*z - 3*radial_eval_alpha_squared*x*x*x*x - 6*radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*x*x*z*z - 3*radial_eval_alpha_squared*y*y*y*y + radial_eval_alpha_squared*y*y*z*z + 4*radial_eval_alpha_squared*z*z*z*z)/4; - basis_lapl_eval[ipt + 4*npts] = 33*radial_eval_alpha*x*x*x*x/8 + 33*radial_eval_alpha*x*x*y*y/4 - 33*radial_eval_alpha*x*x*z*z + 33*radial_eval_alpha*y*y*y*y/8 - 33*radial_eval_alpha*y*y*z*z + 11*radial_eval_alpha*z*z*z*z + 3*radial_eval_alpha_squared*x*x*x*x*x*x/8 + 9*radial_eval_alpha_squared*x*x*x*x*y*y/8 - 21*radial_eval_alpha_squared*x*x*x*x*z*z/8 + 9*radial_eval_alpha_squared*x*x*y*y*y*y/8 - 21*radial_eval_alpha_squared*x*x*y*y*z*z/4 - 2*radial_eval_alpha_squared*x*x*z*z*z*z + 3*radial_eval_alpha_squared*y*y*y*y*y*y/8 - 21*radial_eval_alpha_squared*y*y*y*y*z*z/8 - 2*radial_eval_alpha_squared*y*y*z*z*z*z + radial_eval_alpha_squared*z*z*z*z*z*z; - basis_lapl_eval[ipt + 5*npts] = sqrt_10*x*z*(-33*radial_eval_alpha*x*x - 33*radial_eval_alpha*y*y + 44*radial_eval_alpha*z*z - 3*radial_eval_alpha_squared*x*x*x*x - 6*radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*x*x*z*z - 3*radial_eval_alpha_squared*y*y*y*y + radial_eval_alpha_squared*y*y*z*z + 4*radial_eval_alpha_squared*z*z*z*z)/4; - basis_lapl_eval[ipt + 6*npts] = sqrt_5*(-11*radial_eval_alpha*x*x*x*x + 66*radial_eval_alpha*x*x*z*z + 11*radial_eval_alpha*y*y*y*y - 66*radial_eval_alpha*y*y*z*z - radial_eval_alpha_squared*x*x*x*x*x*x - radial_eval_alpha_squared*x*x*x*x*y*y + 5*radial_eval_alpha_squared*x*x*x*x*z*z + radial_eval_alpha_squared*x*x*y*y*y*y + 6*radial_eval_alpha_squared*x*x*z*z*z*z + radial_eval_alpha_squared*y*y*y*y*y*y - 5*radial_eval_alpha_squared*y*y*y*y*z*z - 6*radial_eval_alpha_squared*y*y*z*z*z*z)/4; - basis_lapl_eval[ipt + 7*npts] = sqrt_70*x*z*(11*radial_eval_alpha*x*x - 33*radial_eval_alpha*y*y + radial_eval_alpha_squared*x*x*x*x - 2*radial_eval_alpha_squared*x*x*y*y + radial_eval_alpha_squared*x*x*z*z - 3*radial_eval_alpha_squared*y*y*y*y - 3*radial_eval_alpha_squared*y*y*z*z)/4; - basis_lapl_eval[ipt + 8*npts] = sqrt_35*(11*radial_eval_alpha*x*x*x*x - 66*radial_eval_alpha*x*x*y*y + 11*radial_eval_alpha*y*y*y*y + radial_eval_alpha_squared*x*x*x*x*x*x - 
5*radial_eval_alpha_squared*x*x*x*x*y*y + radial_eval_alpha_squared*x*x*x*x*z*z - 5*radial_eval_alpha_squared*x*x*y*y*y*y - 6*radial_eval_alpha_squared*x*x*y*y*z*z + radial_eval_alpha_squared*y*y*y*y*y*y + radial_eval_alpha_squared*y*y*y*y*z*z)/8; + basis_lapl_eval[ipt + 0*npts] = x2*(x110 + x153 + x166); + basis_lapl_eval[ipt + 1*npts] = x11*(x111 + x113 + x154 + x167); + basis_lapl_eval[ipt + 2*npts] = x16*(x115 + x155 + x168); + basis_lapl_eval[ipt + 3*npts] = x24*(-x113 + x118 + x157 + x169); + basis_lapl_eval[ipt + 4*npts] = 0.125*x116 + 0.125*x158 + 0.125*x171; + basis_lapl_eval[ipt + 5*npts] = x36*(x119 + x156 + x169 + x175); + basis_lapl_eval[ipt + 6*npts] = x37*(x122 + x162 + x172); + basis_lapl_eval[ipt + 7*npts] = x39*(x124 + x152*x41 + x173 + x175); + basis_lapl_eval[ipt + 8*npts] = x68*(x125 + x163 + x174); + @@ -176,25 +358,25 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double ang_eval_3; - ang_eval_0 = sqrt_35*radial_eval*x*y*(x*x - y*y)/2; - ang_eval_1 = sqrt_70*radial_eval*y*z*(3*x*x - y*y)/4; - ang_eval_2 = sqrt_5*radial_eval*x*y*(-x*x - y*y + 6*z*z)/2; - ang_eval_3 = sqrt_10*radial_eval*y*z*(-3*x*x - 3*y*y + 4*z*z)/4; + ang_eval_0 = radial_eval*x2*x8; + ang_eval_1 = radial_eval*x11*x13; + ang_eval_2 = radial_eval*x16*x21; + ang_eval_3 = radial_eval*x24*x28; basis_eval[ipt + 0*npts] = ang_eval_0; basis_eval[ipt + 1*npts] = ang_eval_1; basis_eval[ipt + 2*npts] = ang_eval_2; basis_eval[ipt + 3*npts] = ang_eval_3; - ang_eval_0 = radial_eval*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z)/8; - ang_eval_1 = sqrt_10*radial_eval*x*z*(-3*x*x - 3*y*y + 4*z*z)/4; - ang_eval_2 = sqrt_5*radial_eval*(-x*x*x*x + 6*x*x*z*z + y*y*y*y - 6*y*y*z*z)/4; - ang_eval_3 = sqrt_70*radial_eval*x*z*(x*x - 3*y*y)/4; + ang_eval_0 = x29*x35; + ang_eval_1 = radial_eval*x28*x36; + ang_eval_2 = radial_eval*x37*x38; + ang_eval_3 = radial_eval*x39*x41; basis_eval[ipt + 4*npts] = ang_eval_0; basis_eval[ipt + 5*npts] = ang_eval_1; basis_eval[ipt + 6*npts] = ang_eval_2; basis_eval[ipt + 7*npts] = ang_eval_3; - ang_eval_0 = sqrt_35*radial_eval*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + ang_eval_0 = sqrt_35*x29*x42; basis_eval[ipt + 8*npts] = ang_eval_0; @@ -203,18 +385,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel double dang_eval_x_2, dang_eval_y_2, dang_eval_z_2; double dang_eval_x_3, dang_eval_y_3, dang_eval_z_3; - dang_eval_x_0 = sqrt_35*y*(radial_eval*(3*x*x - y*y) + radial_eval_alpha*x*x*(x*x - y*y))/2; - dang_eval_y_0 = sqrt_35*x*(-radial_eval*(-x*x + 3*y*y) + radial_eval_alpha*y*y*(x*x - y*y))/2; - dang_eval_z_0 = sqrt_35*radial_eval_alpha*x*y*z*(x*x - y*y)/2; - dang_eval_x_1 = sqrt_70*x*y*z*(6*radial_eval + radial_eval_alpha*(3*x*x - y*y))/4; - dang_eval_y_1 = sqrt_70*z*(-3*radial_eval*(-x*x + y*y) + radial_eval_alpha*y*y*(3*x*x - y*y))/4; - dang_eval_z_1 = sqrt_70*y*(radial_eval + radial_eval_alpha*z*z)*(3*x*x - y*y)/4; - dang_eval_x_2 = sqrt_5*y*(-radial_eval*(3*x*x + y*y - 6*z*z) - radial_eval_alpha*x*x*(x*x + y*y - 6*z*z))/2; - dang_eval_y_2 = sqrt_5*x*(-radial_eval*(x*x + 3*y*y - 6*z*z) - radial_eval_alpha*y*y*(x*x + y*y - 6*z*z))/2; - dang_eval_z_2 = sqrt_5*x*y*z*(12*radial_eval - radial_eval_alpha*(x*x + y*y - 6*z*z))/2; - dang_eval_x_3 = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; - dang_eval_y_3 = sqrt_10*z*(-radial_eval*(3*x*x + 9*y*y - 4*z*z) - radial_eval_alpha*y*y*(3*x*x + 3*y*y - 4*z*z))/4; - dang_eval_z_3 = sqrt_10*y*(3*radial_eval*(-x*x - y*y + 4*z*z) - 
radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; + dang_eval_x_0 = x1*(radial_eval_alpha*x44 + x43); + dang_eval_y_0 = x70*(radial_eval_alpha*x72 + x71); + dang_eval_z_0 = x57*x8*x92; + dang_eval_x_1 = x45*x48; + dang_eval_y_1 = x10*x74; + dang_eval_z_1 = x93*(radial_eval_alpha*x94 + x43); + dang_eval_x_2 = x15*(radial_eval*x49 + radial_eval_alpha*x50); + dang_eval_y_2 = x75*(radial_eval*x77 + radial_eval_alpha*x78); + dang_eval_z_2 = x16*z*(radial_eval_alpha*x21 + x53); + dang_eval_x_3 = x52; + dang_eval_y_3 = x23*(radial_eval*x81 + radial_eval_alpha*x82); + dang_eval_z_3 = x100*x95; basis_x_eval[ipt + 0*npts] = dang_eval_x_0; basis_y_eval[ipt + 0*npts] = dang_eval_y_0; basis_z_eval[ipt + 0*npts] = dang_eval_z_0; @@ -228,18 +410,18 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 3*npts] = dang_eval_y_3; basis_z_eval[ipt + 3*npts] = dang_eval_z_3; - dang_eval_x_0 = x*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; - dang_eval_y_0 = y*(12*radial_eval*(x*x + y*y - 4*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; - dang_eval_z_0 = z*(-16*radial_eval*(3*x*x + 3*y*y - 2*z*z) + radial_eval_alpha*(3*x*x*x*x + 6*x*x*y*y - 24*x*x*z*z + 3*y*y*y*y - 24*y*y*z*z + 8*z*z*z*z))/8; - dang_eval_x_1 = sqrt_10*z*(-radial_eval*(9*x*x + 3*y*y - 4*z*z) - radial_eval_alpha*x*x*(3*x*x + 3*y*y - 4*z*z))/4; - dang_eval_y_1 = sqrt_10*x*y*z*(-6*radial_eval - radial_eval_alpha*(3*x*x + 3*y*y - 4*z*z))/4; - dang_eval_z_1 = sqrt_10*x*(3*radial_eval*(-x*x - y*y + 4*z*z) - radial_eval_alpha*z*z*(3*x*x + 3*y*y - 4*z*z))/4; - dang_eval_x_2 = sqrt_5*x*(-4*radial_eval*(x*x - 3*z*z) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - dang_eval_y_2 = sqrt_5*y*(4*radial_eval*(y*y - 3*z*z) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - dang_eval_z_2 = sqrt_5*z*(12*radial_eval*(x*x - y*y) - radial_eval_alpha*(x*x*x*x - 6*x*x*z*z - y*y*y*y + 6*y*y*z*z))/4; - dang_eval_x_3 = sqrt_70*z*(3*radial_eval*(x*x - y*y) + radial_eval_alpha*x*x*(x*x - 3*y*y))/4; - dang_eval_y_3 = sqrt_70*x*y*z*(-6*radial_eval + radial_eval_alpha*(x*x - 3*y*y))/4; - dang_eval_z_3 = sqrt_70*x*(radial_eval + radial_eval_alpha*z*z)*(x*x - 3*y*y)/4; + dang_eval_x_0 = 0.125*x35*x57 + 0.125*x53*x56; + dang_eval_y_0 = 0.125*x35*x86 + 0.125*x53*x85; + dang_eval_z_0 = 2.0*radial_eval*x102 + 0.125*x103*x35; + dang_eval_x_1 = x23*(radial_eval*x59 + radial_eval_alpha*x60); + dang_eval_y_1 = x52; + dang_eval_z_1 = x*x100*x22; + dang_eval_x_2 = x37*(x38*x57 + x61*x63); + dang_eval_y_2 = x37*(x38*x86 + x61*x88); + dang_eval_z_2 = x104*(radial_eval_alpha*x38 + x105); + dang_eval_x_3 = x10*x67; + dang_eval_y_3 = x45*x90; + dang_eval_z_3 = x*x9*(radial_eval_alpha*x106 + x71); basis_x_eval[ipt + 4*npts] = dang_eval_x_0; basis_y_eval[ipt + 4*npts] = dang_eval_y_0; basis_z_eval[ipt + 4*npts] = dang_eval_z_0; @@ -253,9 +435,9 @@ __global__ __launch_bounds__(512,2) void collocation_device_shell_to_task_kernel basis_y_eval[ipt + 7*npts] = dang_eval_y_3; basis_z_eval[ipt + 7*npts] = dang_eval_z_3; - dang_eval_x_0 = sqrt_35*x*(4*radial_eval*(x*x - 3*y*y) + radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; - dang_eval_y_0 = sqrt_35*y*(-4*radial_eval*(3*x*x - y*y) + radial_eval_alpha*(x*x*x*x - 6*x*x*y*y + y*y*y*y))/8; - dang_eval_z_0 = sqrt_35*radial_eval_alpha*z*(x*x*x*x - 6*x*x*y*y + y*y*y*y)/8; + dang_eval_x_0 = x68*(x42*x57 + x61*x69); + 
dang_eval_y_0 = x68*(x42*x86 + x61*x91); + dang_eval_z_0 = x103*x42*x68; basis_x_eval[ipt + 8*npts] = dang_eval_x_0; basis_y_eval[ipt + 8*npts] = dang_eval_y_0; basis_z_eval[ipt + 8*npts] = dang_eval_z_0; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/deprecated/gaueval_kernels_template.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/deprecated/gaueval_kernels_template.cu index 7f489cd1..d3380d60 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/deprecated/gaueval_kernels_template.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/deprecated/gaueval_kernels_template.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/scripts/generate_shell_to_task.py b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/scripts/generate_shell_to_task.py index 76a5d4b7..9bedd244 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/scripts/generate_shell_to_task.py +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/scripts/generate_shell_to_task.py @@ -3,6 +3,8 @@ from collocation_angular import generate_spherical_angular, generate_cartesian_angular, generate_cartesian_ls, generate_eval_lines import sympy import itertools +from sympy.printing import ccode +from sympy.codegen.rewriting import create_expand_pow_optimization from io import StringIO @@ -15,7 +17,7 @@ def generate_shell_to_task_lines( ang, deriv_order = 0 ): do_hess = bool(deriv_order > 1) [x,y,z,r] = sympy.symbols('x y z r', real=True) - [bf,bf_alpha,bf_alpha_sq] = sympy.symbols('radial_eval radial_eval_alpha radial_eval_alpha_squared',real=True) + [bf,bf_alpha,bf_alpha_sq,bf_alpha_cb] = sympy.symbols('radial_eval radial_eval_alpha radial_eval_alpha_squared radial_eval_alpha_cubed',real=True) bf_x = x * bf_alpha bf_y = y * bf_alpha bf_z = z * bf_alpha @@ -28,6 +30,17 @@ def generate_shell_to_task_lines( ang, deriv_order = 0 ): bf_xz = x*z*bf_alpha_sq bf_yz = y*z*bf_alpha_sq + bf_xxx = (x + x + x)*bf_alpha_sq + x*x*x*bf_alpha_cb + bf_xxy = (y + 0 + 0)*bf_alpha_sq + x*x*y*bf_alpha_cb + bf_xxz = (z + 0 + 0)*bf_alpha_sq + x*x*z*bf_alpha_cb + bf_yyx = (x + 0 + 0)*bf_alpha_sq + y*y*x*bf_alpha_cb + bf_yyy = (y + y + y)*bf_alpha_sq + y*y*y*bf_alpha_cb + bf_yyz = (z + 0 + 0)*bf_alpha_sq + y*y*z*bf_alpha_cb + bf_zzx = (x + 0 + 0)*bf_alpha_sq + z*z*x*bf_alpha_cb + bf_zzy = (y + 0 + 0)*bf_alpha_sq + z*z*y*bf_alpha_cb + bf_zzz = (z + z + z)*bf_alpha_sq + z*z*z*bf_alpha_cb + + bf_eval_strs = [] bf_x_eval_strs = [] bf_y_eval_strs = [] @@ -39,6 +52,9 @@ def generate_shell_to_task_lines( ang, deriv_order = 0 ): bf_yz_eval_strs = [] bf_zz_eval_strs = [] bf_lap_eval_strs = [] + bf_lap_x_eval_strs = [] + bf_lap_y_eval_strs = [] + bf_lap_z_eval_strs = [] for j in range(len(ang)): a = ang[j] a_x = sympy.diff( a, x ) @@ -52,25 +68,46 @@ def generate_shell_to_task_lines( ang, deriv_order = 0 ): a_yz = sympy.diff( a_y, z ) a_zz = sympy.diff( a_z, z ) - bf_eval = sympy.simplify( a * bf ) - bf_x_eval = sympy.simplify( a_x * bf + a * bf_x ) - bf_y_eval 
= sympy.simplify( a_y * bf + a * bf_y ) - bf_z_eval = sympy.simplify( a_z * bf + a * bf_z ) - - bf_xx_eval = sympy.simplify( a_xx * bf + 2 * a_x * bf_x + a * bf_xx ) - bf_yy_eval = sympy.simplify( a_yy * bf + 2 * a_y * bf_y + a * bf_yy ) - bf_zz_eval = sympy.simplify( a_zz * bf + 2 * a_z * bf_z + a * bf_zz ) + a_xxx = sympy.diff( a_xx, x ) + a_xxy = sympy.diff( a_xx, y ) + a_xxz = sympy.diff( a_xx, z ) + a_yyx = sympy.diff( a_yy, x ) + a_yyy = sympy.diff( a_yy, y ) + a_yyz = sympy.diff( a_yy, z ) + a_zzx = sympy.diff( a_zz, x ) + a_zzy = sympy.diff( a_zz, y ) + a_zzz = sympy.diff( a_zz, z ) + + bf_eval = a * bf + bf_x_eval = a_x * bf + a * bf_x + bf_y_eval = a_y * bf + a * bf_y + bf_z_eval = a_z * bf + a * bf_z + + bf_xx_eval = a_xx * bf + 2 * a_x * bf_x + a * bf_xx + bf_yy_eval = a_yy * bf + 2 * a_y * bf_y + a * bf_yy + bf_zz_eval = a_zz * bf + 2 * a_z * bf_z + a * bf_zz + + bf_lap_eval = bf_xx_eval + bf_yy_eval + bf_zz_eval + + bf_xy_eval = a_xy * bf + a_x * bf_y + a_y * bf_x + a * bf_xy + bf_xz_eval = a_xz * bf + a_x * bf_z + a_z * bf_x + a * bf_xz + bf_yz_eval = a_yz * bf + a_y * bf_z + a_z * bf_y + a * bf_yz + + bf_xxx_eval = a_xxx * bf + 3 * (a_xx * bf_x + a_x * bf_xx) + a * bf_xxx + bf_yyy_eval = a_yyy * bf + 3 * (a_yy * bf_y + a_y * bf_yy) + a * bf_yyy + bf_zzz_eval = a_zzz * bf + 3 * (a_zz * bf_z + a_z * bf_zz) + a * bf_zzz + + bf_xxy_eval = a_xxy * bf + 2*a_xy*bf_x + a_xx*bf_y + 2*bf_xy*a_x + bf_xx*a_y + a* bf_xxy + bf_xxz_eval = a_xxz * bf + 2*a_xz*bf_x + a_xx*bf_z + 2*bf_xz*a_x + bf_xx*a_z + a* bf_xxz + bf_yyx_eval = a_yyx * bf + 2*a_xy*bf_y + a_yy*bf_x + 2*bf_xy*a_y + bf_yy*a_x + a* bf_yyx + bf_yyz_eval = a_yyz * bf + 2*a_yz*bf_y + a_yy*bf_z + 2*bf_yz*a_y + bf_yy*a_z + a* bf_yyz + bf_zzx_eval = a_zzx * bf + 2*a_xz*bf_z + a_zz*bf_x + 2*bf_xz*a_z + bf_zz*a_x + a* bf_zzx + bf_zzy_eval = a_zzy * bf + 2*a_yz*bf_z + a_zz*bf_y + 2*bf_yz*a_z + bf_zz*a_y + a* bf_zzy + + bf_lap_x_eval = bf_xxx_eval + bf_yyx_eval + bf_zzx_eval + bf_lap_y_eval = bf_xxy_eval + bf_yyy_eval + bf_zzy_eval + bf_lap_z_eval = bf_xxz_eval + bf_yyz_eval + bf_zzz_eval - bf_lap_eval = sympy.simplify(bf_xx_eval + bf_yy_eval + bf_zz_eval) - - bf_xy_eval = sympy.simplify( a_xy * bf + a_x * bf_y + a_y * bf_x + a * bf_xy ) - bf_xz_eval = sympy.simplify( a_xz * bf + a_x * bf_z + a_z * bf_x + a * bf_xz ) - bf_yz_eval = sympy.simplify( a_yz * bf + a_y * bf_z + a_z * bf_y + a * bf_yz ) - - #bf_eval_str = 'ang_eval = {};'.format(bf_eval) - #bf_x_eval_str = 'dang_eval_x = {};'.format(bf_x_eval) - #bf_y_eval_str = 'dang_eval_y = {};'.format(bf_y_eval) - #bf_z_eval_str = 'dang_eval_z = {};'.format(bf_z_eval) bf_eval_str = '{}'.format(bf_eval ) bf_x_eval_str = '{}'.format(bf_x_eval) bf_y_eval_str = '{}'.format(bf_y_eval) @@ -85,26 +122,10 @@ def generate_shell_to_task_lines( ang, deriv_order = 0 ): bf_lap_eval_str = '{}'.format(bf_lap_eval) - for k in range(2,L+3): - for X in ('x','y','z'): - pow_str = X + '**' + str(k) - repl_str = '' - for K in range(k-1): repl_str = repl_str + X + '*' - repl_str = repl_str + X - - bf_eval_str = bf_eval_str.replace(pow_str,repl_str) - bf_x_eval_str = bf_x_eval_str.replace(pow_str,repl_str) - bf_y_eval_str = bf_y_eval_str.replace(pow_str,repl_str) - bf_z_eval_str = bf_z_eval_str.replace(pow_str,repl_str) - - bf_xx_eval_str = bf_xx_eval_str.replace(pow_str,repl_str) - bf_xy_eval_str = bf_xy_eval_str.replace(pow_str,repl_str) - bf_xz_eval_str = bf_xz_eval_str.replace(pow_str,repl_str) - bf_yy_eval_str = bf_yy_eval_str.replace(pow_str,repl_str) - bf_yz_eval_str = 
bf_yz_eval_str.replace(pow_str,repl_str) - bf_zz_eval_str = bf_zz_eval_str.replace(pow_str,repl_str) - - bf_lap_eval_str = bf_lap_eval_str.replace(pow_str,repl_str) + bf_lap_x_eval_str = '{}'.format(bf_lap_x_eval) + bf_lap_y_eval_str = '{}'.format(bf_lap_y_eval) + bf_lap_z_eval_str = '{}'.format(bf_lap_z_eval) + bf_eval_strs.append( bf_eval_str ) bf_x_eval_strs.append( bf_x_eval_str ) bf_y_eval_strs.append( bf_y_eval_str ) @@ -118,11 +139,16 @@ def generate_shell_to_task_lines( ang, deriv_order = 0 ): bf_zz_eval_strs.append( bf_zz_eval_str ) bf_lap_eval_strs.append( bf_lap_eval_str ) + bf_lap_x_eval_strs.append( bf_lap_x_eval_str ) + bf_lap_y_eval_strs.append( bf_lap_y_eval_str ) + bf_lap_z_eval_strs.append( bf_lap_z_eval_str ) if deriv_order == 0: return bf_eval_strs elif deriv_order == 1: return [bf_x_eval_strs, bf_y_eval_strs, bf_z_eval_strs] elif deriv_order == 2: return [bf_xx_eval_strs, bf_xy_eval_strs, bf_xz_eval_strs, bf_yy_eval_strs, bf_yz_eval_strs, bf_zz_eval_strs, bf_lap_eval_strs] + elif deriv_order == 3: + return [bf_lap_x_eval_strs, bf_lap_y_eval_strs, bf_lap_z_eval_strs] @@ -131,7 +157,7 @@ def get_constant_lines( lines ): constant_lines = [] # Sqrts - sqrt_regex = "sqrt\([0-9]+\)" + sqrt_regex = 'sqrt\([0-9]+\)' sqrt_finds = list(set(re.findall( sqrt_regex, "\n".join(lines) ))) # Replace locally @@ -146,7 +172,7 @@ def get_constant_lines( lines ): def sanitize_constants( lines ): # Sqrts - sqrt_regex = "sqrt\([0-9]+\)" + sqrt_regex = 'sqrt\([0-9]+\)' sqrt_finds = list(set(re.findall( sqrt_regex, "\n".join(lines) ))) for x in sqrt_finds: @@ -174,6 +200,9 @@ def sanitize_constants( lines ): cart_bfyz_lines = [] cart_bfzz_lines = [] cart_bflap_lines = [] +cart_bflap_x_lines = [] +cart_bflap_y_lines = [] +cart_bflap_z_lines = [] sph_bfxx_lines = [] sph_bfxy_lines = [] sph_bfxz_lines = [] @@ -181,6 +210,9 @@ def sanitize_constants( lines ): sph_bfyz_lines = [] sph_bfzz_lines = [] sph_bflap_lines = [] +sph_bflap_x_lines = [] +sph_bflap_y_lines = [] +sph_bflap_z_lines = [] for L in range( L_max + 1 ): print("Workding on L = ", L) @@ -219,6 +251,16 @@ def sanitize_constants( lines ): sph_bfzz_lines.append(bfzz) sph_bflap_lines.append(bflap) + [bflap_x, bflap_y, bflap_z] = generate_shell_to_task_lines(cart_ang,3) + cart_bflap_x_lines.append(bflap_x) + cart_bflap_y_lines.append(bflap_y) + cart_bflap_z_lines.append(bflap_z) + + [bflap_x, bflap_y, bflap_z] = generate_shell_to_task_lines(sph_ang,3) + sph_bflap_x_lines.append(bflap_x) + sph_bflap_y_lines.append(bflap_y) + sph_bflap_z_lines.append(bflap_z) + constant_lines = [] for lines in itertools.chain( cart_bf_lines, sph_bf_lines ): @@ -227,60 +269,49 @@ def sanitize_constants( lines ): constant_lines.append(line) -# Sanitize wrt constants -for i,lines in enumerate(cart_bf_lines): - cart_bf_lines[i] = sanitize_constants( lines ) -for i,lines in enumerate(sph_bf_lines): - sph_bf_lines[i] = sanitize_constants( lines ) - -for i,lines in enumerate(cart_bfx_lines): - cart_bfx_lines[i] = sanitize_constants( lines ) -for i,lines in enumerate(cart_bfy_lines): - cart_bfy_lines[i] = sanitize_constants( lines ) -for i,lines in enumerate(cart_bfz_lines): - cart_bfz_lines[i] = sanitize_constants( lines ) - -for i,lines in enumerate(sph_bfx_lines): - sph_bfx_lines[i] = sanitize_constants( lines ) -for i,lines in enumerate(sph_bfy_lines): - sph_bfy_lines[i] = sanitize_constants( lines ) -for i,lines in enumerate(sph_bfz_lines): - sph_bfz_lines[i] = sanitize_constants( lines ) - -for i,lines in enumerate(cart_bfxx_lines): - cart_bfxx_lines[i] = 
sanitize_constants( lines ) -for i,lines in enumerate(cart_bfxy_lines): - cart_bfxy_lines[i] = sanitize_constants( lines ) -for i,lines in enumerate(cart_bfxz_lines): - cart_bfxz_lines[i] = sanitize_constants( lines ) -for i,lines in enumerate(cart_bfyy_lines): - cart_bfyy_lines[i] = sanitize_constants( lines ) -for i,lines in enumerate(cart_bfyz_lines): - cart_bfyz_lines[i] = sanitize_constants( lines ) -for i,lines in enumerate(cart_bfzz_lines): - cart_bfzz_lines[i] = sanitize_constants( lines ) -for i,lines in enumerate(cart_bflap_lines): - cart_bflap_lines[i] = sanitize_constants( lines ) - -for i,lines in enumerate(sph_bfxx_lines): - sph_bfxx_lines[i] = sanitize_constants( lines ) -for i,lines in enumerate(sph_bfxy_lines): - sph_bfxy_lines[i] = sanitize_constants( lines ) -for i,lines in enumerate(sph_bfxz_lines): - sph_bfxz_lines[i] = sanitize_constants( lines ) -for i,lines in enumerate(sph_bfyy_lines): - sph_bfyy_lines[i] = sanitize_constants( lines ) -for i,lines in enumerate(sph_bfyz_lines): - sph_bfyz_lines[i] = sanitize_constants( lines ) -for i,lines in enumerate(sph_bfzz_lines): - sph_bfzz_lines[i] = sanitize_constants( lines ) -for i,lines in enumerate(sph_bflap_lines): - sph_bflap_lines[i] = sanitize_constants( lines ) +def perform_cse_and_cleanup(eval_line_list): + expand_opt = create_expand_pow_optimization(20) + + for i in range(len(eval_line_list)): + if len(eval_line_list[0]) != len(eval_line_list[i]): + raise RuntimeError("Eval lines are not uniform length") + + # Concatenate lists + prim_len = len(eval_line_list[0]) + big_list = [] + for i in range(len(eval_line_list)): + for x in eval_line_list[i]: big_list.append(x) + + # Sanitize constants + big_list = sanitize_constants(big_list) + + # Parse to SymPy expressions + big_list = [sympy.parse_expr(x) for x in big_list] + + # Apply expand opt + big_list = [expand_opt(x) for x in big_list] + + # Perform CSE + (common_lines, big_list) = sympy.cse(big_list, optimizations='basic') + + # Sanitize output lines + big_list = [ccode(expand_opt(sympy.simplify(x.evalf()))) for x in big_list] + common_lines = [ (x,ccode(expand_opt(sympy.simplify(y.evalf())))) for (x,y) in common_lines ] + + # Split big list + for i in range(len(eval_line_list)): + eval_line_list[i] = big_list[i*prim_len:(i+1)*prim_len] + + return (common_lines,eval_line_list) + + def generate_code( eval_lines, L, eval_type, template_fname, output_fname ): old_sysout = sys.stdout - var_dict = { 'eval_lines' : eval_lines, 'L' : L, 'type' : eval_type } + common_lines, eval_lines = perform_cse_and_cleanup([eval_lines]) + eval_lines = eval_lines[0] + var_dict = { 'common_lines': common_lines, 'eval_lines' : eval_lines, 'L' : L, 'type' : eval_type, 'nt' : 512 } sys.stdout = expand = StringIO() expander.expandFile( template_fname, external_definitions = var_dict, auto_indent = True ) expand = expand.getvalue() @@ -291,11 +322,20 @@ def generate_code( eval_lines, L, eval_type, template_fname, output_fname ): def generate_code_gradient( eval_lines, eval_lines_dx, eval_lines_dy, eval_lines_dz, L, eval_type, template_fname, output_fname ): old_sysout = sys.stdout - var_dict = { 'eval_lines' : eval_lines, + + common_lines, big_list = perform_cse_and_cleanup([eval_lines, eval_lines_dx, eval_lines_dy, eval_lines_dz]) + eval_lines = big_list[0] + eval_lines_dx = big_list[1] + eval_lines_dy = big_list[2] + eval_lines_dz = big_list[3] + + var_dict = { 'common_lines': common_lines, + 'eval_lines' : eval_lines, 'eval_lines_dx' : eval_lines_dx, 'eval_lines_dy' : eval_lines_dy, 
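            # (descriptive note, not part of the original script) each 'eval_lines*' entry is a list
            # of C expression strings, one per basis function of the shell; 'common_lines', produced
            # just above by perform_cse_and_cleanup, is a list of (name, expression) pairs -- e.g. a
            # hypothetical ('x0', 'radial_eval*x') -- that the template emits once per grid point
            # before the per-basis-function assignments.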
'eval_lines_dz' : eval_lines_dz, - 'L' : L, 'type' : eval_type } + 'L' : L, 'type' : eval_type, + 'nt' : 512 if L < 1 else 256 if L < 4 else 128} sys.stdout = expand = StringIO() expander.expandFile( template_fname, external_definitions = var_dict, auto_indent = True ) expand = expand.getvalue() @@ -306,7 +346,65 @@ def generate_code_gradient( eval_lines, eval_lines_dx, eval_lines_dy, eval_lines def generate_code_hessian( eval_lines, eval_lines_dx, eval_lines_dy, eval_lines_dz, eval_lines_dxx, eval_lines_dxy, eval_lines_dxz, eval_lines_dyy, eval_lines_dyz, eval_lines_dzz, eval_lines_lap, L, eval_type, template_fname, output_fname ): old_sysout = sys.stdout - var_dict = { 'eval_lines' : eval_lines, + big_list = [eval_lines, eval_lines_dx, eval_lines_dy, eval_lines_dz, eval_lines_dxx, eval_lines_dxy, eval_lines_dxz, eval_lines_dyy, eval_lines_dyz, eval_lines_dzz, eval_lines_lap] + common_lines, big_list = perform_cse_and_cleanup(big_list) + eval_lines = big_list[0] + eval_lines_dx = big_list[1] + eval_lines_dy = big_list[2] + eval_lines_dz = big_list[3] + eval_lines_dxx = big_list[4] + eval_lines_dxy = big_list[5] + eval_lines_dxz = big_list[6] + eval_lines_dyy = big_list[7] + eval_lines_dyz = big_list[8] + eval_lines_dzz = big_list[9] + eval_lines_lap = big_list[10] + + var_dict = { 'common_lines' : common_lines, + 'eval_lines' : eval_lines, + 'eval_lines_dx' : eval_lines_dx, + 'eval_lines_dy' : eval_lines_dy, + 'eval_lines_dz' : eval_lines_dz, + 'eval_lines_dxx' : eval_lines_dxx, + 'eval_lines_dxy' : eval_lines_dxy, + 'eval_lines_dxz' : eval_lines_dxz, + 'eval_lines_dyy' : eval_lines_dyy, + 'eval_lines_dyz' : eval_lines_dyz, + 'eval_lines_dzz' : eval_lines_dzz, + 'eval_lines_lapl' : eval_lines_lap, + 'L' : L, 'type' : eval_type, + 'nt' : 256 if L < 1 else 128 } + sys.stdout = expand = StringIO() + expander.expandFile( template_fname, external_definitions = var_dict, auto_indent = True ) + expand = expand.getvalue() + sys.stdout = old_sysout + + output_file = open(output_fname, 'w') + output_file.write(expand) + + + +def generate_code_lapgrad( eval_lines, eval_lines_dx, eval_lines_dy, eval_lines_dz, eval_lines_dxx, eval_lines_dxy, eval_lines_dxz, eval_lines_dyy, eval_lines_dyz, eval_lines_dzz, eval_lines_lap, eval_lines_lapx, eval_lines_lapy, eval_lines_lapz, L, eval_type, template_fname, output_fname ): + old_sysout = sys.stdout + big_list = [eval_lines, eval_lines_dx, eval_lines_dy, eval_lines_dz, eval_lines_dxx, eval_lines_dxy, eval_lines_dxz, eval_lines_dyy, eval_lines_dyz, eval_lines_dzz, eval_lines_lap, eval_lines_lapx, eval_lines_lapy, eval_lines_lapz] + common_lines, big_list = perform_cse_and_cleanup(big_list) + eval_lines = big_list[0] + eval_lines_dx = big_list[1] + eval_lines_dy = big_list[2] + eval_lines_dz = big_list[3] + eval_lines_dxx = big_list[4] + eval_lines_dxy = big_list[5] + eval_lines_dxz = big_list[6] + eval_lines_dyy = big_list[7] + eval_lines_dyz = big_list[8] + eval_lines_dzz = big_list[9] + eval_lines_lap = big_list[10] + eval_lines_lapx = big_list[11] + eval_lines_lapy = big_list[12] + eval_lines_lapz = big_list[13] + + var_dict = { 'common_lines' : common_lines, + 'eval_lines' : eval_lines, 'eval_lines_dx' : eval_lines_dx, 'eval_lines_dy' : eval_lines_dy, 'eval_lines_dz' : eval_lines_dz, @@ -317,7 +415,11 @@ def generate_code_hessian( eval_lines, eval_lines_dx, eval_lines_dy, eval_lines_ 'eval_lines_dyz' : eval_lines_dyz, 'eval_lines_dzz' : eval_lines_dzz, 'eval_lines_lapl' : eval_lines_lap, - 'L' : L, 'type' : eval_type } + 'eval_lines_lapl_x' : 
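            # (descriptive note) the lapgrad entries below add the Cartesian gradient of the
            # basis-function Laplacian, written by the generated kernels to d3bflapl_x/y/z;
            # presumably these feed nuclear gradients of Laplacian-dependent meta-GGA functionals.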
eval_lines_lapx, + 'eval_lines_lapl_y' : eval_lines_lapy, + 'eval_lines_lapl_z' : eval_lines_lapz, + 'L' : L, 'type' : eval_type, + 'nt' : 256 if L < 1 else 128 } sys.stdout = expand = StringIO() expander.expandFile( template_fname, external_definitions = var_dict, auto_indent = True ) expand = expand.getvalue() @@ -362,6 +464,17 @@ def generate_code_hessian( eval_lines, eval_lines_dx, eval_lines_dy, eval_lines_ sph_bfxx_lines[L], sph_bfxy_lines[L], sph_bfxz_lines[L], sph_bfyy_lines[L], sph_bfyz_lines[L], sph_bfzz_lines[L], sph_bflap_lines[L], L, 'spherical_laplacian', template_fname, sph_header_fname ) + cart_header_fname = "collocation_shell_to_task_kernels_cartesian_l" + str(L) + "_lapgrad.hpp" + sph_header_fname = "collocation_shell_to_task_kernels_spherical_l" + str(L) + "_lapgrad.hpp" + generate_code_lapgrad( cart_bf_lines[L], cart_bfx_lines[L], cart_bfy_lines[L], cart_bfz_lines[L], + cart_bfxx_lines[L], cart_bfxy_lines[L], cart_bfxz_lines[L], cart_bfyy_lines[L], cart_bfyz_lines[L], + cart_bfzz_lines[L], cart_bflap_lines[L], cart_bflap_x_lines[L], cart_bflap_y_lines[L], cart_bflap_z_lines[L], + L, 'cartesian_lapgrad', template_fname, cart_header_fname ) + generate_code_lapgrad( sph_bf_lines[L], sph_bfx_lines[L], sph_bfy_lines[L], sph_bfz_lines[L], + sph_bfxx_lines[L], sph_bfxy_lines[L], sph_bfxz_lines[L], sph_bfyy_lines[L], sph_bfyz_lines[L], + sph_bfzz_lines[L], sph_bflap_lines[L], sph_bflap_x_lines[L], sph_bflap_y_lines[L], sph_bflap_z_lines[L], + L, 'spherical_lapgrad', template_fname, sph_header_fname ) + #template_fname = 'templates/collocation_shell_to_task_combined_kernels.hpp' #cart_header_fname = "collocation_shell_to_task_combined_kernels_cartesian_l" + str(L) + ".hpp" diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_angular_template.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_angular_template.hpp index e92ec0b0..0816560a 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_angular_template.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_angular_template.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_device_constants_template.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_device_constants_template.hpp index 3a62fef3..f76c6863 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_device_constants_template.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_device_constants_template.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). 
+ * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_device_template.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_device_template.cu index 62557401..f28cadee 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_device_template.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_device_template.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -348,7 +352,7 @@ template void dispatch_shell_to_task_collocation_gradient( cudaStream_t stream, int32_t l, bool pure, uint32_t ntask_average, uint32_t nshells, Args&&... args ) { - dim3 threads = max_threads_shell_to_task_collocation(l,pure); + dim3 threads = max_threads_shell_to_task_collocation_gradient(l,pure); int nwarp_per_block = threads.x / cuda::warp_size; int n_task_blocks = util::div_ceil( ntask_average, nwarp_per_block ); dim3 block(n_task_blocks, 1, nshells); @@ -425,7 +429,7 @@ template void dispatch_shell_to_task_collocation_hessian( cudaStream_t stream, int32_t l, bool pure, uint32_t ntask_average, uint32_t nshells, Args&&... args ) { - dim3 threads = max_threads_shell_to_task_collocation(l,pure); + dim3 threads = max_threads_shell_to_task_collocation_hessian(l,pure); int nwarp_per_block = threads.x / cuda::warp_size; int n_task_blocks = util::div_ceil( ntask_average, nwarp_per_block ); dim3 block(n_task_blocks, 1, nshells); @@ -506,7 +510,7 @@ template void dispatch_shell_to_task_collocation_laplacian( cudaStream_t stream, int32_t l, bool pure, uint32_t ntask_average, uint32_t nshells, Args&&... args ) { - dim3 threads = max_threads_shell_to_task_collocation(l,pure); + dim3 threads = max_threads_shell_to_task_collocation_laplacian(l,pure); int nwarp_per_block = threads.x / cuda::warp_size; int n_task_blocks = util::div_ceil( ntask_average, nwarp_per_block ); dim3 block(n_task_blocks, 1, nshells); @@ -561,6 +565,89 @@ void eval_collocation_shell_to_task_laplacian( } +uint32_t max_threads_shell_to_task_collocation_lapgrad( int32_t l, bool pure ) { + if( pure ) { + switch(l) { + case 0: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_lapgrad_0 );\ + $for( L in range(1, L_max + 1) ) + case $(L): return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_lapgrad_$(L) ); + $endfor + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = $(L_max)"); + } + } else { + switch(l) {\ + $for( L in range(L_max + 1) ) + case $(L): return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_lapgrad_$(L) );\ + $endfor + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = $(L_max)"); + } + } + return 0; +} + + + + + +template +void dispatch_shell_to_task_collocation_lapgrad( cudaStream_t stream, int32_t l, + bool pure, uint32_t ntask_average, uint32_t nshells, Args&&... 
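   /* descriptive note: one warp handles one (shell, task) pair -- threads per block are taken from
      the kernel's own occupancy query, warps per block = threads.x / warp_size, and the launch grid
      is (ceil(ntask_average / warps_per_block), 1, nshells), mirroring the other dispatchers. */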
args ) { + + dim3 threads = max_threads_shell_to_task_collocation_lapgrad(l,pure); + int nwarp_per_block = threads.x / cuda::warp_size; + int n_task_blocks = util::div_ceil( ntask_average, nwarp_per_block ); + dim3 block(n_task_blocks, 1, nshells); + + if( pure ) { + switch(l) { + case 0: + collocation_device_shell_to_task_kernel_cartesian_lapgrad_0<<>>( nshells, std::forward(args)... ); + break; + $for( L in range(1, L_max + 1) ) + case $(L): + collocation_device_shell_to_task_kernel_spherical_lapgrad_$(L)<<>>( nshells, std::forward(args)... ); + break;\ + $endfor + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = $(L_max)"); + } + } else { + switch(l) {\ + $for( L in range(0, L_max + 1) ) + case $(L): + collocation_device_shell_to_task_kernel_cartesian_lapgrad_$(L)<<>>( nshells, std::forward(args)... ); + break;\ + $endfor + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = $(L_max)"); + } + } + +} + + + +void eval_collocation_shell_to_task_lapgrad( + uint32_t max_l, + AngularMomentumShellToTaskBatch* l_batched_shell_to_task, + XCDeviceTask* device_tasks, + device_queue queue +) { + + cudaStream_t stream = queue.queue_as() ; + + for( auto l = 0u; l <= max_l; ++l ) { + auto pure = l_batched_shell_to_task[l].pure; + auto shell_to_task_device = l_batched_shell_to_task[l].shell_to_task_device; + auto nshells = l_batched_shell_to_task[l].nshells_in_batch; + auto ntask_average = std::max(1ul, l_batched_shell_to_task[l].ntask_average); + dispatch_shell_to_task_collocation_lapgrad( stream, l, pure, + ntask_average, nshells, shell_to_task_device, device_tasks ); + auto stat = cudaGetLastError(); + GAUXC_CUDA_ERROR("LAP", stat); + } + + +} + } // namespace GauXC diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_shell_to_task_kernels.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_shell_to_task_kernels.hpp index 4b223611..7cc19871 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_shell_to_task_kernels.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_shell_to_task_kernels.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -14,10 +18,10 @@ namespace GauXC { -$py(do_grad = 'gradient' in type or 'hessian' in type or 'lapl' in type)\ -$py(do_hess = 'hessian' in type)\ -$py(do_lapl = 'lapl' in type)\ -$py(nt = 512)\ +$py(do_grad = 'gradient' in type or 'hessian' in type or 'lapl' in type or 'lapgrad' in type)\ +$py(do_hess = 'hessian' in type or 'lapgrad' in type)\ +$py(do_lapl = 'lapl' in type or 'lapgrad' in type)\ +$py(do_lapl_grad = 'lapgrad' in type)\ __global__ __launch_bounds__($(nt),2) void collocation_device_shell_to_task_kernel_$(type)_$(L)( uint32_t nshell, @@ -72,7 +76,6 @@ __global__ __launch_bounds__($(nt),2) void collocation_device_shell_to_task_kern auto* __restrict__ basis_y_eval = task->dbfy + shoff; auto* __restrict__ basis_z_eval = task->dbfz + shoff; $endif\ - $if( do_hess )\ auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; @@ -84,6 +87,11 @@ __global__ __launch_bounds__($(nt),2) void collocation_device_shell_to_task_kern $if( do_lapl )\ auto* __restrict__ basis_lapl_eval = task->d2bflapl + shoff; $endif\ +$if( do_lapl_grad )\ + auto* __restrict__ basis_lapl_x_eval = task->d3bflapl_x + shoff; + auto* __restrict__ basis_lapl_y_eval = task->d3bflapl_y + shoff; + auto* __restrict__ basis_lapl_z_eval = task->d3bflapl_z + shoff; +$endif\ // Loop over points in task // Assign each point to separate thread within the warp @@ -109,6 +117,9 @@ __global__ __launch_bounds__($(nt),2) void collocation_device_shell_to_task_kern $if( do_hess or do_lapl)\ double radial_eval_alpha_squared = 0.; $endif\ +$if( do_lapl_grad)\ + double radial_eval_alpha_cubed = 0.; +$endif\ #pragma unroll 1 for( uint32_t i = 0; i < nprim; ++i ) { @@ -121,6 +132,9 @@ __global__ __launch_bounds__($(nt),2) void collocation_device_shell_to_task_kern $endif\ $if( do_hess or do_lapl)\ radial_eval_alpha_squared += a * a * e; +$endif\ +$if( do_lapl_grad)\ + radial_eval_alpha_cubed += a * a * a * e; $endif\ } @@ -130,8 +144,14 @@ __global__ __launch_bounds__($(nt),2) void collocation_device_shell_to_task_kern $if( do_hess or do_lapl)\ radial_eval_alpha_squared *= 4; $endif\ +$if( do_lapl_grad )\ + radial_eval_alpha_cubed *= -8; +$endif\ - + // Common Subexpressions +$for( i in range(len(common_lines)) )\ + const auto $(common_lines[i][0]) = $(common_lines[i][1]); +$endfor // Evaluate basis function $for( j in range(len(eval_lines)) )\ @@ -187,13 +207,29 @@ __global__ __launch_bounds__($(nt),2) void collocation_device_shell_to_task_kern basis_zz_eval[ipt + $(j)*npts] = $(eval_lines_dzz[j]); $endfor\ $endif\ + $if(do_lapl)\ // Evaluate Laplacian of bfn -$for( j in range(len(eval_lines_dx)) )\ +$for( j in range(len(eval_lines_lapl)) )\ basis_lapl_eval[ipt + $(j)*npts] = $(eval_lines_lapl[j]); $endfor\ $endif\ +$if(do_lapl_grad)\ + // Evaluate Laplacian gradient of bfn (dx) +$for( j in range(len(eval_lines_lapl_x)) )\ + basis_lapl_x_eval[ipt + $(j)*npts] = $(eval_lines_lapl_x[j]); +$endfor\ + // Evaluate Laplacian gradient of bfn (dy) +$for( j in range(len(eval_lines_lapl_y)) )\ + basis_lapl_y_eval[ipt + $(j)*npts] = $(eval_lines_lapl_y[j]); +$endfor\ + // Evaluate Laplacian gradient of bfn (dz) +$for( j in range(len(eval_lines_lapl_z)) )\ + basis_lapl_z_eval[ipt + $(j)*npts] = $(eval_lines_lapl_z[j]); +$endfor\ +$endif\ + diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_shell_to_task_kernels_template.hpp 
b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_shell_to_task_kernels_template.hpp index 544554b3..a699d9e6 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_shell_to_task_kernels_template.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_shell_to_task_kernels_template.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -23,6 +27,10 @@ #include "collocation/collocation_shell_to_task_kernels_cartesian_l$(L)_laplacian.hpp"\ $endfor +$for( L in range(L_max + 1)) +#include "collocation/collocation_shell_to_task_kernels_cartesian_l$(L)_lapgrad.hpp"\ +$endfor + $for( L in range(L_max + 1)) #include "collocation/collocation_shell_to_task_kernels_spherical_l$(L).hpp"\ $endfor @@ -38,3 +46,7 @@ $for( L in range(L_max + 1)) #include "collocation/collocation_shell_to_task_kernels_spherical_l$(L)_laplacian.hpp"\ $endfor + +$for( L in range(L_max + 1)) +#include "collocation/collocation_shell_to_task_kernels_spherical_l$(L)_lapgrad.hpp"\ +$endfor diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_task_to_shell.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_task_to_shell.hpp index 4cb41a3b..abb281f4 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_task_to_shell.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation/templates/collocation_task_to_shell.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_device.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_device.cu index 5ed615fb..d01b4d8b 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_device.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_device.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -372,7 +376,7 @@ template void dispatch_shell_to_task_collocation_gradient( cudaStream_t stream, int32_t l, bool pure, uint32_t ntask_average, uint32_t nshells, Args&&... 
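   /* note on the hunks below: the occupancy query is switched to the matching derivative kernels
      (gradient / hessian / laplacian); the previous code queried the value-only kernel, which could
      request more threads per block than the heavier derivative kernels' __launch_bounds__ permit. */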
args ) { - dim3 threads = max_threads_shell_to_task_collocation(l,pure); + dim3 threads = max_threads_shell_to_task_collocation_gradient(l,pure); int nwarp_per_block = threads.x / cuda::warp_size; int n_task_blocks = util::div_ceil( ntask_average, nwarp_per_block ); dim3 block(n_task_blocks, 1, nshells); @@ -469,7 +473,7 @@ template void dispatch_shell_to_task_collocation_hessian( cudaStream_t stream, int32_t l, bool pure, uint32_t ntask_average, uint32_t nshells, Args&&... args ) { - dim3 threads = max_threads_shell_to_task_collocation(l,pure); + dim3 threads = max_threads_shell_to_task_collocation_hessian(l,pure); int nwarp_per_block = threads.x / cuda::warp_size; int n_task_blocks = util::div_ceil( ntask_average, nwarp_per_block ); dim3 block(n_task_blocks, 1, nshells); @@ -539,39 +543,42 @@ void eval_collocation_shell_to_task_hessian( } - - - - - uint32_t max_threads_shell_to_task_collocation_laplacian( int32_t l, bool pure ) { if( pure ) { switch(l) { - case 0: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_laplacian_0 ); + case 0: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_laplacian_0 ); case 1: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_laplacian_1 ); + case 2: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_laplacian_2 ); + case 3: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_laplacian_3 ); + case 4: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_laplacian_4 ); + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); } } else { - switch(l) { - case 0: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_laplacian_0 ); - case 1: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_laplacian_1 ); - case 2: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_laplacian_2 ); - case 3: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_laplacian_3 ); - case 4: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_laplacian_4 ); + switch(l) { + case 0: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_laplacian_0 ); + case 1: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_laplacian_1 ); + case 2: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_laplacian_2 ); + case 3: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_laplacian_3 ); + case 4: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_laplacian_4 ); default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); } } return 0; } + + + + template -void dispatch_shell_to_task_collocation_laplacian( cudaStream_t stream, int32_t l, +void dispatch_shell_to_task_collocation_laplacian( cudaStream_t stream, int32_t l, bool pure, uint32_t ntask_average, uint32_t nshells, Args&&... 
args ) { - dim3 threads = max_threads_shell_to_task_collocation(l,pure); + dim3 threads = max_threads_shell_to_task_collocation_laplacian(l,pure); int nwarp_per_block = threads.x / cuda::warp_size; int n_task_blocks = util::div_ceil( ntask_average, nwarp_per_block ); dim3 block(n_task_blocks, 1, nshells); @@ -581,37 +588,38 @@ void dispatch_shell_to_task_collocation_laplacian( cudaStream_t stream, int32_t case 0: collocation_device_shell_to_task_kernel_cartesian_laplacian_0<<>>( nshells, std::forward(args)... ); break; + case 1: collocation_device_shell_to_task_kernel_spherical_laplacian_1<<>>( nshells, std::forward(args)... ); - break; + break; case 2: collocation_device_shell_to_task_kernel_spherical_laplacian_2<<>>( nshells, std::forward(args)... ); - break; + break; case 3: collocation_device_shell_to_task_kernel_spherical_laplacian_3<<>>( nshells, std::forward(args)... ); - break; + break; case 4: collocation_device_shell_to_task_kernel_spherical_laplacian_4<<>>( nshells, std::forward(args)... ); - break; + break; default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); } } else { - switch(l) { + switch(l) { case 0: collocation_device_shell_to_task_kernel_cartesian_laplacian_0<<>>( nshells, std::forward(args)... ); - break; + break; case 1: collocation_device_shell_to_task_kernel_cartesian_laplacian_1<<>>( nshells, std::forward(args)... ); - break; + break; case 2: collocation_device_shell_to_task_kernel_cartesian_laplacian_2<<>>( nshells, std::forward(args)... ); - break; + break; case 3: collocation_device_shell_to_task_kernel_cartesian_laplacian_3<<>>( nshells, std::forward(args)... ); - break; + break; case 4: collocation_device_shell_to_task_kernel_cartesian_laplacian_4<<>>( nshells, std::forward(args)... ); - break; + break; default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); } } @@ -619,11 +627,12 @@ void dispatch_shell_to_task_collocation_laplacian( cudaStream_t stream, int32_t } + void eval_collocation_shell_to_task_laplacian( uint32_t max_l, AngularMomentumShellToTaskBatch* l_batched_shell_to_task, XCDeviceTask* device_tasks, - device_queue queue + device_queue queue ) { cudaStream_t stream = queue.queue_as() ; @@ -633,7 +642,7 @@ void eval_collocation_shell_to_task_laplacian( auto shell_to_task_device = l_batched_shell_to_task[l].shell_to_task_device; auto nshells = l_batched_shell_to_task[l].nshells_in_batch; auto ntask_average = std::max(1ul, l_batched_shell_to_task[l].ntask_average); - dispatch_shell_to_task_collocation_laplacian( stream, l, pure, + dispatch_shell_to_task_collocation_laplacian( stream, l, pure, ntask_average, nshells, shell_to_task_device, device_tasks ); auto stat = cudaGetLastError(); GAUXC_CUDA_ERROR("LAP", stat); @@ -642,6 +651,113 @@ void eval_collocation_shell_to_task_laplacian( } +uint32_t max_threads_shell_to_task_collocation_lapgrad( int32_t l, bool pure ) { + if( pure ) { + switch(l) { + case 0: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_lapgrad_0 ); + case 1: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_lapgrad_1 ); + + case 2: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_lapgrad_2 ); + + case 3: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_lapgrad_3 ); + + case 4: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_lapgrad_4 ); + + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); + } + } 
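  /* non-pure (Cartesian) shells are handled by the else branch below, which queries the cartesian
     lapgrad kernels; querying per kernel lets each angular momentum respect its own __launch_bounds__. */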
else { + switch(l) { + case 0: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_lapgrad_0 ); + case 1: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_lapgrad_1 ); + case 2: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_lapgrad_2 ); + case 3: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_lapgrad_3 ); + case 4: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_lapgrad_4 ); + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); + } + } + return 0; +} + + + + + +template +void dispatch_shell_to_task_collocation_lapgrad( cudaStream_t stream, int32_t l, + bool pure, uint32_t ntask_average, uint32_t nshells, Args&&... args ) { + + dim3 threads = max_threads_shell_to_task_collocation_lapgrad(l,pure); + int nwarp_per_block = threads.x / cuda::warp_size; + int n_task_blocks = util::div_ceil( ntask_average, nwarp_per_block ); + dim3 block(n_task_blocks, 1, nshells); + + if( pure ) { + switch(l) { + case 0: + collocation_device_shell_to_task_kernel_cartesian_lapgrad_0<<>>( nshells, std::forward(args)... ); + break; + + case 1: + collocation_device_shell_to_task_kernel_spherical_lapgrad_1<<>>( nshells, std::forward(args)... ); + break; + case 2: + collocation_device_shell_to_task_kernel_spherical_lapgrad_2<<>>( nshells, std::forward(args)... ); + break; + case 3: + collocation_device_shell_to_task_kernel_spherical_lapgrad_3<<>>( nshells, std::forward(args)... ); + break; + case 4: + collocation_device_shell_to_task_kernel_spherical_lapgrad_4<<>>( nshells, std::forward(args)... ); + break; + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); + } + } else { + switch(l) { + case 0: + collocation_device_shell_to_task_kernel_cartesian_lapgrad_0<<>>( nshells, std::forward(args)... ); + break; + case 1: + collocation_device_shell_to_task_kernel_cartesian_lapgrad_1<<>>( nshells, std::forward(args)... ); + break; + case 2: + collocation_device_shell_to_task_kernel_cartesian_lapgrad_2<<>>( nshells, std::forward(args)... ); + break; + case 3: + collocation_device_shell_to_task_kernel_cartesian_lapgrad_3<<>>( nshells, std::forward(args)... ); + break; + case 4: + collocation_device_shell_to_task_kernel_cartesian_lapgrad_4<<>>( nshells, std::forward(args)... 
); + break; + default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = 4"); + } + } + +} + + + +void eval_collocation_shell_to_task_lapgrad( + uint32_t max_l, + AngularMomentumShellToTaskBatch* l_batched_shell_to_task, + XCDeviceTask* device_tasks, + device_queue queue +) { + + cudaStream_t stream = queue.queue_as() ; + + for( auto l = 0u; l <= max_l; ++l ) { + auto pure = l_batched_shell_to_task[l].pure; + auto shell_to_task_device = l_batched_shell_to_task[l].shell_to_task_device; + auto nshells = l_batched_shell_to_task[l].nshells_in_batch; + auto ntask_average = std::max(1ul, l_batched_shell_to_task[l].ntask_average); + dispatch_shell_to_task_collocation_lapgrad( stream, l, pure, + ntask_average, nshells, shell_to_task_device, device_tasks ); + auto stat = cudaGetLastError(); + GAUXC_CUDA_ERROR("LAPGRAD", stat); + } + + +} diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_masked_combined_kernels.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_masked_combined_kernels.hpp index b401c126..dcc42625 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_masked_combined_kernels.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_masked_combined_kernels.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_masked_kernels.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_masked_kernels.hpp index 2a17c7d2..ecda9d2b 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_masked_kernels.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_masked_kernels.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_shell_to_task_kernels.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_shell_to_task_kernels.hpp index a5a725a8..e18494b8 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_shell_to_task_kernels.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/collocation_shell_to_task_kernels.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -28,6 +32,7 @@ #include "collocation/collocation_shell_to_task_kernels_cartesian_l3_hessian.hpp" #include "collocation/collocation_shell_to_task_kernels_cartesian_l4_hessian.hpp" + #include "collocation/collocation_shell_to_task_kernels_cartesian_l0_laplacian.hpp" #include "collocation/collocation_shell_to_task_kernels_cartesian_l1_laplacian.hpp" #include "collocation/collocation_shell_to_task_kernels_cartesian_l2_laplacian.hpp" @@ -35,6 +40,13 @@ #include "collocation/collocation_shell_to_task_kernels_cartesian_l4_laplacian.hpp" +#include "collocation/collocation_shell_to_task_kernels_cartesian_l0_lapgrad.hpp" +#include "collocation/collocation_shell_to_task_kernels_cartesian_l1_lapgrad.hpp" +#include "collocation/collocation_shell_to_task_kernels_cartesian_l2_lapgrad.hpp" +#include "collocation/collocation_shell_to_task_kernels_cartesian_l3_lapgrad.hpp" +#include "collocation/collocation_shell_to_task_kernels_cartesian_l4_lapgrad.hpp" + + #include "collocation/collocation_shell_to_task_kernels_spherical_l0.hpp" #include "collocation/collocation_shell_to_task_kernels_spherical_l1.hpp" #include "collocation/collocation_shell_to_task_kernels_spherical_l2.hpp" @@ -55,8 +67,16 @@ #include "collocation/collocation_shell_to_task_kernels_spherical_l3_hessian.hpp" #include "collocation/collocation_shell_to_task_kernels_spherical_l4_hessian.hpp" + #include "collocation/collocation_shell_to_task_kernels_spherical_l0_laplacian.hpp" #include "collocation/collocation_shell_to_task_kernels_spherical_l1_laplacian.hpp" #include "collocation/collocation_shell_to_task_kernels_spherical_l2_laplacian.hpp" #include "collocation/collocation_shell_to_task_kernels_spherical_l3_laplacian.hpp" #include "collocation/collocation_shell_to_task_kernels_spherical_l4_laplacian.hpp" + + +#include "collocation/collocation_shell_to_task_kernels_spherical_l0_lapgrad.hpp" +#include "collocation/collocation_shell_to_task_kernels_spherical_l1_lapgrad.hpp" +#include "collocation/collocation_shell_to_task_kernels_spherical_l2_lapgrad.hpp" +#include "collocation/collocation_shell_to_task_kernels_spherical_l3_lapgrad.hpp" +#include "collocation/collocation_shell_to_task_kernels_spherical_l4_lapgrad.hpp" diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/cublas_extensions.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/cublas_extensions.cu index ee7c7746..947d7b18 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/cublas_extensions.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/cublas_extensions.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -45,6 +49,24 @@ void increment( const T* X, T* Y, cudaStream_t stream ) { increment_kernel<<<1,1,0,stream>>>(X,Y); } +template +__global__ void increment_vec_kernel( const T* X, T* Y, int N ) { + const auto tid = blockIdx.x * blockDim.x + threadIdx.x; + if( tid < N ) Y[tid] += X[tid]; +} + +template +void increment( device_blas_handle generic_handle, const T* X, T* Y, int N) { + const int threads = cuda::warp_size * cuda::max_warps_per_thread_block; + const int blocks = util::div_ceil( N, threads ); + cublasHandle_t handle = generic_handle.blas_handle_as(); + auto stream = util::get_stream(handle); + increment_vec_kernel<<>>(X,Y,N); +} + +template + void increment( device_blas_handle generic_handle, const double* X, double* Y, int N ); + template <> void dot( device_blas_handle generic_handle, int N, diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_extensions.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_extensions.hpp index fdbb56bd..8f5d0560 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_extensions.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_extensions.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_inc_potential.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_inc_potential.cu index d54dda73..5e59ffcf 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_inc_potential.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_inc_potential.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_ssf_1d.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_ssf_1d.cu index 6c538eba..54d2486e 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_ssf_1d.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_ssf_1d.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -33,22 +37,23 @@ __global__ void modify_weights_ssf_kernel_1d( // Frisch partition functions auto gFrisch = [](double x) { - const double s_x = x / integrator::magic_ssf_factor<>; + const double s_x = x * 1.5625; // / integrator::magic_ssf_factor<>; const double s_x2 = s_x * s_x; const double s_x3 = s_x * s_x2; const double s_x5 = s_x3 * s_x2; const double s_x7 = s_x5 * s_x2; - return (35.*(s_x - s_x3) + 21.*s_x5 - 5.*s_x7) / 16.; + //return (35.*(s_x - s_x3) + 21.*s_x5 - 5.*s_x7) / 16.; + return ((35.)*(s_x - s_x3) + (21.)*s_x5 - (5.)*s_x7); }; auto sFrisch = [&] (double x) { - if( fabs(x) < integrator::magic_ssf_factor<> ) return 0.5 * (1. - gFrisch(x)); + if( fabs(x) < integrator::magic_ssf_factor<> ) return (0.5 - (0.5/16.) * gFrisch(x)); else if( x >= integrator::magic_ssf_factor<> ) return 0.; else return 1.; }; - constexpr double weight_tol = 1e-10; + constexpr double weight_tol = integrator::ssf_weight_tol; const int tid_x = threadIdx.x + blockIdx.x * blockDim.x; const int nt_x = blockDim.x * gridDim.x; @@ -100,7 +105,7 @@ __global__ void modify_weights_ssf_kernel_1d( const double ri = local_dist_scratch[ iCenter ]; - const double* const local_rab = RAB + iCenter * natoms; + const double* const local_rab = RAB + iCenter * ldRAB; double ps = 1.; for( int jCenter = 0; jCenter < natoms; jCenter++ ) @@ -138,4 +143,227 @@ void partition_weights_ssf_1d( int32_t npts, int32_t natoms, const double* RAB, } +__global__ void eval_weight_1st_deriv_contracted_ssf_kernel_1d( + size_t npts, + size_t natoms, + const double* RAB, + int32_t ldRAB, + const double* coords, + const double* points_x, + const double* points_y, + const double* points_z, + const double* dist_scratch, + int32_t lddist, + const int32_t* iparent_device, + const double* dist_nearest_device, + const double* __restrict__ w_times_f_device, + double* __restrict__ exc_grad_w_device +) { + + // Frisch partition functions + auto gFrisch = [](double x) { + + const double s_x = x * 1.5625; // / integrator::magic_ssf_factor<>; + const double s_x2 = s_x * s_x; + const double s_x3 = s_x * s_x2; + const double s_x5 = s_x3 * s_x2; + const double s_x7 = s_x5 * s_x2; + + return ((35.)*(s_x - s_x3) + (21.)*s_x5 - (5.)*s_x7); + }; + + auto sFrisch = [&] (double x) { + if( fabs(x) < integrator::magic_ssf_factor<> ) return (0.5 - (0.5/16.) * gFrisch(x)); + else if( x >= integrator::magic_ssf_factor<> ) return 0.; + else return 1.; + }; + + auto tFrisch = [&](double x) { + const double s_x = x * 1.5625; // / integrator::magic_ssf_factor<>; + const double s_x2 = s_x * s_x; + const double s_x3 = s_x * s_x2; + const double numerator = (35.) * (s_x3 + (3.) * s_x2 + (3.) 
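        /* algebraic note: numerator = 35*(s_x + 1)^3 and denominator = (x - magic_ssf_factor) *
           (5*s_x^3 + 20*s_x^2 + 29*s_x + 16), so tFrisch(x) equals s'(x)/s(x), the logarithmic
           derivative of the Frisch switching function, with the common (s_x - 1)^3 factor cancelled
           analytically to avoid 0/0 as |x| approaches magic_ssf_factor. */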
* s_x + (1.)); + const double denominator = (x - integrator::magic_ssf_factor<>) * ((5.)*s_x3 + (20.)*s_x2 + (29.)*s_x + (16.)); + return numerator / denominator ; + }; + + constexpr double safe_magic_ssf_bound = integrator::magic_ssf_factor<> - 1e-4; + constexpr double weight_tol = integrator::ssf_weight_tol; + constexpr double w_times_f_thresh = 1.e-12; + + const int tid_x = threadIdx.x + blockIdx.x * blockDim.x; + const int nt_x = blockDim.x * gridDim.x; + + for( int ipt = tid_x; ipt < npts; ipt += nt_x ) { + + const auto w_times_f_i = w_times_f_device[ipt]; + if (fabs(w_times_f_i) < w_times_f_thresh) continue; // weight derivative = 0 when p_A = 0 + const auto iParent = iparent_device[ipt]; + + double sum = 0.; + double parent_weight = 0.; + + const double* const local_dist_scratch = dist_scratch + ipt * lddist; + const double dist_cutoff = 0.18 * dist_nearest_device[ipt]; // 0.5 * (1-integrator::magic_ssf_factor<>) * task.dist_nearest + if( local_dist_scratch[iParent] < dist_cutoff ) continue; //weight derivative = 0 when p_A = 1 + + // Do iParent First + { + const double ri = local_dist_scratch[ iParent ]; + const double* const local_rab = RAB + iParent * ldRAB; + + parent_weight = 1.; + for( int jCenter = 0; jCenter < natoms; jCenter++ ) + if( parent_weight > weight_tol ) { + if( iParent != jCenter ) { + + const double rj = local_dist_scratch[ jCenter ]; + + const double mu = (ri - rj) * local_rab[ jCenter ]; // XXX: RAB is symmetric + parent_weight *= sFrisch( mu ); + + } + } else break; + + sum += parent_weight; + } + + // caculate sum + for( int iCenter = 0; iCenter < natoms; iCenter++ ) + if ( iParent != iCenter ) { + const double ri = local_dist_scratch[ iCenter ]; + const double* const local_rab = RAB + iCenter * ldRAB; + double ps = 1.; + for( int jCenter = 0; jCenter < natoms; jCenter++ ) + if( ps > weight_tol ) { + if( iCenter != jCenter ) { + + const double rj = local_dist_scratch[ jCenter ]; + const double mu = (ri - rj) * local_rab[ jCenter ]; // XXX: RAB is symmetric + ps *= sFrisch( mu ); + } + } else break; + + sum += ps; + + } + + double sum_inv = 1. 
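      /* sum accumulates the unnormalized cell functions P_C(r) of every center; the SSF weight of
         the parent atom is w_A = P_A / sum, so each term of its gradient carries a 1/sum factor. */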
/ sum; + + const double point_x = points_x[ipt]; + const double point_y = points_y[ipt]; + const double point_z = points_z[ipt]; + + // Now do derivative + for( int iB = 0; iB < natoms; iB++ ) + if( iParent != iB ) + { + double exc_grad_w_iBx = 0.0, exc_grad_w_iBy = 0.0, exc_grad_w_iBz = 0.0; + + const double* const local_Rinv_B = RAB + iB * ldRAB; + const double rB = local_dist_scratch[ iB ]; + const double coords_B_x = coords[3*iB + 0]; + const double coords_B_y = coords[3*iB + 1]; + const double coords_B_z = coords[3*iB + 2]; + + // first term + const double rA = local_dist_scratch[ iParent ]; + const double rAB_inv = local_Rinv_B[ iParent ]; + const double mu_AB = (rA - rB) * rAB_inv; + if( fabs(mu_AB) < safe_magic_ssf_bound) { + // first term is tFrisch(mu_AB) * (PA-Z)/Z * w_times_f_i * nabla_B mu_BA + double coef1 = tFrisch(mu_AB) * rAB_inv * (parent_weight - sum) * sum_inv * w_times_f_i / rB; + exc_grad_w_iBx = coef1 * (coords_B_x - point_x + mu_AB * ( coords_B_x - coords[3*iParent + 0]) * rAB_inv * rB); + exc_grad_w_iBy = coef1 * (coords_B_y - point_y + mu_AB * ( coords_B_y - coords[3*iParent + 1]) * rAB_inv * rB); + exc_grad_w_iBz = coef1 * (coords_B_z - point_z + mu_AB * ( coords_B_z - coords[3*iParent + 2]) * rAB_inv * rB); + } + + // second term and third term + // first need to calculate PB + double PB = 1.; + for( int jCenter = 0; jCenter < natoms; jCenter++ ) + if( PB > weight_tol ) { + if( iB != jCenter ) { + const double rj = local_dist_scratch[ jCenter ]; + const double mu = (rB - rj) * local_Rinv_B[ jCenter ]; + PB *= sFrisch( mu ); + } + } else break; + + if( PB > weight_tol ) + for( int iC = 0; iC < natoms; iC++ ) { + if (iB == iC) continue; + const double rBC_inv = local_Rinv_B[iC]; + const double rC = local_dist_scratch[iC]; + const double mu_BC = (rB - rC) * rBC_inv; + + if(fabs(mu_BC) < safe_magic_ssf_bound){ + const double t_BC = tFrisch(mu_BC); + const double coef = PB * t_BC * rBC_inv * sum_inv * w_times_f_i; + + const double coords_C_x = coords[3*iC + 0]; + const double coords_C_y = coords[3*iC + 1]; + const double coords_C_z = coords[3*iC + 2]; + + // second term + { + const double rB_inv = 1. / rB; + exc_grad_w_iBx -= coef * ((coords_B_x - point_x) * rB_inv - mu_BC * (coords_B_x - coords_C_x) * rBC_inv); + exc_grad_w_iBy -= coef * ((coords_B_y - point_y) * rB_inv - mu_BC * (coords_B_y - coords_C_y) * rBC_inv); + exc_grad_w_iBz -= coef * ((coords_B_z - point_z) * rB_inv - mu_BC * (coords_B_z - coords_C_z) * rBC_inv); + } + + if(iC != iParent) { + // third term + const double rC_inv = 1. 
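                  /* this contribution comes from the dependence of mu_BC on center C: it is added
                     atomically to atom C and the same amount is subtracted from the parent atom, so
                     the summed weight-derivative forces remain translationally invariant. */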
/ rC; + const double C_x = coef * ((coords_C_x - point_x) * rC_inv + mu_BC * (coords_C_x - coords_B_x) * rBC_inv); + const double C_y = coef * ((coords_C_y - point_y) * rC_inv + mu_BC * (coords_C_y - coords_B_y) * rBC_inv); + const double C_z = coef * ((coords_C_z - point_z) * rC_inv + mu_BC * (coords_C_z - coords_B_z) * rBC_inv); + + atomicAdd(exc_grad_w_device + 3*iC + 0, C_x); + atomicAdd(exc_grad_w_device + 3*iC + 1, C_y); + atomicAdd(exc_grad_w_device + 3*iC + 2, C_z); + + // Update parent atom + atomicAdd(exc_grad_w_device + 3*iParent + 0, -C_x); + atomicAdd(exc_grad_w_device + 3*iParent + 1, -C_y); + atomicAdd(exc_grad_w_device + 3*iParent + 2, -C_z); + } + } + } + + atomicAdd(exc_grad_w_device + 3*iB + 0, exc_grad_w_iBx); + atomicAdd(exc_grad_w_device + 3*iB + 1, exc_grad_w_iBy); + atomicAdd(exc_grad_w_device + 3*iB + 2, exc_grad_w_iBz); + + // Update parent atom + atomicAdd(exc_grad_w_device + 3*iParent + 0, -exc_grad_w_iBx); + atomicAdd(exc_grad_w_device + 3*iParent + 1, -exc_grad_w_iBy); + atomicAdd(exc_grad_w_device + 3*iParent + 2, -exc_grad_w_iBz); + + } + + } + +} + + + +void eval_weight_1st_deriv_contracted_ssf_1d( int32_t npts, int32_t natoms, const double* RAB, + int32_t ldRAB, const double* coords, + const double* points_x, const double* points_y, const double* points_z, + const double* dist, int32_t lddist, + const int32_t* iparent, const double* dist_nearest, const double* w_times_f, + double* exc_grad_w, cudaStream_t stream){ + + dim3 threads( cuda::max_threads_per_thread_block/4 ); + dim3 blocks ( util::div_ceil( npts, threads.x ) ); + eval_weight_1st_deriv_contracted_ssf_kernel_1d<<>>( + npts, natoms, RAB, ldRAB, coords, points_x, points_y, points_z, dist, lddist, iparent, dist_nearest, + w_times_f, exc_grad_w + ); + +} + + } diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_ssf_1d.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_ssf_1d.hpp index d7295527..bb9d3b74 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_ssf_1d.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_ssf_1d.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -13,4 +17,11 @@ void partition_weights_ssf_1d( int32_t npts, int32_t natoms, const double* RAB, const int32_t* iparent, const double* dist_nearest, double* weights, cudaStream_t stream); +void eval_weight_1st_deriv_contracted_ssf_1d( int32_t npts, int32_t natoms, const double* RAB, + int32_t ldRAB, const double* coords, + const double* points_x, const double* points_y, const double* points_z, + const double* dist, int32_t lddist, + const int32_t* iparent, const double* dist_nearest, const double* w_times_f, + double* exc_grad_w, cudaStream_t stream); + } diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_ssf_2d.hu b/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_ssf_2d.hu index 36f439ee..b792cd0c 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_ssf_2d.hu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/cuda_ssf_2d.hu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -48,7 +52,7 @@ void modify_weights_ssf_kernel_2d( int32_t npts, int32_t natoms, //constexpr uint32_t warps_per_thread_block = weight_thread_block / cuda::warp_size; static_assert( weight_unroll == 4 ); - constexpr double weight_tol = 1e-10; + constexpr double weight_tol = integrator::ssf_weight_tol; int natom_block = ((natoms + blockDim.x - 1) / blockDim.x) * blockDim.x; const int tid_x = threadIdx.y + blockIdx.y * blockDim.y; @@ -87,6 +91,7 @@ void modify_weights_ssf_kernel_2d( int32_t npts, int32_t natoms, contribution = sFrisch( mu ); } contribution = cuda::warp_reduce_prod(contribution); + contribution = __shfl_sync(0xFFFFFFFF, contribution, 0); parent_weight *= contribution; diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/cutlass_wrapper.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/cutlass_wrapper.cu index 0eae8d71..722d8c05 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/cutlass_wrapper.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/cutlass_wrapper.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/cutlass_wrapper.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/cutlass_wrapper.hpp index 8807b70f..d9fa216b 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/cutlass_wrapper.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/cutlass_wrapper.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). 
+ * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/exx_ek_screening_bfn_stats.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/exx_ek_screening_bfn_stats.cu index a4a033f6..86799ad2 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/exx_ek_screening_bfn_stats.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/exx_ek_screening_bfn_stats.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/grid_to_center.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/grid_to_center.cu index 652dc537..03600010 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/grid_to_center.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/grid_to_center.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/grid_to_center.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/grid_to_center.hpp index 9fbd2b3f..fc1a9d44 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/grid_to_center.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/grid_to_center.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/increment_exc_grad.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/increment_exc_grad.cu index 4dd328bd..9470c1c7 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/increment_exc_grad.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/increment_exc_grad.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -13,7 +17,8 @@ namespace GauXC { -__global__ __launch_bounds__(1024,1) void increment_exc_grad_lda_kernel( +template +__global__ __launch_bounds__(1024,1) void increment_exc_grad_lda_rks_kernel( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks, @@ -25,6 +30,7 @@ __global__ __launch_bounds__(1024,1) void increment_exc_grad_lda_kernel( const auto shell = shell_to_task[ish].shell_device; const auto task_idx = shell_to_task[ish].task_idx_device; const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device; + const int iCen = shell_to_task[ish].center_idx; const uint32_t shsz = shell->size(); const int global_warp_id = @@ -37,6 +43,12 @@ __global__ __launch_bounds__(1024,1) void increment_exc_grad_lda_kernel( const auto* task = device_tasks + task_idx[itask]; const uint32_t npts = task->npts; const size_t shoff = task_shell_offs[itask] * npts; + const int iParent = task->iParent; + if constexpr( with_weight_derivatives ) { + if( iCen == iParent ) + continue; + } + double g_acc_x_task(0), g_acc_y_task(0), g_acc_z_task(0); const auto* __restrict__ basis_x_eval = task->dbfx + shoff; const auto* __restrict__ basis_y_eval = task->dbfy + shoff; @@ -57,13 +69,24 @@ __global__ __launch_bounds__(1024,1) void increment_exc_grad_lda_kernel( const double dbfy_mu_i = basis_y_eval[ipt + ibf*npts]; const double dbfz_mu_i = basis_z_eval[ipt + ibf*npts]; - g_acc_x += z_mu_i * dbfx_mu_i; - g_acc_y += z_mu_i * dbfy_mu_i; - g_acc_z += z_mu_i * dbfz_mu_i; + g_acc_x_task += z_mu_i * dbfx_mu_i; + g_acc_y_task += z_mu_i * dbfy_mu_i; + g_acc_z_task += z_mu_i * dbfz_mu_i; } // Loop over bfns within a shell } // Loop over points + g_acc_x += g_acc_x_task; + g_acc_y += g_acc_y_task; + g_acc_z += g_acc_z_task; + + //write to Parent atom with translational invariance + if constexpr( with_weight_derivatives ) { + atomicAdd( EXC_GRAD + 3*iParent + 0, 2.0 * g_acc_x_task ); + atomicAdd( EXC_GRAD + 3*iParent + 1, 2.0 * g_acc_y_task ); + atomicAdd( EXC_GRAD + 3*iParent + 2, 2.0 * g_acc_z_task ); + } + } // Loop over tasks assigned to shell constexpr auto warp_size = cuda::warp_size; @@ -72,7 +95,6 @@ __global__ __launch_bounds__(1024,1) void increment_exc_grad_lda_kernel( g_acc_z = -2. 
* cuda::warp_reduce_sum( g_acc_z ); if( (threadIdx.x % cuda::warp_size) == 0 ) { - const int iCen = shell_to_task[ish].center_idx; atomicAdd( EXC_GRAD + 3*iCen + 0, g_acc_x ); atomicAdd( EXC_GRAD + 3*iCen + 1, g_acc_y ); @@ -83,8 +105,105 @@ __global__ __launch_bounds__(1024,1) void increment_exc_grad_lda_kernel( } -void increment_exc_grad_lda( size_t nshell, ShellToTaskDevice* shell_to_task, - XCDeviceTask* device_tasks, double* EXC_GRAD, device_queue queue ) { +template +__global__ __launch_bounds__(1024,1) void increment_exc_grad_lda_uks_kernel( + uint32_t nshell, + ShellToTaskDevice* __restrict__ shell_to_task, + XCDeviceTask* __restrict__ device_tasks, + double* __restrict__ EXC_GRAD +) { + + for( uint32_t ish = blockIdx.z; ish < nshell; ish += gridDim.z ) { + const uint32_t ntasks = shell_to_task[ish].ntask; + const auto shell = shell_to_task[ish].shell_device; + const auto task_idx = shell_to_task[ish].task_idx_device; + const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device; + const int iCen = shell_to_task[ish].center_idx; + const uint32_t shsz = shell->size(); + + const int global_warp_id = + (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size; + const int nwarp_global = max((blockDim.x*gridDim.x) / cuda::warp_size,1); + + double g_acc_x(0), g_acc_y(0), g_acc_z(0); + for( uint32_t itask = global_warp_id; itask < ntasks; itask += nwarp_global ) { + + const auto* task = device_tasks + task_idx[itask]; + const uint32_t npts = task->npts; + const size_t shoff = task_shell_offs[itask] * npts; + const int iParent = task->iParent; + if constexpr( with_weight_derivatives ) { + if( iCen == iParent ) + continue; + } + double g_acc_x_task(0), g_acc_y_task(0), g_acc_z_task(0); + + const auto* __restrict__ basis_x_eval = task->dbfx + shoff; + const auto* __restrict__ basis_y_eval = task->dbfy + shoff; + const auto* __restrict__ basis_z_eval = task->dbfz + shoff; + + const auto* __restrict__ xmatS = task->xmatS + shoff; + const auto* __restrict__ xmatZ = task->xmatZ + shoff; + const auto* __restrict__ vrhop = task->vrho_pos; + const auto* __restrict__ vrhom = task->vrho_neg; + + #pragma unroll 1 + for( uint32_t ipt = threadIdx.x % cuda::warp_size; + ipt < npts; + ipt += cuda::warp_size ) { + + const double vrhop_i = vrhop[ipt]; + const double vrhom_i = vrhom[ipt]; + + const auto vrhoS_i = 0.5 * (vrhop_i + vrhom_i); + const auto vrhoZ_i = 0.5 * (vrhop_i - vrhom_i); + for( uint32_t ibf = 0; ibf < shsz; ++ibf ) { + const double zS_mu_i = vrhoS_i * xmatS[ipt + ibf*npts]; + const double zZ_mu_i = vrhoZ_i * xmatZ[ipt + ibf*npts]; + const double dbfx_mu_i = basis_x_eval[ipt + ibf*npts]; + const double dbfy_mu_i = basis_y_eval[ipt + ibf*npts]; + const double dbfz_mu_i = basis_z_eval[ipt + ibf*npts]; + + g_acc_x_task += zS_mu_i * dbfx_mu_i; + g_acc_y_task += zS_mu_i * dbfy_mu_i; + g_acc_z_task += zS_mu_i * dbfz_mu_i; + g_acc_x_task += zZ_mu_i * dbfx_mu_i; + g_acc_y_task += zZ_mu_i * dbfy_mu_i; + g_acc_z_task += zZ_mu_i * dbfz_mu_i; + } // Loop over bfns within a shell + + } // Loop over points + + g_acc_x += g_acc_x_task; + g_acc_y += g_acc_y_task; + g_acc_z += g_acc_z_task; + + //write to Parent atom with translational invariance + if constexpr( with_weight_derivatives ) { + atomicAdd( EXC_GRAD + 3*iParent + 0, 2.0 * g_acc_x_task ); + atomicAdd( EXC_GRAD + 3*iParent + 1, 2.0 * g_acc_y_task ); + atomicAdd( EXC_GRAD + 3*iParent + 2, 2.0 * g_acc_z_task ); + } + + } // Loop over tasks assigned to shell + + constexpr auto warp_size = cuda::warp_size; + g_acc_x = -2. 
* cuda::warp_reduce_sum( g_acc_x ); + g_acc_y = -2. * cuda::warp_reduce_sum( g_acc_y ); + g_acc_z = -2. * cuda::warp_reduce_sum( g_acc_z ); + + if( (threadIdx.x % cuda::warp_size) == 0 ) { + atomicAdd( EXC_GRAD + 3*iCen + 0, g_acc_x ); + atomicAdd( EXC_GRAD + 3*iCen + 1, g_acc_y ); + atomicAdd( EXC_GRAD + 3*iCen + 2, g_acc_z ); + } + + } // Loop over shells + +} + +void increment_exc_grad_lda( integrator_ks_scheme ks_scheme, size_t nshell, ShellToTaskDevice* shell_to_task, + XCDeviceTask* device_tasks, double* EXC_GRAD, bool with_weight_derivatives, device_queue queue ) { cudaStream_t stream = queue.queue_as(); #if 0 @@ -98,9 +217,31 @@ void increment_exc_grad_lda( size_t nshell, ShellToTaskDevice* shell_to_task, dim3 threads(1024), blocks(1,1,nshell); #endif - increment_exc_grad_lda_kernel<<>>( - nshell, shell_to_task, device_tasks, EXC_GRAD - ); + switch(ks_scheme) { + case RKS: + if (with_weight_derivatives) { + increment_exc_grad_lda_rks_kernel<<>>( + nshell, shell_to_task, device_tasks, EXC_GRAD + ); + } else { + increment_exc_grad_lda_rks_kernel<<>>( + nshell, shell_to_task, device_tasks, EXC_GRAD + ); + } + break; + case UKS: + if (with_weight_derivatives) { + increment_exc_grad_lda_uks_kernel<<>>( + nshell, shell_to_task, device_tasks, EXC_GRAD + ); + } else { + increment_exc_grad_lda_uks_kernel<<>>( + nshell, shell_to_task, device_tasks, EXC_GRAD + ); + } + break; + default: GAUXC_GENERIC_EXCEPTION("LDA EXC GRAD + GKS NYI"); + } } @@ -117,11 +258,138 @@ void increment_exc_grad_lda( size_t nshell, ShellToTaskDevice* shell_to_task, +template +__global__ __launch_bounds__(512,1) void increment_exc_grad_gga_rks_kernel( + uint32_t nshell, + ShellToTaskDevice* __restrict__ shell_to_task, + XCDeviceTask* __restrict__ device_tasks, + double* __restrict__ EXC_GRAD +) { + + for( uint32_t ish = blockIdx.z; ish < nshell; ish += gridDim.z ) { + const uint32_t ntasks = shell_to_task[ish].ntask; + const auto shell = shell_to_task[ish].shell_device; + const auto task_idx = shell_to_task[ish].task_idx_device; + const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device; + const int iCen = shell_to_task[ish].center_idx; + const uint32_t shsz = shell->size(); + + const int global_warp_id = + (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size; + const int nwarp_global = max((blockDim.x*gridDim.x) / cuda::warp_size,1); + + double g_acc_x(0), g_acc_y(0), g_acc_z(0); + for( uint32_t itask = global_warp_id; itask < ntasks; itask += nwarp_global ) { + + const auto* task = device_tasks + task_idx[itask]; + const uint32_t npts = task->npts; + const size_t shoff = task_shell_offs[itask] * npts; + const int iParent = task->iParent; + if constexpr( with_weight_derivatives ) { + if( iCen == iParent ) + continue; + } + double g_acc_x_task(0), g_acc_y_task(0), g_acc_z_task(0); + + const auto* __restrict__ basis_x_eval = task->dbfx + shoff; + const auto* __restrict__ basis_y_eval = task->dbfy + shoff; + const auto* __restrict__ basis_z_eval = task->dbfz + shoff; + + const auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; + const auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; + const auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; + const auto* __restrict__ basis_yy_eval = task->d2bfyy + shoff; + const auto* __restrict__ basis_yz_eval = task->d2bfyz + shoff; + const auto* __restrict__ basis_zz_eval = task->d2bfzz + shoff; + + const auto* __restrict__ xmat = task->zmat + shoff; + const auto* __restrict__ xmat_x = task->xmat_x + shoff; + const auto* __restrict__ xmat_y = 
task->xmat_y + shoff; + const auto* __restrict__ xmat_z = task->xmat_z + shoff; + + const auto* __restrict__ vrho = task->vrho; + const auto* __restrict__ vgamma = task->vgamma; + + const auto* __restrict__ den_x = task->dden_sx; + const auto* __restrict__ den_y = task->dden_sy; + const auto* __restrict__ den_z = task->dden_sz; + + #pragma unroll 1 + for( uint32_t ipt = threadIdx.x % cuda::warp_size; + ipt < npts; + ipt += cuda::warp_size ) { + + const double vrho_i = vrho[ipt]; + const double vgamma_i = vgamma[ipt]; + + const double denx_i = den_x[ipt]; + const double deny_i = den_y[ipt]; + const double denz_i = den_z[ipt]; + for( uint32_t ibf = 0; ibf < shsz; ++ibf ) { + const double z_mu_i = xmat[ipt + ibf*npts]; + const double dbfx_mu_i = basis_x_eval[ipt + ibf*npts]; + const double dbfy_mu_i = basis_y_eval[ipt + ibf*npts]; + const double dbfz_mu_i = basis_z_eval[ipt + ibf*npts]; + + g_acc_x_task += vrho_i * z_mu_i * dbfx_mu_i; + g_acc_y_task += vrho_i * z_mu_i * dbfy_mu_i; + g_acc_z_task += vrho_i * z_mu_i * dbfz_mu_i; + + const double zx = xmat_x[ipt + ibf*npts]; + const double zy = xmat_y[ipt + ibf*npts]; + const double zz = xmat_z[ipt + ibf*npts]; + + const double d11_xmat_term = denx_i * zx + deny_i * zy + denz_i * zz; + + const double d2bfxx = basis_xx_eval[ipt + ibf*npts]; + const double d2bfxy = basis_xy_eval[ipt + ibf*npts]; + const double d2bfxz = basis_xz_eval[ipt + ibf*npts]; + const double d2bfyy = basis_yy_eval[ipt + ibf*npts]; + const double d2bfyz = basis_yz_eval[ipt + ibf*npts]; + const double d2bfzz = basis_zz_eval[ipt + ibf*npts]; + + const double d2_term_x = d2bfxx*denx_i + d2bfxy*deny_i + d2bfxz*denz_i; + const double d2_term_y = d2bfxy*denx_i + d2bfyy*deny_i + d2bfyz*denz_i; + const double d2_term_z = d2bfxz*denx_i + d2bfyz*deny_i + d2bfzz*denz_i; + + g_acc_x_task += 2 * vgamma_i * ( z_mu_i * d2_term_x + dbfx_mu_i * d11_xmat_term ); + g_acc_y_task += 2 * vgamma_i * ( z_mu_i * d2_term_y + dbfy_mu_i * d11_xmat_term ); + g_acc_z_task += 2 * vgamma_i * ( z_mu_i * d2_term_z + dbfz_mu_i * d11_xmat_term ); + + } // Loop over bfns within a shell + + } // Loop over points + + g_acc_x += g_acc_x_task; + g_acc_y += g_acc_y_task; + g_acc_z += g_acc_z_task; + + //write to Parent atom with translational invariance + if constexpr( with_weight_derivatives ) { + atomicAdd( EXC_GRAD + 3*iParent + 0, 2.0 * g_acc_x_task ); + atomicAdd( EXC_GRAD + 3*iParent + 1, 2.0 * g_acc_y_task ); + atomicAdd( EXC_GRAD + 3*iParent + 2, 2.0 * g_acc_z_task ); + } + + } // Loop over tasks assigned to shell + + constexpr auto warp_size = cuda::warp_size; + g_acc_x = -2. * cuda::warp_reduce_sum( g_acc_x ); + g_acc_y = -2. * cuda::warp_reduce_sum( g_acc_y ); + g_acc_z = -2. 
* cuda::warp_reduce_sum( g_acc_z ); + if( (threadIdx.x % cuda::warp_size) == 0 ) { + atomicAdd( EXC_GRAD + 3*iCen + 0, g_acc_x ); + atomicAdd( EXC_GRAD + 3*iCen + 1, g_acc_y ); + atomicAdd( EXC_GRAD + 3*iCen + 2, g_acc_z ); + } + } // Loop over shells +} -__global__ __launch_bounds__(512,1) void increment_exc_grad_gga_kernel( +template +__global__ __launch_bounds__(512,1) void increment_exc_grad_gga_uks_kernel( uint32_t nshell, ShellToTaskDevice* __restrict__ shell_to_task, XCDeviceTask* __restrict__ device_tasks, @@ -133,6 +401,7 @@ __global__ __launch_bounds__(512,1) void increment_exc_grad_gga_kernel( const auto shell = shell_to_task[ish].shell_device; const auto task_idx = shell_to_task[ish].task_idx_device; const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device; + const int iCen = shell_to_task[ish].center_idx; const uint32_t shsz = shell->size(); const int global_warp_id = @@ -145,6 +414,12 @@ __global__ __launch_bounds__(512,1) void increment_exc_grad_gga_kernel( const auto* task = device_tasks + task_idx[itask]; const uint32_t npts = task->npts; const size_t shoff = task_shell_offs[itask] * npts; + const int iParent = task->iParent; + if constexpr( with_weight_derivatives ) { + if( iCen == iParent ) + continue; + } + double g_acc_x_task(0), g_acc_y_task(0), g_acc_z_task(0); const auto* __restrict__ basis_x_eval = task->dbfx + shoff; const auto* __restrict__ basis_y_eval = task->dbfy + shoff; @@ -157,13 +432,229 @@ __global__ __launch_bounds__(512,1) void increment_exc_grad_gga_kernel( const auto* __restrict__ basis_yz_eval = task->d2bfyz + shoff; const auto* __restrict__ basis_zz_eval = task->d2bfzz + shoff; - const auto* __restrict__ xmat = task->zmat + shoff; + const auto* __restrict__ xmatS = task->xmatS + shoff; + const auto* __restrict__ xmatS_x = task->xmatS_x + shoff; + const auto* __restrict__ xmatS_y = task->xmatS_y + shoff; + const auto* __restrict__ xmatS_z = task->xmatS_z + shoff; + + const auto* __restrict__ xmatZ = task->xmatZ + shoff; + const auto* __restrict__ xmatZ_x = task->xmatZ_x + shoff; + const auto* __restrict__ xmatZ_y = task->xmatZ_y + shoff; + const auto* __restrict__ xmatZ_z = task->xmatZ_z + shoff; + + const auto* __restrict__ vrhop = task->vrho_pos; + const auto* __restrict__ vrhom = task->vrho_neg; + + const auto* __restrict__ vgamma_pp = task->vgamma_pp; + const auto* __restrict__ vgamma_pm = task->vgamma_pm; + const auto* __restrict__ vgamma_mm = task->vgamma_mm; + + const auto* __restrict__ dens_x = task->dden_sx; + const auto* __restrict__ dens_y = task->dden_sy; + const auto* __restrict__ dens_z = task->dden_sz; + + const auto* __restrict__ denz_x = task->dden_zx; + const auto* __restrict__ denz_y = task->dden_zy; + const auto* __restrict__ denz_z = task->dden_zz; + + #pragma unroll 1 + for( uint32_t ipt = threadIdx.x % cuda::warp_size; + ipt < npts; + ipt += cuda::warp_size ) { + + const double vrhop_i = vrhop[ipt]; + const double vrhom_i = vrhom[ipt]; + const double vrhoS_i = 0.5 * (vrhop_i + vrhom_i); + const double vrhoZ_i = 0.5 * (vrhop_i - vrhom_i); + + const double vgammapp_i = vgamma_pp[ipt]; + const double vgammapm_i = vgamma_pm[ipt]; + const double vgammamm_i = vgamma_mm[ipt]; + + const double denSx_i = dens_x[ipt]; + const double denSy_i = dens_y[ipt]; + const double denSz_i = dens_z[ipt]; + const double denZx_i = denz_x[ipt]; + const double denZy_i = denz_y[ipt]; + const double denZz_i = denz_z[ipt]; + + for( uint32_t ibf = 0; ibf < shsz; ++ibf ) { + const double xN = xmatS[ipt + ibf*npts]; + const double xZ = xmatZ[ipt 
+ ibf*npts]; + const double dbfx_mu_i = basis_x_eval[ipt + ibf*npts]; + const double dbfy_mu_i = basis_y_eval[ipt + ibf*npts]; + const double dbfz_mu_i = basis_z_eval[ipt + ibf*npts]; + + g_acc_x_task += vrhoS_i * xN * dbfx_mu_i; + g_acc_y_task += vrhoS_i * xN * dbfy_mu_i; + g_acc_z_task += vrhoS_i * xN * dbfz_mu_i; + g_acc_x_task += vrhoZ_i * xZ * dbfx_mu_i; + g_acc_y_task += vrhoZ_i * xZ * dbfy_mu_i; + g_acc_z_task += vrhoZ_i * xZ * dbfz_mu_i; + + const double xNx = xmatS_x[ipt + ibf*npts]; + const double xNy = xmatS_y[ipt + ibf*npts]; + const double xNz = xmatS_z[ipt + ibf*npts]; + const double xZx = xmatZ_x[ipt + ibf*npts]; + const double xZy = xmatZ_y[ipt + ibf*npts]; + const double xZz = xmatZ_z[ipt + ibf*npts]; + + const double d11nn_xmat_term = denSx_i * xNx + denSy_i * xNy + denSz_i * xNz; + const double d11nz_xmat_term = denSx_i * xZx + denSy_i * xZy + denSz_i * xZz; + const double d11zn_xmat_term = denZx_i * xNx + denZy_i * xNy + denZz_i * xNz; + const double d11zz_xmat_term = denZx_i * xZx + denZy_i * xZy + denZz_i * xZz; + + const double d2bfxx = basis_xx_eval[ipt + ibf*npts]; + const double d2bfxy = basis_xy_eval[ipt + ibf*npts]; + const double d2bfxz = basis_xz_eval[ipt + ibf*npts]; + const double d2bfyy = basis_yy_eval[ipt + ibf*npts]; + const double d2bfyz = basis_yz_eval[ipt + ibf*npts]; + const double d2bfzz = basis_zz_eval[ipt + ibf*npts]; + + const double d2n_term_x = d2bfxx*denSx_i + d2bfxy*denSy_i + d2bfxz*denSz_i; + const double d2n_term_y = d2bfxy*denSx_i + d2bfyy*denSy_i + d2bfyz*denSz_i; + const double d2n_term_z = d2bfxz*denSx_i + d2bfyz*denSy_i + d2bfzz*denSz_i; + const double d2z_term_x = d2bfxx*denZx_i + d2bfxy*denZy_i + d2bfxz*denZz_i; + const double d2z_term_y = d2bfxy*denZx_i + d2bfyy*denZy_i + d2bfyz*denZz_i; + const double d2z_term_z = d2bfxz*denZx_i + d2bfyz*denZy_i + d2bfzz*denZz_i; + + g_acc_x_task += 0.5 * (vgammapp_i + vgammapm_i + vgammamm_i) * (d2n_term_x * xN + d11nn_xmat_term * dbfx_mu_i); + g_acc_x_task += 0.5 * (vgammapp_i - vgammamm_i) * (d2z_term_x * xN + d11zn_xmat_term * dbfx_mu_i); + g_acc_x_task += 0.5 * (vgammapp_i - vgammamm_i) * (d2n_term_x * xZ + d11nz_xmat_term * dbfx_mu_i); + g_acc_x_task += 0.5 * (vgammapp_i - vgammapm_i + vgammamm_i) * (d2z_term_x * xZ + d11zz_xmat_term * dbfx_mu_i); + + g_acc_y_task += 0.5 * (vgammapp_i + vgammapm_i + vgammamm_i) * (d2n_term_y * xN + d11nn_xmat_term * dbfy_mu_i); + g_acc_y_task += 0.5 * (vgammapp_i - vgammamm_i) * (d2z_term_y * xN + d11zn_xmat_term * dbfy_mu_i); + g_acc_y_task += 0.5 * (vgammapp_i - vgammamm_i) * (d2n_term_y * xZ + d11nz_xmat_term * dbfy_mu_i); + g_acc_y_task += 0.5 * (vgammapp_i - vgammapm_i + vgammamm_i) * (d2z_term_y * xZ + d11zz_xmat_term * dbfy_mu_i); + + g_acc_z_task += 0.5 * (vgammapp_i + vgammapm_i + vgammamm_i) * (d2n_term_z * xN + d11nn_xmat_term * dbfz_mu_i); + g_acc_z_task += 0.5 * (vgammapp_i - vgammamm_i) * (d2z_term_z * xN + d11zn_xmat_term * dbfz_mu_i); + g_acc_z_task += 0.5 * (vgammapp_i - vgammamm_i) * (d2n_term_z * xZ + d11nz_xmat_term * dbfz_mu_i); + g_acc_z_task += 0.5 * (vgammapp_i - vgammapm_i + vgammamm_i) * (d2z_term_z * xZ + d11zz_xmat_term * dbfz_mu_i); + + }// Loop over bfns within a shell + + } // Loop over points + + g_acc_x += g_acc_x_task; + g_acc_y += g_acc_y_task; + g_acc_z += g_acc_z_task; + + //write to Parent atom with translational invariance + if constexpr( with_weight_derivatives ) { + atomicAdd( EXC_GRAD + 3*iParent + 0, 2.0 * g_acc_x_task ); + atomicAdd( EXC_GRAD + 3*iParent + 1, 2.0 * g_acc_y_task ); + atomicAdd( EXC_GRAD + 3*iParent + 
2, 2.0 * g_acc_z_task ); + } + + } // Loop over tasks assigned to shell + + constexpr auto warp_size = cuda::warp_size; + g_acc_x = -2. * cuda::warp_reduce_sum( g_acc_x ); + g_acc_y = -2. * cuda::warp_reduce_sum( g_acc_y ); + g_acc_z = -2. * cuda::warp_reduce_sum( g_acc_z ); + + if( (threadIdx.x % cuda::warp_size) == 0 ) { + atomicAdd( EXC_GRAD + 3*iCen + 0, g_acc_x ); + atomicAdd( EXC_GRAD + 3*iCen + 1, g_acc_y ); + atomicAdd( EXC_GRAD + 3*iCen + 2, g_acc_z ); + } + + } // Loop over shells + +} + +void increment_exc_grad_gga( integrator_ks_scheme ks_scheme, size_t nshell, ShellToTaskDevice* shell_to_task, + XCDeviceTask* device_tasks, double* EXC_GRAD, bool with_weight_derivatives, device_queue queue ) { + + cudaStream_t stream = queue.queue_as(); + dim3 threads(512), blocks(1,1,nshell); + + switch(ks_scheme) { + case RKS: + if (with_weight_derivatives) { + increment_exc_grad_gga_rks_kernel<<>>( + nshell, shell_to_task, device_tasks, EXC_GRAD + ); + } else { + increment_exc_grad_gga_rks_kernel<<>>( + nshell, shell_to_task, device_tasks, EXC_GRAD + ); + } + break; + case UKS: + if (with_weight_derivatives) { + increment_exc_grad_gga_uks_kernel<<>>( + nshell, shell_to_task, device_tasks, EXC_GRAD + ); + } else { + increment_exc_grad_gga_uks_kernel<<>>( + nshell, shell_to_task, device_tasks, EXC_GRAD + ); + } + break; + default: GAUXC_GENERIC_EXCEPTION("GGA EXC GRAD + GKS NYI"); + } +} + + + + + + +template +__global__ __launch_bounds__(512,1) void increment_exc_grad_mgga_rks_kernel( + uint32_t nshell, + ShellToTaskDevice* __restrict__ shell_to_task, + XCDeviceTask* __restrict__ device_tasks, + double* __restrict__ EXC_GRAD +) { + + for( uint32_t ish = blockIdx.z; ish < nshell; ish += gridDim.z ) { + const uint32_t ntasks = shell_to_task[ish].ntask; + const auto shell = shell_to_task[ish].shell_device; + const auto task_idx = shell_to_task[ish].task_idx_device; + const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device; + const int iCen = shell_to_task[ish].center_idx; + const uint32_t shsz = shell->size(); + + const int global_warp_id = + (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size; + const int nwarp_global = max((blockDim.x*gridDim.x) / cuda::warp_size,1); + + double g_acc_x(0), g_acc_y(0), g_acc_z(0); + for( uint32_t itask = global_warp_id; itask < ntasks; itask += nwarp_global ) { + + const auto* task = device_tasks + task_idx[itask]; + const uint32_t npts = task->npts; + const size_t shoff = task_shell_offs[itask] * npts; + const int iParent = task->iParent; + if constexpr( with_weight_derivatives ) { + if( iCen == iParent ) + continue; + } + double g_acc_x_task(0), g_acc_y_task(0), g_acc_z_task(0); + + const auto* __restrict__ basis_x_eval = task->dbfx + shoff; + const auto* __restrict__ basis_y_eval = task->dbfy + shoff; + const auto* __restrict__ basis_z_eval = task->dbfz + shoff; + + const auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; + const auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; + const auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; + const auto* __restrict__ basis_yy_eval = task->d2bfyy + shoff; + const auto* __restrict__ basis_yz_eval = task->d2bfyz + shoff; + const auto* __restrict__ basis_zz_eval = task->d2bfzz + shoff; + + const auto* __restrict__ xmat = task->zmat + shoff; const auto* __restrict__ xmat_x = task->xmat_x + shoff; const auto* __restrict__ xmat_y = task->xmat_y + shoff; const auto* __restrict__ xmat_z = task->xmat_z + shoff; - const auto* __restrict__ vrho = task->vrho; + const auto* __restrict__ 
vrho = task->vrho; const auto* __restrict__ vgamma = task->vgamma; + const auto* __restrict__ vtau = task->vtau; const auto* __restrict__ den_x = task->dden_sx; const auto* __restrict__ den_y = task->dden_sy; @@ -176,6 +667,7 @@ __global__ __launch_bounds__(512,1) void increment_exc_grad_gga_kernel( const double vrho_i = vrho[ipt]; const double vgamma_i = vgamma[ipt]; + const double vtau_i = 0.5 * vtau[ipt]; const double denx_i = den_x[ipt]; const double deny_i = den_y[ipt]; @@ -186,15 +678,15 @@ __global__ __launch_bounds__(512,1) void increment_exc_grad_gga_kernel( const double dbfy_mu_i = basis_y_eval[ipt + ibf*npts]; const double dbfz_mu_i = basis_z_eval[ipt + ibf*npts]; - g_acc_x += vrho_i * z_mu_i * dbfx_mu_i; - g_acc_y += vrho_i * z_mu_i * dbfy_mu_i; - g_acc_z += vrho_i * z_mu_i * dbfz_mu_i; + g_acc_x_task += vrho_i * z_mu_i * dbfx_mu_i; + g_acc_y_task += vrho_i * z_mu_i * dbfy_mu_i; + g_acc_z_task += vrho_i * z_mu_i * dbfz_mu_i; const double zx = xmat_x[ipt + ibf*npts]; const double zy = xmat_y[ipt + ibf*npts]; const double zz = xmat_z[ipt + ibf*npts]; - const double d11_xmat_term = denx_i * zx + deny_i * zy + denz_i * zz; + const double d11_xmat_term = denx_i * zx + deny_i * zy + denz_i * zz; const double d2bfxx = basis_xx_eval[ipt + ibf*npts]; const double d2bfxy = basis_xy_eval[ipt + ibf*npts]; @@ -203,18 +695,41 @@ __global__ __launch_bounds__(512,1) void increment_exc_grad_gga_kernel( const double d2bfyz = basis_yz_eval[ipt + ibf*npts]; const double d2bfzz = basis_zz_eval[ipt + ibf*npts]; - const double d2_term_x = d2bfxx*denx_i + d2bfxy*deny_i + d2bfxz*denz_i; - const double d2_term_y = d2bfxy*denx_i + d2bfyy*deny_i + d2bfyz*denz_i; - const double d2_term_z = d2bfxz*denx_i + d2bfyz*deny_i + d2bfzz*denz_i; + { + const double d2_term_x = d2bfxx*denx_i + d2bfxy*deny_i + d2bfxz*denz_i; + const double d2_term_y = d2bfxy*denx_i + d2bfyy*deny_i + d2bfyz*denz_i; + const double d2_term_z = d2bfxz*denx_i + d2bfyz*deny_i + d2bfzz*denz_i; + + g_acc_x_task += 2 * vgamma_i * ( z_mu_i * d2_term_x + dbfx_mu_i * d11_xmat_term ); + g_acc_y_task += 2 * vgamma_i * ( z_mu_i * d2_term_y + dbfy_mu_i * d11_xmat_term ); + g_acc_z_task += 2 * vgamma_i * ( z_mu_i * d2_term_z + dbfz_mu_i * d11_xmat_term ); + } + + { + const double d2_term_x = d2bfxx*zx + d2bfxy*zy + d2bfxz*zz; + const double d2_term_y = d2bfxy*zx + d2bfyy*zy + d2bfyz*zz; + const double d2_term_z = d2bfxz*zx + d2bfyz*zy + d2bfzz*zz; - g_acc_x += 2 * vgamma_i * ( z_mu_i * d2_term_x + dbfx_mu_i * d11_xmat_term ); - g_acc_y += 2 * vgamma_i * ( z_mu_i * d2_term_y + dbfy_mu_i * d11_xmat_term ); - g_acc_z += 2 * vgamma_i * ( z_mu_i * d2_term_z + dbfz_mu_i * d11_xmat_term ); + g_acc_x_task += vtau_i * d2_term_x; + g_acc_y_task += vtau_i * d2_term_y; + g_acc_z_task += vtau_i * d2_term_z; + } } // Loop over bfns within a shell } // Loop over points + g_acc_x += g_acc_x_task; + g_acc_y += g_acc_y_task; + g_acc_z += g_acc_z_task; + + //write to Parent atom with translational invariance + if constexpr( with_weight_derivatives ) { + atomicAdd( EXC_GRAD + 3*iParent + 0, 2.0 * g_acc_x_task ); + atomicAdd( EXC_GRAD + 3*iParent + 1, 2.0 * g_acc_y_task ); + atomicAdd( EXC_GRAD + 3*iParent + 2, 2.0 * g_acc_z_task ); + } + } // Loop over tasks assigned to shell constexpr auto warp_size = cuda::warp_size; @@ -223,8 +738,206 @@ __global__ __launch_bounds__(512,1) void increment_exc_grad_gga_kernel( g_acc_z = -2. 
* cuda::warp_reduce_sum( g_acc_z ); if( (threadIdx.x % cuda::warp_size) == 0 ) { - const int iCen = shell_to_task[ish].center_idx; + atomicAdd( EXC_GRAD + 3*iCen + 0, g_acc_x ); + atomicAdd( EXC_GRAD + 3*iCen + 1, g_acc_y ); + atomicAdd( EXC_GRAD + 3*iCen + 2, g_acc_z ); + } + + } // Loop over shells + +} + +template +__global__ __launch_bounds__(512,1) void increment_exc_grad_mgga_uks_kernel( + uint32_t nshell, + ShellToTaskDevice* __restrict__ shell_to_task, + XCDeviceTask* __restrict__ device_tasks, + double* __restrict__ EXC_GRAD +) { + for( uint32_t ish = blockIdx.z; ish < nshell; ish += gridDim.z ) { + const uint32_t ntasks = shell_to_task[ish].ntask; + const auto shell = shell_to_task[ish].shell_device; + const auto task_idx = shell_to_task[ish].task_idx_device; + const auto task_shell_offs = shell_to_task[ish].task_shell_offs_device; + const int iCen = shell_to_task[ish].center_idx; + const uint32_t shsz = shell->size(); + + const int global_warp_id = + (threadIdx.x + blockIdx.x*blockDim.x) / cuda::warp_size; + const int nwarp_global = max((blockDim.x*gridDim.x) / cuda::warp_size,1); + + double g_acc_x(0), g_acc_y(0), g_acc_z(0); + for( uint32_t itask = global_warp_id; itask < ntasks; itask += nwarp_global ) { + const auto* task = device_tasks + task_idx[itask]; + const uint32_t npts = task->npts; + const size_t shoff = task_shell_offs[itask] * npts; + const int iParent = task->iParent; + if constexpr( with_weight_derivatives ) { + if( iCen == iParent ) + continue; + } + double g_acc_x_task(0), g_acc_y_task(0), g_acc_z_task(0); + + const auto* __restrict__ basis_x_eval = task->dbfx + shoff; + const auto* __restrict__ basis_y_eval = task->dbfy + shoff; + const auto* __restrict__ basis_z_eval = task->dbfz + shoff; + + const auto* __restrict__ basis_xx_eval = task->d2bfxx + shoff; + const auto* __restrict__ basis_xy_eval = task->d2bfxy + shoff; + const auto* __restrict__ basis_xz_eval = task->d2bfxz + shoff; + const auto* __restrict__ basis_yy_eval = task->d2bfyy + shoff; + const auto* __restrict__ basis_yz_eval = task->d2bfyz + shoff; + const auto* __restrict__ basis_zz_eval = task->d2bfzz + shoff; + + const auto* __restrict__ xmatS = task->xmatS + shoff; + const auto* __restrict__ xmatS_x = task->xmatS_x + shoff; + const auto* __restrict__ xmatS_y = task->xmatS_y + shoff; + const auto* __restrict__ xmatS_z = task->xmatS_z + shoff; + + const auto* __restrict__ xmatZ = task->xmatZ + shoff; + const auto* __restrict__ xmatZ_x = task->xmatZ_x + shoff; + const auto* __restrict__ xmatZ_y = task->xmatZ_y + shoff; + const auto* __restrict__ xmatZ_z = task->xmatZ_z + shoff; + + const auto* __restrict__ vrhop = task->vrho_pos; + const auto* __restrict__ vrhom = task->vrho_neg; + const auto* __restrict__ vtaup = task->vtau_pos; + const auto* __restrict__ vtaum = task->vtau_neg; + + const auto* __restrict__ vgamma_pp = task->vgamma_pp; + const auto* __restrict__ vgamma_pm = task->vgamma_pm; + const auto* __restrict__ vgamma_mm = task->vgamma_mm; + + const auto* __restrict__ dens_x = task->dden_sx; + const auto* __restrict__ dens_y = task->dden_sy; + const auto* __restrict__ dens_z = task->dden_sz; + + const auto* __restrict__ denz_x = task->dden_zx; + const auto* __restrict__ denz_y = task->dden_zy; + const auto* __restrict__ denz_z = task->dden_zz; + + #pragma unroll 1 + for( uint32_t ipt = threadIdx.x % cuda::warp_size; + ipt < npts; + ipt += cuda::warp_size ) { + + const double vrhop_i = vrhop[ipt]; + const double vrhom_i = vrhom[ipt]; + const double vrhoS_i = 0.5 * (vrhop_i + vrhom_i); + 
const double vrhoZ_i = 0.5 * (vrhop_i - vrhom_i); + + const double vtaup_i = 0.5 * vtaup[ipt]; + const double vtaum_i = 0.5 * vtaum[ipt]; + const double vtauS_i = 0.5 * (vtaup_i + vtaum_i); + const double vtauZ_i = 0.5 * (vtaup_i - vtaum_i); + + const double vgammapp_i = vgamma_pp[ipt]; + const double vgammapm_i = vgamma_pm[ipt]; + const double vgammamm_i = vgamma_mm[ipt]; + + const double denSx_i = dens_x[ipt]; + const double denSy_i = dens_y[ipt]; + const double denSz_i = dens_z[ipt]; + const double denZx_i = denz_x[ipt]; + const double denZy_i = denz_y[ipt]; + const double denZz_i = denz_z[ipt]; + + for( uint32_t ibf = 0; ibf < shsz; ++ibf ) { + const double xN = xmatS[ipt + ibf*npts]; + const double xZ = xmatZ[ipt + ibf*npts]; + const double dbfx_mu_i = basis_x_eval[ipt + ibf*npts]; + const double dbfy_mu_i = basis_y_eval[ipt + ibf*npts]; + const double dbfz_mu_i = basis_z_eval[ipt + ibf*npts]; + + g_acc_x_task += vrhoS_i * xN * dbfx_mu_i; + g_acc_y_task += vrhoS_i * xN * dbfy_mu_i; + g_acc_z_task += vrhoS_i * xN * dbfz_mu_i; + g_acc_x_task += vrhoZ_i * xZ * dbfx_mu_i; + g_acc_y_task += vrhoZ_i * xZ * dbfy_mu_i; + g_acc_z_task += vrhoZ_i * xZ * dbfz_mu_i; + + const double xNx = xmatS_x[ipt + ibf*npts]; + const double xNy = xmatS_y[ipt + ibf*npts]; + const double xNz = xmatS_z[ipt + ibf*npts]; + const double xZx = xmatZ_x[ipt + ibf*npts]; + const double xZy = xmatZ_y[ipt + ibf*npts]; + const double xZz = xmatZ_z[ipt + ibf*npts]; + + const double d11nn_xmat_term = denSx_i * xNx + denSy_i * xNy + denSz_i * xNz; + const double d11nz_xmat_term = denSx_i * xZx + denSy_i * xZy + denSz_i * xZz; + const double d11zn_xmat_term = denZx_i * xNx + denZy_i * xNy + denZz_i * xNz; + const double d11zz_xmat_term = denZx_i * xZx + denZy_i * xZy + denZz_i * xZz; + + const double d2bfxx = basis_xx_eval[ipt + ibf*npts]; + const double d2bfxy = basis_xy_eval[ipt + ibf*npts]; + const double d2bfxz = basis_xz_eval[ipt + ibf*npts]; + const double d2bfyy = basis_yy_eval[ipt + ibf*npts]; + const double d2bfyz = basis_yz_eval[ipt + ibf*npts]; + const double d2bfzz = basis_zz_eval[ipt + ibf*npts]; + + { + const double d2n_term_x = d2bfxx*denSx_i + d2bfxy*denSy_i + d2bfxz*denSz_i; + const double d2n_term_y = d2bfxy*denSx_i + d2bfyy*denSy_i + d2bfyz*denSz_i; + const double d2n_term_z = d2bfxz*denSx_i + d2bfyz*denSy_i + d2bfzz*denSz_i; + const double d2z_term_x = d2bfxx*denZx_i + d2bfxy*denZy_i + d2bfxz*denZz_i; + const double d2z_term_y = d2bfxy*denZx_i + d2bfyy*denZy_i + d2bfyz*denZz_i; + const double d2z_term_z = d2bfxz*denZx_i + d2bfyz*denZy_i + d2bfzz*denZz_i; + + g_acc_x_task += 0.5 * (vgammapp_i + vgammapm_i + vgammamm_i) * (d2n_term_x * xN + d11nn_xmat_term * dbfx_mu_i); + g_acc_x_task += 0.5 * (vgammapp_i - vgammamm_i) * (d2z_term_x * xN + d11zn_xmat_term * dbfx_mu_i); + g_acc_x_task += 0.5 * (vgammapp_i - vgammamm_i) * (d2n_term_x * xZ + d11nz_xmat_term * dbfx_mu_i); + g_acc_x_task += 0.5 * (vgammapp_i - vgammapm_i + vgammamm_i) * (d2z_term_x * xZ + d11zz_xmat_term * dbfx_mu_i); + + g_acc_y_task += 0.5 * (vgammapp_i + vgammapm_i + vgammamm_i) * (d2n_term_y * xN + d11nn_xmat_term * dbfy_mu_i); + g_acc_y_task += 0.5 * (vgammapp_i - vgammamm_i) * (d2z_term_y * xN + d11zn_xmat_term * dbfy_mu_i); + g_acc_y_task += 0.5 * (vgammapp_i - vgammamm_i) * (d2n_term_y * xZ + d11nz_xmat_term * dbfy_mu_i); + g_acc_y_task += 0.5 * (vgammapp_i - vgammapm_i + vgammamm_i) * (d2z_term_y * xZ + d11zz_xmat_term * dbfy_mu_i); + + g_acc_z_task += 0.5 * (vgammapp_i + vgammapm_i + vgammamm_i) * (d2n_term_z * xN + d11nn_xmat_term * 
dbfz_mu_i); + g_acc_z_task += 0.5 * (vgammapp_i - vgammamm_i) * (d2z_term_z * xN + d11zn_xmat_term * dbfz_mu_i); + g_acc_z_task += 0.5 * (vgammapp_i - vgammamm_i) * (d2n_term_z * xZ + d11nz_xmat_term * dbfz_mu_i); + g_acc_z_task += 0.5 * (vgammapp_i - vgammapm_i + vgammamm_i) * (d2z_term_z * xZ + d11zz_xmat_term * dbfz_mu_i); + } + + { + const double d2n_term_x = d2bfxx*xNx + d2bfxy*xNy + d2bfxz*xNz; + const double d2n_term_y = d2bfxy*xNx + d2bfyy*xNy + d2bfyz*xNz; + const double d2n_term_z = d2bfxz*xNx + d2bfyz*xNy + d2bfzz*xNz; + const double d2z_term_x = d2bfxx*xZx + d2bfxy*xZy + d2bfxz*xZz; + const double d2z_term_y = d2bfxy*xZx + d2bfyy*xZy + d2bfyz*xZz; + const double d2z_term_z = d2bfxz*xZx + d2bfyz*xZy + d2bfzz*xZz; + g_acc_x_task += vtauS_i * d2n_term_x; + g_acc_y_task += vtauS_i * d2n_term_y; + g_acc_z_task += vtauS_i * d2n_term_z; + + g_acc_x_task += vtauZ_i * d2z_term_x; + g_acc_y_task += vtauZ_i * d2z_term_y; + g_acc_z_task += vtauZ_i * d2z_term_z; + } + }// Loop over bfns within a shell + + } // Loop over points + + g_acc_x += g_acc_x_task; + g_acc_y += g_acc_y_task; + g_acc_z += g_acc_z_task; + + //write to Parent atom with translational invariance + if constexpr( with_weight_derivatives ) { + atomicAdd( EXC_GRAD + 3*iParent + 0, 2.0 * g_acc_x_task ); + atomicAdd( EXC_GRAD + 3*iParent + 1, 2.0 * g_acc_y_task ); + atomicAdd( EXC_GRAD + 3*iParent + 2, 2.0 * g_acc_z_task ); + } + + } // Loop over tasks assigned to shell + + constexpr auto warp_size = cuda::warp_size; + g_acc_x = -2. * cuda::warp_reduce_sum( g_acc_x ); + g_acc_y = -2. * cuda::warp_reduce_sum( g_acc_y ); + g_acc_z = -2. * cuda::warp_reduce_sum( g_acc_z ); + + if( (threadIdx.x % cuda::warp_size) == 0 ) { atomicAdd( EXC_GRAD + 3*iCen + 0, g_acc_x ); atomicAdd( EXC_GRAD + 3*iCen + 1, g_acc_y ); atomicAdd( EXC_GRAD + 3*iCen + 2, g_acc_z ); @@ -234,15 +947,40 @@ __global__ __launch_bounds__(512,1) void increment_exc_grad_gga_kernel( } -void increment_exc_grad_gga( size_t nshell, ShellToTaskDevice* shell_to_task, - XCDeviceTask* device_tasks, double* EXC_GRAD, device_queue queue ) { +void increment_exc_grad_mgga( integrator_ks_scheme ks_scheme, size_t nshell, bool need_lapl, + ShellToTaskDevice* shell_to_task, XCDeviceTask* device_tasks, + double* EXC_GRAD, bool with_weight_derivatives, device_queue queue ) { + + if(need_lapl) GAUXC_GENERIC_EXCEPTION("CUDA + MGGA/LAPL EXC GRAD NYI"); cudaStream_t stream = queue.queue_as(); dim3 threads(512), blocks(1,1,nshell); - increment_exc_grad_gga_kernel<<>>( - nshell, shell_to_task, device_tasks, EXC_GRAD - ); + switch(ks_scheme) { + case RKS: + if (with_weight_derivatives) { + increment_exc_grad_mgga_rks_kernel<<>>( + nshell, shell_to_task, device_tasks, EXC_GRAD + ); + } else { + increment_exc_grad_mgga_rks_kernel<<>>( + nshell, shell_to_task, device_tasks, EXC_GRAD + ); + } + break; + case UKS: + if (with_weight_derivatives) { + increment_exc_grad_mgga_uks_kernel<<>>( + nshell, shell_to_task, device_tasks, EXC_GRAD + ); + } else { + increment_exc_grad_mgga_uks_kernel<<>>( + nshell, shell_to_task, device_tasks, EXC_GRAD + ); + } + break; + default: GAUXC_GENERIC_EXCEPTION("GGA EXC GRAD + GKS NYI"); + } } } diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/pack_submat.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/pack_submat.cu index 4cb165a8..a000efe7 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/pack_submat.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/pack_submat.cu @@ -1,7 +1,11 @@ /** * GauXC 
Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/symmetrize_mat.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/symmetrize_mat.cu index 6a5cdfa0..91ccb294 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/symmetrize_mat.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/symmetrize_mat.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars.cu index 6aea5225..3c5f2020 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars.cu @@ -1,422 +1,26 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ #include "device/common/uvvars.hpp" #include "cuda_extensions.hpp" -#include "device_specific/cuda_device_constants.hpp" #include -#include "device_specific/cuda_util.hpp" -#include "device/xc_device_data.hpp" - -namespace GauXC { - -#define VVAR_KERNEL_SM_BLOCK 32 -#define GGA_KERNEL_SM_WARPS 16 -#define MGGA_KERNEL_SM_BLOCK 32 - -__global__ void eval_uvars_lda_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { - // eval_vvars populated uvar storage already in the case of LDA+RKS - return; -} - -__global__ void eval_uvars_lda_uks_kernel( size_t ntasks, - XCDeviceTask* tasks_device ) { - - const int batch_idx = blockIdx.z; - if( batch_idx >= ntasks ) return; - - auto& task = tasks_device[ batch_idx ]; - - const auto npts = task.npts; - - auto* den_pos_eval_device = task.den_s; - auto* den_neg_eval_device = task.den_z; - - - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - - - if( tid < npts ) { - const auto ps = den_pos_eval_device[ tid ]; - const auto pz = den_neg_eval_device[ tid ]; - den_pos_eval_device[ tid ] = 0.5*(ps + pz); - den_neg_eval_device[ tid ] = 0.5*(ps - pz); - - } -} - -__global__ void eval_uvars_lda_gks_kernel( size_t ntasks, - XCDeviceTask* tasks_device ) { - - const int batch_idx = blockIdx.z; - if( batch_idx >= ntasks ) return; - - auto& task = tasks_device[ batch_idx ]; - - const auto npts = task.npts; - - auto* den_z_eval_device = task.den_s; - auto* den_s_eval_device = task.den_z; - auto* den_y_eval_device = task.den_y; - auto* den_x_eval_device = task.den_x; - auto* K_z_eval_device = task.K_z; - auto* K_y_eval_device = task.K_y; - auto* K_x_eval_device = task.K_x; - const double dtolsq = 1e-24; // TODO: make variable - - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - - - if( tid < npts ) { - const auto ps = den_s_eval_device[ tid ]; - const auto pz = den_z_eval_device[ tid ]; - const auto py = den_y_eval_device[ tid ]; - const auto px = den_x_eval_device[ tid ]; - const auto mtemp = pz*pz + px*px + py*py; - double mnorm = 0.; - - if (mtemp > dtolsq) { - const double inv_mnorm = rsqrt(mtemp); - mnorm = 1./inv_mnorm; - K_z_eval_device[ tid ] = pz * inv_mnorm; - K_y_eval_device[ tid ] = py * inv_mnorm; - K_x_eval_device[ tid ] = px * inv_mnorm; - } - else { - mnorm = (1. / 3.) * (px + py + pz); - K_z_eval_device[ tid ] = 1. / 3.; - K_y_eval_device[ tid ] = 1. / 3.; - K_x_eval_device[ tid ] = 1. 
/ 3.; - } - - den_s_eval_device[ tid ] = 0.5*(ps + mnorm); - den_z_eval_device[ tid ] = 0.5*(ps - mnorm); - - } -} - - -__global__ void eval_uvars_gga_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { - const int batch_idx = blockIdx.z; - if( batch_idx >= ntasks ) return; - - const auto& task = tasks_device[ batch_idx ]; - const auto npts = task.npts; - - const auto* dden_sx_eval_device = task.dden_sx; - const auto* dden_sy_eval_device = task.dden_sy; - const auto* dden_sz_eval_device = task.dden_sz; - auto* gamma_eval_device = task.gamma; - - const int tid = threadIdx.x + blockIdx.x * blockDim.x; - - if( tid < npts ) { - const double dx = dden_sx_eval_device[ tid ]; - const double dy = dden_sy_eval_device[ tid ]; - const double dz = dden_sz_eval_device[ tid ]; - - gamma_eval_device[ tid ] = dx*dx + dy*dy + dz*dz; - - } - -} - -__global__ void eval_uvars_gga_uks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { - - const int batch_idx = blockIdx.z; - if( batch_idx >= ntasks ) return; - - const auto& task = tasks_device[ batch_idx ]; - const auto npts = task.npts; - - auto* den_pos_eval_device = task.den_s; - const auto* den_pos_x_eval_device = task.dden_sx; - const auto* den_pos_y_eval_device = task.dden_sy; - const auto* den_pos_z_eval_device = task.dden_sz; - - auto* den_neg_eval_device = task.den_z; - const auto* den_neg_x_eval_device = task.dden_zx; - const auto* den_neg_y_eval_device = task.dden_zy; - const auto* den_neg_z_eval_device = task.dden_zz; - - auto* gamma_pp_eval_device = task.gamma_pp; - auto* gamma_pm_eval_device = task.gamma_pm; - auto* gamma_mm_eval_device = task.gamma_mm; - - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - - if( tid < npts ) { - const double ps = den_pos_eval_device[ tid ]; - const double pz = den_neg_eval_device[ tid ]; - const double dndx = den_pos_x_eval_device[ tid ]; - const double dndy = den_pos_y_eval_device[ tid ]; - const double dndz = den_pos_z_eval_device[ tid ]; - const double dMzdx = den_neg_x_eval_device[ tid ]; - const double dMzdy = den_neg_y_eval_device[ tid ]; - const double dMzdz = den_neg_z_eval_device[ tid ]; - - // (del n).(del n) - const auto dn_sq = dndx*dndx + dndy*dndy + dndz*dndz; - // (del Mz).(del Mz) - const auto dMz_sq = dMzdx*dMzdx + dMzdy*dMzdy + dMzdz*dMzdz; - // (del n).(del Mz) - const auto dn_dMz = dndx*dMzdx + dndy*dMzdy + dndz*dMzdz; - - gamma_pp_eval_device[ tid ] = 0.25*(dn_sq + dMz_sq) + 0.5*dn_dMz; - gamma_pm_eval_device[ tid ] = 0.25*(dn_sq - dMz_sq); - gamma_mm_eval_device[ tid ] = 0.25*(dn_sq + dMz_sq) - 0.5*dn_dMz; - - den_pos_eval_device[ tid ] = 0.5*(ps + pz); - den_neg_eval_device[ tid ] = 0.5*(ps - pz); - } - -} - -__global__ void eval_uvars_gga_gks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { - - const int batch_idx = blockIdx.z; - if( batch_idx >= ntasks ) return; - - const auto& task = tasks_device[ batch_idx ]; - const auto npts = task.npts; - - auto* den_s_eval_device = task.den_s; - const auto* dden_sx_eval_device = task.dden_sx; - const auto* dden_sy_eval_device = task.dden_sy; - const auto* dden_sz_eval_device = task.dden_sz; - - auto* den_z_eval_device = task.den_z; - const auto* dden_zx_eval_device = task.dden_zx; - const auto* dden_zy_eval_device = task.dden_zy; - const auto* dden_zz_eval_device = task.dden_zz; - - const auto* den_y_eval_device = task.den_y; - const auto* dden_yx_eval_device = task.dden_yx; - const auto* dden_yy_eval_device = task.dden_yy; - const auto* dden_yz_eval_device = task.dden_yz; - - const auto* den_x_eval_device = task.den_x; - const 
auto* dden_xx_eval_device = task.dden_xx; - const auto* dden_xy_eval_device = task.dden_xy; - const auto* dden_xz_eval_device = task.dden_xz; - - auto* gamma_pp_eval_device = task.gamma_pp; - auto* gamma_pm_eval_device = task.gamma_pm; - auto* gamma_mm_eval_device = task.gamma_mm; - - auto* H_z_eval_device = task.H_z; - auto* H_y_eval_device = task.H_y; - auto* H_x_eval_device = task.H_x; - auto* K_z_eval_device = task.K_z; - auto* K_y_eval_device = task.K_y; - auto* K_x_eval_device = task.K_x; - - const double dtolsq = 1e-24; // TODO: make variable - - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - - if( tid < npts ) { - const double dndz = dden_sz_eval_device[ tid ]; - const double dndy = dden_sy_eval_device[ tid ]; - const double dndx = dden_sx_eval_device[ tid ]; - - const double dMzdz = dden_zz_eval_device[ tid ]; - const double dMzdy = dden_zy_eval_device[ tid ]; - const double dMzdx = dden_zx_eval_device[ tid ]; - - const double dMydz = dden_yz_eval_device[ tid ]; - const double dMydy = dden_yy_eval_device[ tid ]; - const double dMydx = dden_yx_eval_device[ tid ]; - - const double dMxdz = dden_xz_eval_device[ tid ]; - const double dMxdy = dden_xy_eval_device[ tid ]; - const double dMxdx = dden_xx_eval_device[ tid ]; - - const auto ps = den_s_eval_device[ tid ]; - const auto pz = den_z_eval_device[ tid ]; - const auto py = den_y_eval_device[ tid ]; - const auto px = den_x_eval_device[ tid ]; - - const auto mtemp = pz*pz + px*px + py*py; - double mnorm = 0.; - - const auto dels_dot_dels = dndx * dndx + dndy * dndy + dndz * dndz; - const auto delz_dot_delz = dMzdx * dMzdx + dMzdy * dMzdy + dMzdz * dMzdz; - const auto delx_dot_delx = dMxdx * dMxdx + dMxdy * dMxdy + dMxdz * dMxdz; - const auto dely_dot_dely = dMydx * dMydx + dMydy * dMydy + dMydz * dMydz; - - const auto dels_dot_delz = dndx * dMzdx + dndy * dMzdy + dndz * dMzdz; - const auto dels_dot_delx = dndx * dMxdx + dndy * dMxdy + dndz * dMxdz; - const auto dels_dot_dely = dndx * dMydx + dndy * dMydy + dndz * dMydz; - - const auto sum = delz_dot_delz + delx_dot_delx + dely_dot_dely; - const auto s_sum = - dels_dot_delz * pz + dels_dot_delx * px + dels_dot_dely * py; - - const auto inv_sqsum2 = - rsqrt(dels_dot_delz * dels_dot_delz + dels_dot_delx * dels_dot_delx + - dels_dot_dely * dels_dot_dely); - const auto sqsum2 = 1./inv_sqsum2; - - double sign = 1.; - if( signbit(s_sum)) - sign = -1.; - - - if (mtemp > dtolsq) { - const double inv_mnorm = rsqrt(mtemp); - mnorm = 1./inv_mnorm; - K_z_eval_device[ tid ] = pz * inv_mnorm; - K_y_eval_device[ tid ] = py * inv_mnorm; - K_x_eval_device[ tid ] = px * inv_mnorm; - H_z_eval_device[ tid ] = sign * dels_dot_delz * inv_sqsum2; - H_y_eval_device[ tid ] = sign * dels_dot_dely * inv_sqsum2; - H_x_eval_device[ tid ] = sign * dels_dot_delx * inv_sqsum2; - } - else { - mnorm = (1. / 3.) * (px + py + pz); - K_z_eval_device[ tid ] = 1. / 3.; - K_y_eval_device[ tid ] = 1. / 3.; - K_x_eval_device[ tid ] = 1. 
/ 3.; - - H_z_eval_device[ tid ] = sign / 3.; - H_y_eval_device[ tid ] = sign / 3.; - H_x_eval_device[ tid ] = sign / 3.; - } - - gamma_pp_eval_device[ tid ] = 0.25*(dels_dot_dels + sum) + 0.5*sign*sqsum2; - gamma_pm_eval_device[ tid ] = 0.25*(dels_dot_dels - sum); - gamma_mm_eval_device[ tid ] = 0.25*(dels_dot_dels + sum) - 0.5*sign*sqsum2; - - den_s_eval_device[ tid ] = 0.5*(ps + mnorm); - den_z_eval_device[ tid ] = 0.5*(ps - mnorm); - - } - -} - -template -__global__ void eval_uvars_mgga_rks_kernel( size_t ntasks, - XCDeviceTask* tasks_device ) { - - constexpr auto warp_size = cuda::warp_size; - //constexpr auto max_warps_per_thread_block = cuda::max_warps_per_thread_block; - - const int batch_idx = blockIdx.z; - if( batch_idx >= ntasks ) return; - - auto& task = tasks_device[ batch_idx ]; - - const auto npts = task.npts; - const auto nbf = task.bfn_screening.nbe; - - auto* tau_eval_device = task.tau; - decltype(tau_eval_device) lapl_eval_device = nullptr; - if constexpr (need_lapl) { - lapl_eval_device = task.denlapl; - } - - //const auto* basis_eval_device = task.bf; - const auto* dbasis_x_eval_device = task.dbfx; - const auto* dbasis_y_eval_device = task.dbfy; - const auto* dbasis_z_eval_device = task.dbfz; - decltype(dbasis_x_eval_device) basis_lapl_eval_device = nullptr; - if constexpr (need_lapl) { - basis_lapl_eval_device = task.d2bflapl; - } - - //const auto* den_basis_prod_device = task.zmat; - const auto* den_basis_dx_prod_device = task.xmat_x; - const auto* den_basis_dy_prod_device = task.xmat_y; - const auto* den_basis_dz_prod_device = task.xmat_z; - decltype(den_basis_dx_prod_device) den_basis_prod_device = nullptr; - if constexpr (need_lapl) { - den_basis_prod_device = task.zmat; - } - - __shared__ double den_shared[3+!!need_lapl][warp_size][MGGA_KERNEL_SM_BLOCK+1]; - - for ( int bid_x = blockIdx.x * blockDim.x; - bid_x < nbf; - bid_x += blockDim.x * gridDim.x ) { - - for ( int bid_y = blockIdx.y * MGGA_KERNEL_SM_BLOCK; - bid_y < npts; - bid_y += MGGA_KERNEL_SM_BLOCK * gridDim.y ) { - - for (int sm_y = threadIdx.y; sm_y < MGGA_KERNEL_SM_BLOCK; sm_y += blockDim.y) { - den_shared[0][threadIdx.x][sm_y] = 0.; - den_shared[1][threadIdx.x][sm_y] = 0.; - den_shared[2][threadIdx.x][sm_y] = 0.; - if constexpr (need_lapl) - den_shared[3][threadIdx.x][sm_y] = 0.; - - if (bid_y + threadIdx.x < npts and bid_x + sm_y < nbf) { - const double* db_x_col = den_basis_dx_prod_device + (bid_x + sm_y)*npts; - const double* db_y_col = den_basis_dy_prod_device + (bid_x + sm_y)*npts; - const double* db_z_col = den_basis_dz_prod_device + (bid_x + sm_y)*npts; - - const double* bf_x_col = dbasis_x_eval_device + (bid_x + sm_y)*npts; - const double* bf_y_col = dbasis_y_eval_device + (bid_x + sm_y)*npts; - const double* bf_z_col = dbasis_z_eval_device + (bid_x + sm_y)*npts; - - - den_shared[0][threadIdx.x][sm_y] = bf_x_col[ bid_y + threadIdx.x ] * db_x_col[ bid_y + threadIdx.x ]; - den_shared[1][threadIdx.x][sm_y] = bf_y_col[ bid_y + threadIdx.x ] * db_y_col[ bid_y + threadIdx.x ]; - den_shared[2][threadIdx.x][sm_y] = bf_z_col[ bid_y + threadIdx.x ] * db_z_col[ bid_y + threadIdx.x ]; - - - if constexpr (need_lapl) { - const double* db_col = den_basis_prod_device + (bid_x + sm_y)*npts; - const double* bf_l_col = basis_lapl_eval_device + (bid_x + sm_y)*npts; - den_shared[3][threadIdx.x][sm_y] = bf_l_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; - } - } - } - __syncthreads(); - - - for (int sm_y = threadIdx.y; sm_y < MGGA_KERNEL_SM_BLOCK; sm_y += blockDim.y) { - const int tid_y = bid_y + 
sm_y; - - register double tx_reg = den_shared[0][sm_y][threadIdx.x]; - register double ty_reg = den_shared[1][sm_y][threadIdx.x]; - register double tz_reg = den_shared[2][sm_y][threadIdx.x]; - // Warp blocks are stored col major - register double tau_reg = 0.0; - tau_reg = 0.5 * cuda::warp_reduce_sum( tx_reg ); - tau_reg += 0.5 * cuda::warp_reduce_sum( ty_reg ); - tau_reg += 0.5 * cuda::warp_reduce_sum( tz_reg ); - - register double lapl_reg = 0.0; - if constexpr (need_lapl) { - lapl_reg = den_shared[3][sm_y][threadIdx.x]; - lapl_reg = cuda::warp_reduce_sum(lapl_reg); - lapl_reg = 2. * lapl_reg + 4. * tau_reg; - } - if( threadIdx.x == 0 and tid_y < npts ) { - atomicAdd( tau_eval_device + tid_y, tau_reg ); - if constexpr (need_lapl) { - atomicAdd( lapl_eval_device + tid_y, lapl_reg ); - } - } - } - __syncthreads(); - } - } -} +#include "uvvars_lda.hpp" +#include "uvvars_gga.hpp" +#include "uvvars_mgga.hpp" +namespace GauXC { #define EVAL_UVARS_KERNEL(xc_approx) \ cudaStream_t stream = queue.queue_as(); \ - dim3 blocks( util::div_ceil( npts_max, threads.x ), \ - 1, \ - ntasks ); \ switch ( ks_scheme ) { \ case RKS: \ eval_uvars_##xc_approx##_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); \ @@ -428,264 +32,197 @@ __global__ void eval_uvars_mgga_rks_kernel( size_t ntasks, eval_uvars_##xc_approx##_gks_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); \ break; \ default: \ - GAUXC_GENERIC_EXCEPTION( "Unexpected KS scheme when attempting to evaluate UV vars" ); \ + GAUXC_GENERIC_EXCEPTION( "Unexpected KS scheme when attempting to evaluate U vars" ); \ } + +#define EVAL_TMAT_KERNEL(xc_approx) \ + cudaStream_t stream = queue.queue_as(); \ + switch ( ks_scheme ) { \ + case RKS: \ + eval_tmat_##xc_approx##_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks); \ + break; \ + case UKS: \ + eval_tmat_##xc_approx##_uks_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks); \ + break; \ + case GKS: \ + GAUXC_GENERIC_EXCEPTION( "GKS + evaluate trial U vars NYI" ); \ + break; \ + default: \ + GAUXC_GENERIC_EXCEPTION( "Unexpected KS scheme when attempting to evaluate U vars" ); \ + } + + +#define EVAL_VVARS_KERNEL(xc_approx) \ + cudaStream_t stream = queue.queue_as(); \ + switch ( den_select ) { \ + case DEN_S: \ + eval_vvar_##xc_approx##_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); \ + break; \ + case DEN_Z: \ + eval_vvar_##xc_approx##_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); \ + break; \ + case DEN_Y: \ + eval_vvar_##xc_approx##_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); \ + break; \ + case DEN_X: \ + eval_vvar_##xc_approx##_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); \ + break; \ + default: \ + GAUXC_GENERIC_EXCEPTION( "Unexpected KS scheme when attempting to evaluate V vars" ); \ + } + +// Internal implementation with trial parameter +void eval_tmat_lda( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme, + XCDeviceTask* device_tasks, device_queue queue ) { + dim3 threads( cuda::max_warps_per_thread_block * cuda::warp_size, 1, 1 ); + dim3 blocks( util::div_ceil( npts_max, threads.x ), 1, ntasks ); + EVAL_TMAT_KERNEL(lda); +} + void eval_uvars_lda( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme, XCDeviceTask* device_tasks, device_queue queue ) { dim3 threads( cuda::max_warps_per_thread_block * cuda::warp_size, 1, 1 ); + dim3 blocks( util::div_ceil( npts_max, threads.x ), 1, ntasks ); EVAL_UVARS_KERNEL(lda); } +// Internal 
implementation with trial as template parameter +template +void eval_vvars_lda_impl( size_t ntasks, int32_t nbf_max, int32_t npts_max, density_id den_select, + XCDeviceTask* device_tasks, device_queue queue ) { + dim3 threads( cuda::warp_size, cuda::max_warps_per_thread_block, 1 ); + dim3 blocks( util::div_ceil( nbf_max, threads.x ), + util::div_ceil( npts_max, threads.y ), + ntasks ); + EVAL_VVARS_KERNEL(lda); +} +void eval_vvars_lda( size_t ntasks, int32_t nbf_max, int32_t npts_max, density_id den_select, + XCDeviceTask* device_tasks, device_queue queue ) { + eval_vvars_lda_impl(ntasks, nbf_max, npts_max, den_select, device_tasks, queue); +} +void eval_vvars_lda_trial( size_t ntasks, int32_t nbf_max, int32_t npts_max, density_id den_select, + XCDeviceTask* device_tasks, device_queue queue ) { + eval_vvars_lda_impl(ntasks, nbf_max, npts_max, den_select, device_tasks, queue); +} - +// Internal implementation with trial parameter +void eval_tmat_gga( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme, + XCDeviceTask* device_tasks, device_queue queue ) { + dim3 threads( GGA_KERNEL_SM_WARPS * cuda::warp_size, 1, 1 ); + dim3 blocks( util::div_ceil( npts_max, threads.x ), 1, ntasks ); + EVAL_TMAT_KERNEL(gga); +} void eval_uvars_gga( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme, XCDeviceTask* device_tasks, device_queue queue ) { dim3 threads( GGA_KERNEL_SM_WARPS * cuda::warp_size, 1, 1 ); + dim3 blocks( util::div_ceil( npts_max, threads.x ), 1, ntasks ); EVAL_UVARS_KERNEL(gga); } - -void eval_uvars_mgga( size_t ntasks, size_t npts_total, int32_t nbf_max, - int32_t npts_max, bool do_lapl, XCDeviceTask* device_tasks, - device_queue queue ) { - // TODO: This interface should be unified with the lda/gga interfaces - cudaStream_t stream = queue.queue_as(); - - // U Variables - { - dim3 threads( cuda::warp_size, cuda::max_warps_per_thread_block / 2, 1 ); - dim3 blocks( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )), - std::min(uint64_t(16), util::div_ceil( nbf_max, 16 )), +// Internal implementation with trial as template parameter +template +void eval_vvars_gga_impl( size_t ntasks, int32_t nbf_max, int32_t npts_max, density_id den_select, + XCDeviceTask* device_tasks, device_queue queue ) { + dim3 threads( cuda::warp_size, cuda::max_warps_per_thread_block, 1 ); + dim3 blocks( util::div_ceil( nbf_max, threads.x ), + util::div_ceil( npts_max, threads.y ), ntasks ); - if(do_lapl) - eval_uvars_mgga_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); - else - eval_uvars_mgga_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); - } - - // V variables (GAMMA) - dim3 threads( cuda::max_threads_per_thread_block ); - dim3 blocks( util::div_ceil( npts_total, threads.x ), - 1, - ntasks ); - eval_uvars_gga_rks_kernel <<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); + EVAL_VVARS_KERNEL(gga); +} +void eval_vvars_gga( size_t ntasks, int32_t nbf_max, int32_t npts_max, density_id den_select, + XCDeviceTask* device_tasks, device_queue queue ) { + eval_vvars_gga_impl(ntasks, nbf_max, npts_max, den_select, device_tasks, queue); +} +void eval_vvars_gga_trial( size_t ntasks, int32_t nbf_max, int32_t npts_max, density_id den_select, + XCDeviceTask* device_tasks, device_queue queue ) { + eval_vvars_gga_impl(ntasks, nbf_max, npts_max, den_select, device_tasks, queue); } +// Internal implementation with trial parameter +void eval_tmat_mgga( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme, + bool need_lapl, XCDeviceTask* 
device_tasks, device_queue queue ) { + cudaStream_t stream = queue.queue_as(); + dim3 threads( GGA_KERNEL_SM_WARPS * cuda::warp_size, 1, 1 ); + dim3 blocks( util::div_ceil( npts_max, threads.x ), 1, ntasks ); - - - - -template -__global__ void eval_vvar_grad_kern( size_t ntasks, - XCDeviceTask* tasks_device ) { - - const int batch_idx = blockIdx.z; - if( batch_idx >= ntasks ) return; - - auto& task = tasks_device[ batch_idx ]; - - const auto npts = task.npts; - const auto nbf = task.bfn_screening.nbe; - - double* den_eval_device = nullptr; - double* den_x_eval_device = nullptr; - double* den_y_eval_device = nullptr; - double* den_z_eval_device = nullptr; - - constexpr auto warp_size = cuda::warp_size; - - if constexpr (den_select == DEN_S) { - den_eval_device = task.den_s; - den_x_eval_device = task.dden_sx; - den_y_eval_device = task.dden_sy; - den_z_eval_device = task.dden_sz; + if(need_lapl) { + GAUXC_GENERIC_EXCEPTION("MGGA + LAPL + eval tmat NYI"); } - if constexpr (den_select == DEN_Z) { - den_eval_device = task.den_z; - den_x_eval_device = task.dden_zx; - den_y_eval_device = task.dden_zy; - den_z_eval_device = task.dden_zz; - } - if constexpr (den_select == DEN_Y) { - den_eval_device = task.den_y; - den_x_eval_device = task.dden_yx; - den_y_eval_device = task.dden_yy; - den_z_eval_device = task.dden_yz; - } - if constexpr (den_select == DEN_X) { - den_eval_device = task.den_x; - den_x_eval_device = task.dden_xx; - den_y_eval_device = task.dden_xy; - den_z_eval_device = task.dden_xz; - } - - const auto* basis_eval_device = task.bf; - const auto* dbasis_x_eval_device = task.dbfx; - const auto* dbasis_y_eval_device = task.dbfy; - const auto* dbasis_z_eval_device = task.dbfz; - - const auto* den_basis_prod_device = task.zmat; - - __shared__ double den_shared[4][warp_size][VVAR_KERNEL_SM_BLOCK+1]; - - for ( int bid_x = blockIdx.x * blockDim.x; - bid_x < nbf; - bid_x += blockDim.x * gridDim.x ) { - - for ( int bid_y = blockIdx.y * VVAR_KERNEL_SM_BLOCK; - bid_y < npts; - bid_y += VVAR_KERNEL_SM_BLOCK * gridDim.y ) { - - for (int sm_y = threadIdx.y; sm_y < VVAR_KERNEL_SM_BLOCK; sm_y += blockDim.y) { - den_shared[0][threadIdx.x][sm_y] = 0.; - den_shared[1][threadIdx.x][sm_y] = 0.; - den_shared[2][threadIdx.x][sm_y] = 0.; - den_shared[3][threadIdx.x][sm_y] = 0.; - - if (bid_y + threadIdx.x < npts and bid_x + sm_y < nbf) { - const double* db_col = den_basis_prod_device + (bid_x + sm_y)*npts; - const double* bf_col = basis_eval_device + (bid_x + sm_y)*npts; - const double* bf_x_col = dbasis_x_eval_device + (bid_x + sm_y)*npts; - const double* bf_y_col = dbasis_y_eval_device + (bid_x + sm_y)*npts; - const double* bf_z_col = dbasis_z_eval_device + (bid_x + sm_y)*npts; - - den_shared[0][threadIdx.x][sm_y] = bf_col [ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; - den_shared[1][threadIdx.x][sm_y] = bf_x_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; - den_shared[2][threadIdx.x][sm_y] = bf_y_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; - den_shared[3][threadIdx.x][sm_y] = bf_z_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; - } - } - __syncthreads(); - - - for (int sm_y = threadIdx.y; sm_y < VVAR_KERNEL_SM_BLOCK; sm_y += blockDim.y) { - const int tid_y = bid_y + sm_y; - register double den_reg = den_shared[0][sm_y][threadIdx.x]; - register double dx_reg = den_shared[1][sm_y][threadIdx.x]; - register double dy_reg = den_shared[2][sm_y][threadIdx.x]; - register double dz_reg = den_shared[3][sm_y][threadIdx.x]; - - // Warp blocks are stored col major - 
den_reg = cuda::warp_reduce_sum( den_reg ); - dx_reg = 2. * cuda::warp_reduce_sum( dx_reg ); - dy_reg = 2. * cuda::warp_reduce_sum( dy_reg ); - dz_reg = 2. * cuda::warp_reduce_sum( dz_reg ); - - - if( threadIdx.x == 0 and tid_y < npts ) { - atomicAdd( den_eval_device + tid_y, den_reg ); - atomicAdd( den_x_eval_device + tid_y, dx_reg ); - atomicAdd( den_y_eval_device + tid_y, dy_reg ); - atomicAdd( den_z_eval_device + tid_y, dz_reg ); - } - } - __syncthreads(); - } + if(ks_scheme == RKS) { + eval_tmat_mgga_rks_kernel<<>>(ntasks, device_tasks); + } else if(ks_scheme == UKS) { + eval_tmat_mgga_uks_kernel<<>>(ntasks, device_tasks); + } else { + GAUXC_GENERIC_EXCEPTION("GKS + MGGA + DEVICE NYI"); } - } +void eval_uvars_mgga( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme, + bool need_lapl, XCDeviceTask* device_tasks, device_queue queue ) { + cudaStream_t stream = queue.queue_as(); -template -__global__ void eval_vvar_kern( size_t ntasks, - XCDeviceTask* tasks_device ) { - - const int batch_idx = blockIdx.z; - if( batch_idx >= ntasks ) return; - - auto& task = tasks_device[ batch_idx ]; - - const auto npts = task.npts; - const auto nbf = task.bfn_screening.nbe; - - double* den_eval_device = nullptr; - // use the "U" variable (+/- for UKS) even though at this point the density (S/Z) is stored - if constexpr (den_select == DEN_S) den_eval_device = task.den_s; - if constexpr (den_select == DEN_Z) den_eval_device = task.den_z; - if constexpr (den_select == DEN_Y) den_eval_device = task.den_y; - if constexpr (den_select == DEN_X) den_eval_device = task.den_x; - - const auto* basis_eval_device = task.bf; - - const auto* den_basis_prod_device = task.zmat; + // Evaluate GAMMA + eval_uvars_gga(ntasks, npts_max, ks_scheme, device_tasks, queue); - const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - - register double den_reg = 0.; - - if( tid_x < nbf and tid_y < npts ) { - - const double* bf_col = basis_eval_device + tid_x*npts; - const double* db_col = den_basis_prod_device + tid_x*npts; - - den_reg = bf_col[ tid_y ] * db_col[ tid_y ]; - - } - - // Warp blocks are stored col major - constexpr auto warp_size = cuda::warp_size; - //constexpr auto max_warps_per_thread_block = cuda::max_warps_per_thread_block; - den_reg = cuda::warp_reduce_sum( den_reg ); - - - if( threadIdx.x == 0 and tid_y < npts ) { - atomicAdd( den_eval_device + tid_y, den_reg ); + if(ks_scheme == RKS) { + return; // Nothing left to do + } else if(ks_scheme == UKS) { + dim3 threads( cuda::max_warps_per_thread_block * cuda::warp_size, 1, 1 ); + dim3 blocks( util::div_ceil( npts_max, threads.x ), 1, ntasks ); + if(need_lapl) { + eval_uvars_mgga_uks_kernel<<>>(ntasks, device_tasks); + } else { + eval_uvars_mgga_uks_kernel<<>>(ntasks, device_tasks); + } + } else { + GAUXC_GENERIC_EXCEPTION("GKS + MGGA + DEVICE NYI"); } - } +// Internal implementation with trial as template parameter +template +void eval_vvars_mgga_impl( size_t ntasks, int32_t nbf_max, int32_t npts_max, density_id den_select, + bool need_lapl, XCDeviceTask* device_tasks, device_queue queue ) { + // First evaluate GGA variables + eval_vvars_gga_impl(ntasks, nbf_max, npts_max, den_select, device_tasks, queue); - - -void eval_vvar( size_t ntasks, int32_t nbf_max, int32_t npts_max, bool do_grad, density_id den_select, - XCDeviceTask* device_tasks, device_queue queue ) { + dim3 threads( cuda::warp_size, cuda::max_warps_per_thread_block, 1 ); + dim3 blocks( util::div_ceil( nbf_max, threads.x ), + 
util::div_ceil( npts_max, threads.y ), + ntasks ); cudaStream_t stream = queue.queue_as(); - dim3 threads; - dim3 blocks; - if( do_grad ) { - threads = dim3( cuda::warp_size, cuda::max_warps_per_thread_block / 2, 1 ); - blocks = dim3( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )), - std::min(uint64_t(16), util::div_ceil( nbf_max, 16 )), - ntasks ); - } else { - threads = dim3( cuda::warp_size, cuda::max_warps_per_thread_block, 1 ); - blocks = dim3( util::div_ceil( nbf_max, threads.x ), - util::div_ceil( npts_max, threads.y ), - ntasks ); - } - switch( den_select ) { - case DEN_S: - if (do_grad) eval_vvar_grad_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); - else eval_vvar_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); - break; - case DEN_Z: - if (do_grad) eval_vvar_grad_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); - else eval_vvar_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); - break; - case DEN_Y: - if (do_grad) eval_vvar_grad_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); - else eval_vvar_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); + switch ( den_select ) { + case DEN_S: + if (need_lapl) { + eval_vvar_mgga_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); + } else { + eval_vvar_mgga_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); + } break; - case DEN_X: - if (do_grad) eval_vvar_grad_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); - else eval_vvar_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); + case DEN_Z: + if (need_lapl) { + eval_vvar_mgga_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); + } else { + eval_vvar_mgga_kern<<< blocks, threads, 0, stream >>>( ntasks, device_tasks ); + } break; default: - GAUXC_GENERIC_EXCEPTION( "eval_vvar called with improper density selected" ); + GAUXC_GENERIC_EXCEPTION( "Unexpected KS scheme when attempting to evaluate V vars" ); } - } - - - - +void eval_vvars_mgga( size_t ntasks, int32_t nbf_max, int32_t npts_max, density_id den_select, + bool need_lapl, XCDeviceTask* device_tasks, device_queue queue ) { + eval_vvars_mgga_impl(ntasks, nbf_max, npts_max, den_select, need_lapl, device_tasks, queue); +} +void eval_vvars_mgga_trial( size_t ntasks, int32_t nbf_max, int32_t npts_max, density_id den_select, + bool need_lapl, XCDeviceTask* device_tasks, device_queue queue ) { + eval_vvars_mgga_impl(ntasks, nbf_max, npts_max, den_select, need_lapl, device_tasks, queue); +} } diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars_gga.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars_gga.hpp new file mode 100644 index 00000000..9b466e24 --- /dev/null +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars_gga.hpp @@ -0,0 +1,555 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
+ * + * See LICENSE.txt for details + */ +#pragma once +#include "device_specific/cuda_device_constants.hpp" +#include "device_specific/cuda_util.hpp" +#include "device/xc_device_data.hpp" + +#define VVAR_KERNEL_SM_BLOCK 32 +#define GGA_KERNEL_SM_WARPS 16 + +namespace GauXC { + +template +__global__ void eval_vvar_gga_kern( size_t ntasks, + XCDeviceTask* tasks_device) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + + const auto npts = task.npts; + const auto nbf = task.bfn_screening.nbe; + + double* den_eval_device = nullptr; + double* den_x_eval_device = nullptr; + double* den_y_eval_device = nullptr; + double* den_z_eval_device = nullptr; + + constexpr auto warp_size = cuda::warp_size; + + if constexpr (trial){ + if constexpr (den_select == DEN_S) { + den_eval_device = task.tden_s; + den_x_eval_device = task.tdden_sx; + den_y_eval_device = task.tdden_sy; + den_z_eval_device = task.tdden_sz; + } + if constexpr (den_select == DEN_Z) { + den_eval_device = task.tden_z; + den_x_eval_device = task.tdden_zx; + den_y_eval_device = task.tdden_zy; + den_z_eval_device = task.tdden_zz; + } + if constexpr (den_select == DEN_Y) { + den_eval_device = task.tden_y; + den_x_eval_device = task.tdden_yx; + den_y_eval_device = task.tdden_yy; + den_z_eval_device = task.tdden_yz; + } + if constexpr (den_select == DEN_X) { + den_eval_device = task.tden_x; + den_x_eval_device = task.tdden_xx; + den_y_eval_device = task.tdden_xy; + den_z_eval_device = task.tdden_xz; + } + }else{ + if constexpr (den_select == DEN_S) { + den_eval_device = task.den_s; + den_x_eval_device = task.dden_sx; + den_y_eval_device = task.dden_sy; + den_z_eval_device = task.dden_sz; + } + if constexpr (den_select == DEN_Z) { + den_eval_device = task.den_z; + den_x_eval_device = task.dden_zx; + den_y_eval_device = task.dden_zy; + den_z_eval_device = task.dden_zz; + } + if constexpr (den_select == DEN_Y) { + den_eval_device = task.den_y; + den_x_eval_device = task.dden_yx; + den_y_eval_device = task.dden_yy; + den_z_eval_device = task.dden_yz; + } + if constexpr (den_select == DEN_X) { + den_eval_device = task.den_x; + den_x_eval_device = task.dden_xx; + den_y_eval_device = task.dden_xy; + den_z_eval_device = task.dden_xz; + } + } + + const auto* basis_eval_device = task.bf; + const auto* dbasis_x_eval_device = task.dbfx; + const auto* dbasis_y_eval_device = task.dbfy; + const auto* dbasis_z_eval_device = task.dbfz; + + const auto* den_basis_prod_device = task.zmat; + + __shared__ double den_shared[4][warp_size][VVAR_KERNEL_SM_BLOCK+1]; + + for ( int bid_x = blockIdx.x * blockDim.x; + bid_x < nbf; + bid_x += blockDim.x * gridDim.x ) { + + for ( int bid_y = blockIdx.y * VVAR_KERNEL_SM_BLOCK; + bid_y < npts; + bid_y += VVAR_KERNEL_SM_BLOCK * gridDim.y ) { + + for (int sm_y = threadIdx.y; sm_y < VVAR_KERNEL_SM_BLOCK; sm_y += blockDim.y) { + den_shared[0][threadIdx.x][sm_y] = 0.; + den_shared[1][threadIdx.x][sm_y] = 0.; + den_shared[2][threadIdx.x][sm_y] = 0.; + den_shared[3][threadIdx.x][sm_y] = 0.; + + if (bid_y + threadIdx.x < npts and bid_x + sm_y < nbf) { + const double* db_col = den_basis_prod_device + (bid_x + sm_y)*npts; + const double* bf_col = basis_eval_device + (bid_x + sm_y)*npts; + const double* bf_x_col = dbasis_x_eval_device + (bid_x + sm_y)*npts; + const double* bf_y_col = dbasis_y_eval_device + (bid_x + sm_y)*npts; + const double* bf_z_col = dbasis_z_eval_device + (bid_x + sm_y)*npts; + + den_shared[0][threadIdx.x][sm_y] = bf_col [ bid_y + threadIdx.x ] * 
db_col[ bid_y + threadIdx.x ]; + den_shared[1][threadIdx.x][sm_y] = bf_x_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; + den_shared[2][threadIdx.x][sm_y] = bf_y_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; + den_shared[3][threadIdx.x][sm_y] = bf_z_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; + } + } + __syncthreads(); + + + for (int sm_y = threadIdx.y; sm_y < VVAR_KERNEL_SM_BLOCK; sm_y += blockDim.y) { + const int tid_y = bid_y + sm_y; + register double den_reg = den_shared[0][sm_y][threadIdx.x]; + register double dx_reg = den_shared[1][sm_y][threadIdx.x]; + register double dy_reg = den_shared[2][sm_y][threadIdx.x]; + register double dz_reg = den_shared[3][sm_y][threadIdx.x]; + + // Warp blocks are stored col major + den_reg = cuda::warp_reduce_sum( den_reg ); + dx_reg = 2. * cuda::warp_reduce_sum( dx_reg ); + dy_reg = 2. * cuda::warp_reduce_sum( dy_reg ); + dz_reg = 2. * cuda::warp_reduce_sum( dz_reg ); + + + if( threadIdx.x == 0 and tid_y < npts ) { + atomicAdd( den_eval_device + tid_y, den_reg ); + atomicAdd( den_x_eval_device + tid_y, dx_reg ); + atomicAdd( den_y_eval_device + tid_y, dy_reg ); + atomicAdd( den_z_eval_device + tid_y, dz_reg ); + } + } + __syncthreads(); + } + } + +} + +__global__ void eval_uvars_gga_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + const auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + + const auto* dden_sx_eval_device = task.dden_sx; + const auto* dden_sy_eval_device = task.dden_sy; + const auto* dden_sz_eval_device = task.dden_sz; + auto* gamma_eval_device = task.gamma; + + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + + if( tid < npts ) { + const double dx = dden_sx_eval_device[ tid ]; + const double dy = dden_sy_eval_device[ tid ]; + const double dz = dden_sz_eval_device[ tid ]; + + gamma_eval_device[ tid ] = dx*dx + dy*dy + dz*dz; + } + +} + +__global__ void eval_tmat_gga_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + const auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + + const auto* dden_sx_eval_device = task.dden_sx; + const auto* dden_sy_eval_device = task.dden_sy; + const auto* dden_sz_eval_device = task.dden_sz; + const auto* tdden_sx_eval_device = task.tdden_sx; + const auto* tdden_sy_eval_device = task.tdden_sy; + const auto* tdden_sz_eval_device = task.tdden_sz; + + const auto* weight_device = task.weights; + const auto* vgamma_device = task.vgamma; + const auto* v2rho2_device = task.v2rho2; + const auto* v2rhogamma_device = task.v2rhogamma; + const auto* v2gamma2_device = task.v2gamma2; + const auto* trho_device = task.tden_s; + + auto* FXC_A_device = task.FXC_A_s; + auto* FXC_Bx_device = task.FXC_Bx_s; + auto* FXC_By_device = task.FXC_By_s; + auto* FXC_Bz_device = task.FXC_Bz_s; + + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + + if( tid < npts ) { + const auto dx = dden_sx_eval_device[ tid ]; + const auto dy = dden_sy_eval_device[ tid ]; + const auto dz = dden_sz_eval_device[ tid ]; + const auto tdx = tdden_sx_eval_device[ tid ]; + const auto tdy = tdden_sy_eval_device[ tid ]; + const auto tdz = tdden_sz_eval_device[ tid ]; + const auto tgamma = tdx*dx + tdy*dy + tdz*dz; + + const auto FXC_A = v2rho2_device[ tid ] * trho_device[ tid ] + 2.0 * v2rhogamma_device[tid] * tgamma; + const auto B_coef = v2rhogamma_device[tid] * trho_device[tid] + 2.0 * 
v2gamma2_device[tid] * tgamma; + FXC_A_device[ tid ] = weight_device[ tid ] * FXC_A ; + FXC_Bx_device[ tid ] = 2.0 * weight_device[ tid ] * ( B_coef * dx + vgamma_device[ tid ] * tdx ); + FXC_By_device[ tid ] = 2.0 * weight_device[ tid ] * ( B_coef * dy + vgamma_device[ tid ] * tdy ); + FXC_Bz_device[ tid ] = 2.0 * weight_device[ tid ] * ( B_coef * dz + vgamma_device[ tid ] * tdz ); + } + +} + +__global__ void eval_uvars_gga_uks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + const auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + + auto* den_pos_eval_device = task.den_s; + const auto* den_pos_x_eval_device = task.dden_sx; + const auto* den_pos_y_eval_device = task.dden_sy; + const auto* den_pos_z_eval_device = task.dden_sz; + + auto* den_neg_eval_device = task.den_z; + const auto* den_neg_x_eval_device = task.dden_zx; + const auto* den_neg_y_eval_device = task.dden_zy; + const auto* den_neg_z_eval_device = task.dden_zz; + + auto* gamma_pp_eval_device = task.gamma_pp; + auto* gamma_pm_eval_device = task.gamma_pm; + auto* gamma_mm_eval_device = task.gamma_mm; + + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if( tid < npts ) { + const double ps = den_pos_eval_device[ tid ]; + const double pz = den_neg_eval_device[ tid ]; + const double dndx = den_pos_x_eval_device[ tid ]; + const double dndy = den_pos_y_eval_device[ tid ]; + const double dndz = den_pos_z_eval_device[ tid ]; + const double dMzdx = den_neg_x_eval_device[ tid ]; + const double dMzdy = den_neg_y_eval_device[ tid ]; + const double dMzdz = den_neg_z_eval_device[ tid ]; + + // (del n).(del n) + const auto dn_sq = dndx*dndx + dndy*dndy + dndz*dndz; + // (del Mz).(del Mz) + const auto dMz_sq = dMzdx*dMzdx + dMzdy*dMzdy + dMzdz*dMzdz; + // (del n).(del Mz) + const auto dn_dMz = dndx*dMzdx + dndy*dMzdy + dndz*dMzdz; + + gamma_pp_eval_device[ tid ] = 0.25*(dn_sq + dMz_sq) + 0.5*dn_dMz; + gamma_pm_eval_device[ tid ] = 0.25*(dn_sq - dMz_sq); + gamma_mm_eval_device[ tid ] = 0.25*(dn_sq + dMz_sq) - 0.5*dn_dMz; + + den_pos_eval_device[ tid ] = 0.5*(ps + pz); + den_neg_eval_device[ tid ] = 0.5*(ps - pz); + } + +} + +__global__ void eval_tmat_gga_uks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + const auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + + const auto* tden_s_device = task.tden_s; + const auto* tden_z_device = task.tden_z; + const auto* weight_device = task.weights; + + const auto* tden_pos_x_eval_device = task.tdden_sx; + const auto* tden_pos_y_eval_device = task.tdden_sy; + const auto* tden_pos_z_eval_device = task.tdden_sz; + const auto* den_pos_x_eval_device = task.dden_sx; + const auto* den_pos_y_eval_device = task.dden_sy; + const auto* den_pos_z_eval_device = task.dden_sz; + + const auto* tden_neg_x_eval_device = task.tdden_zx; + const auto* tden_neg_y_eval_device = task.tdden_zy; + const auto* tden_neg_z_eval_device = task.tdden_zz; + const auto* den_neg_x_eval_device = task.dden_zx; + const auto* den_neg_y_eval_device = task.dden_zy; + const auto* den_neg_z_eval_device = task.dden_zz; + + const auto* vgamma_aa_device = task.vgamma_pp; + const auto* vgamma_ab_device = task.vgamma_pm; + const auto* vgamma_bb_device = task.vgamma_mm; + const auto* v2rho2_a_a_device = task.v2rho2_a_a; + const auto* v2rho2_a_b_device = task.v2rho2_a_b; + const auto* v2rho2_b_b_device = task.v2rho2_b_b; + const auto* 
v2rhogamma_a_aa_device = task.v2rhogamma_a_aa; + const auto* v2rhogamma_a_ab_device = task.v2rhogamma_a_ab; + const auto* v2rhogamma_a_bb_device = task.v2rhogamma_a_bb; + const auto* v2rhogamma_b_aa_device = task.v2rhogamma_b_aa; + const auto* v2rhogamma_b_ab_device = task.v2rhogamma_b_ab; + const auto* v2rhogamma_b_bb_device = task.v2rhogamma_b_bb; + const auto* v2gamma2_aa_aa_device = task.v2gamma2_aa_aa; + const auto* v2gamma2_aa_ab_device = task.v2gamma2_aa_ab; + const auto* v2gamma2_aa_bb_device = task.v2gamma2_aa_bb; + const auto* v2gamma2_ab_ab_device = task.v2gamma2_ab_ab; + const auto* v2gamma2_ab_bb_device = task.v2gamma2_ab_bb; + const auto* v2gamma2_bb_bb_device = task.v2gamma2_bb_bb; + + auto* FXC_A_s_device = task.FXC_A_s; + auto* FXC_A_z_device = task.FXC_A_z; + auto* FXC_Bx_s_device = task.FXC_Bx_s; + auto* FXC_Bx_z_device = task.FXC_Bx_z; + auto* FXC_By_s_device = task.FXC_By_s; + auto* FXC_By_z_device = task.FXC_By_z; + auto* FXC_Bz_s_device = task.FXC_Bz_s; + auto* FXC_Bz_z_device = task.FXC_Bz_z; + + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if( tid < npts ) { + const auto ps = tden_s_device[ tid ]; + const auto pz = tden_z_device[ tid ]; + const auto trho_a_device = 0.5*(ps + pz); + const auto trho_b_device = 0.5*(ps - pz); + + const auto tdndx = tden_pos_x_eval_device[ tid ]; + const auto tdndy = tden_pos_y_eval_device[ tid ]; + const auto tdndz = tden_pos_z_eval_device[ tid ]; + const auto tdMzdx = tden_neg_x_eval_device[ tid ]; + const auto tdMzdy = tden_neg_y_eval_device[ tid ]; + const auto tdMzdz = tden_neg_z_eval_device[ tid ]; + const auto tdden_a_x = 0.5*(tdndx + tdMzdx); + const auto tdden_a_y = 0.5*(tdndy + tdMzdy); + const auto tdden_a_z = 0.5*(tdndz + tdMzdz); + const auto tdden_b_x = 0.5*(tdndx - tdMzdx); + const auto tdden_b_y = 0.5*(tdndy - tdMzdy); + const auto tdden_b_z = 0.5*(tdndz - tdMzdz); + + const auto dndx = den_pos_x_eval_device[ tid ]; + const auto dndy = den_pos_y_eval_device[ tid ]; + const auto dndz = den_pos_z_eval_device[ tid ]; + const auto dMzdx = den_neg_x_eval_device[ tid ]; + const auto dMzdy = den_neg_y_eval_device[ tid ]; + const auto dMzdz = den_neg_z_eval_device[ tid ]; + const auto dden_a_x = 0.5*(dndx + dMzdx); + const auto dden_a_y = 0.5*(dndy + dMzdy); + const auto dden_a_z = 0.5*(dndz + dMzdz); + const auto dden_b_x = 0.5*(dndx - dMzdx); + const auto dden_b_y = 0.5*(dndy - dMzdy); + const auto dden_b_z = 0.5*(dndz - dMzdz); + + const auto tgamma_pp = tdden_a_x * dden_a_x + tdden_a_y * dden_a_y + tdden_a_z * dden_a_z; + const auto tgamma_pm = tdden_a_x * dden_b_x + tdden_a_y * dden_b_y + tdden_a_z * dden_b_z + + tdden_b_x * dden_a_x + tdden_b_y * dden_a_y + tdden_b_z * dden_a_z; + const auto tgamma_mm = tdden_b_x * dden_b_x + tdden_b_y * dden_b_y + tdden_b_z * dden_b_z; + + + const auto A_a = v2rho2_a_a_device[tid] * trho_a_device + 2.0 * v2rhogamma_a_aa_device[tid] * tgamma_pp + + v2rhogamma_a_ab_device[tid] * tgamma_pm + 2.0 * v2rhogamma_a_bb_device[tid] * tgamma_mm + + v2rho2_a_b_device[tid] * trho_b_device; + const auto A_b = v2rho2_b_b_device[tid] * trho_b_device + 2.0 * v2rhogamma_b_bb_device[tid] * tgamma_mm + + v2rhogamma_b_ab_device[tid] * tgamma_pm + 2.0 * v2rhogamma_b_aa_device[tid] * tgamma_pp + + v2rho2_a_b_device[tid] * trho_a_device; + FXC_A_s_device[ tid ] = 0.5 * weight_device[ tid ] * (A_a + A_b); + FXC_A_z_device[ tid ] = 0.5 * weight_device[ tid ] * (A_a - A_b); + // Calculate B coefficients for alpha spin + const double B_coef1_a = v2rhogamma_a_aa_device[tid] * trho_a_device + 2.0 * 
v2gamma2_aa_aa_device[tid] * tgamma_pp + + v2gamma2_aa_ab_device[tid] * tgamma_pm + 2.0 * v2gamma2_aa_bb_device[tid] * tgamma_mm + + v2rhogamma_b_aa_device[tid] * trho_b_device; + + const double B_coef2_a = v2rhogamma_a_ab_device[tid] * trho_a_device + 2.0 * v2gamma2_aa_ab_device[tid] * tgamma_pp + + v2gamma2_ab_ab_device[tid] * tgamma_pm + 2.0 * v2gamma2_ab_bb_device[tid] * tgamma_mm + + v2rhogamma_b_ab_device[tid] * trho_b_device; + + // Calculate gradient components for alpha spin + const double Bx_a = 2.0 * B_coef1_a * dden_a_x + B_coef2_a * dden_b_x + + 2.0 * vgamma_aa_device[tid] * tdden_a_x + vgamma_ab_device[tid] * tdden_b_x; + + const double By_a = 2.0 * B_coef1_a * dden_a_y + B_coef2_a * dden_b_y + + 2.0 * vgamma_aa_device[tid] * tdden_a_y + vgamma_ab_device[tid] * tdden_b_y; + + const double Bz_a = 2.0 * B_coef1_a * dden_a_z + B_coef2_a * dden_b_z + + 2.0 * vgamma_aa_device[tid] * tdden_a_z + vgamma_ab_device[tid] * tdden_b_z; + + // Calculate B coefficients for beta spin + const double B_coef1_b = v2rhogamma_b_bb_device[tid] * trho_b_device + 2.0 * v2gamma2_bb_bb_device[tid] * tgamma_mm + + v2gamma2_ab_bb_device[tid] * tgamma_pm + 2.0 * v2gamma2_aa_bb_device[tid] * tgamma_pp + + v2rhogamma_a_bb_device[tid] * trho_a_device; + + const double B_coef2_b = v2rhogamma_b_ab_device[tid] * trho_b_device + 2.0 * v2gamma2_ab_bb_device[tid] * tgamma_mm + + v2gamma2_ab_ab_device[tid] * tgamma_pm + 2.0 * v2gamma2_aa_ab_device[tid] * tgamma_pp + + v2rhogamma_a_ab_device[tid] * trho_a_device; + + const double Bx_b = 2.0 * B_coef1_b * dden_b_x + B_coef2_b * dden_a_x + + 2.0 * vgamma_bb_device[tid] * tdden_b_x + vgamma_ab_device[tid] * tdden_a_x; + + const double By_b = 2.0 * B_coef1_b * dden_b_y + B_coef2_b * dden_a_y + + 2.0 * vgamma_bb_device[tid] * tdden_b_y + vgamma_ab_device[tid] * tdden_a_y; + + const double Bz_b = 2.0 * B_coef1_b * dden_b_z + B_coef2_b * dden_a_z + + 2.0 * vgamma_bb_device[tid] * tdden_b_z + vgamma_ab_device[tid] * tdden_a_z; + + FXC_Bx_s_device[tid] = 0.5 * weight_device[tid] * (Bx_a + Bx_b); + FXC_By_s_device[tid] = 0.5 * weight_device[tid] * (By_a + By_b); + FXC_Bz_s_device[tid] = 0.5 * weight_device[tid] * (Bz_a + Bz_b); + FXC_Bx_z_device[tid] = 0.5 * weight_device[tid] * (Bx_a - Bx_b); + FXC_By_z_device[tid] = 0.5 * weight_device[tid] * (By_a - By_b); + FXC_Bz_z_device[tid] = 0.5 * weight_device[tid] * (Bz_a - Bz_b); + + + } + +} + +__global__ void eval_uvars_gga_gks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + const auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + + auto* den_s_eval_device = task.den_s; + const auto* dden_sx_eval_device = task.dden_sx; + const auto* dden_sy_eval_device = task.dden_sy; + const auto* dden_sz_eval_device = task.dden_sz; + + auto* den_z_eval_device = task.den_z; + const auto* dden_zx_eval_device = task.dden_zx; + const auto* dden_zy_eval_device = task.dden_zy; + const auto* dden_zz_eval_device = task.dden_zz; + + const auto* den_y_eval_device = task.den_y; + const auto* dden_yx_eval_device = task.dden_yx; + const auto* dden_yy_eval_device = task.dden_yy; + const auto* dden_yz_eval_device = task.dden_yz; + + const auto* den_x_eval_device = task.den_x; + const auto* dden_xx_eval_device = task.dden_xx; + const auto* dden_xy_eval_device = task.dden_xy; + const auto* dden_xz_eval_device = task.dden_xz; + + auto* gamma_pp_eval_device = task.gamma_pp; + auto* gamma_pm_eval_device = task.gamma_pm; + auto* gamma_mm_eval_device = 
task.gamma_mm; + + auto* H_z_eval_device = task.H_z; + auto* H_y_eval_device = task.H_y; + auto* H_x_eval_device = task.H_x; + auto* K_z_eval_device = task.K_z; + auto* K_y_eval_device = task.K_y; + auto* K_x_eval_device = task.K_x; + + const double dtolsq = 1e-24; // TODO: make variable + + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if( tid < npts ) { + const double dndz = dden_sz_eval_device[ tid ]; + const double dndy = dden_sy_eval_device[ tid ]; + const double dndx = dden_sx_eval_device[ tid ]; + + const double dMzdz = dden_zz_eval_device[ tid ]; + const double dMzdy = dden_zy_eval_device[ tid ]; + const double dMzdx = dden_zx_eval_device[ tid ]; + + const double dMydz = dden_yz_eval_device[ tid ]; + const double dMydy = dden_yy_eval_device[ tid ]; + const double dMydx = dden_yx_eval_device[ tid ]; + + const double dMxdz = dden_xz_eval_device[ tid ]; + const double dMxdy = dden_xy_eval_device[ tid ]; + const double dMxdx = dden_xx_eval_device[ tid ]; + + const auto ps = den_s_eval_device[ tid ]; + const auto pz = den_z_eval_device[ tid ]; + const auto py = den_y_eval_device[ tid ]; + const auto px = den_x_eval_device[ tid ]; + + const auto mtemp = pz*pz + px*px + py*py; + double mnorm = 0.; + + const auto dels_dot_dels = dndx * dndx + dndy * dndy + dndz * dndz; + const auto delz_dot_delz = dMzdx * dMzdx + dMzdy * dMzdy + dMzdz * dMzdz; + const auto delx_dot_delx = dMxdx * dMxdx + dMxdy * dMxdy + dMxdz * dMxdz; + const auto dely_dot_dely = dMydx * dMydx + dMydy * dMydy + dMydz * dMydz; + + const auto dels_dot_delz = dndx * dMzdx + dndy * dMzdy + dndz * dMzdz; + const auto dels_dot_delx = dndx * dMxdx + dndy * dMxdy + dndz * dMxdz; + const auto dels_dot_dely = dndx * dMydx + dndy * dMydy + dndz * dMydz; + + const auto sum = delz_dot_delz + delx_dot_delx + dely_dot_dely; + const auto s_sum = + dels_dot_delz * pz + dels_dot_delx * px + dels_dot_dely * py; + + const auto inv_sqsum2 = + rsqrt(dels_dot_delz * dels_dot_delz + dels_dot_delx * dels_dot_delx + + dels_dot_dely * dels_dot_dely); + const auto sqsum2 = 1./inv_sqsum2; + + double sign = 1.; + if( signbit(s_sum)) + sign = -1.; + + + if (mtemp > dtolsq) { + const double inv_mnorm = rsqrt(mtemp); + mnorm = 1./inv_mnorm; + K_z_eval_device[ tid ] = pz * inv_mnorm; + K_y_eval_device[ tid ] = py * inv_mnorm; + K_x_eval_device[ tid ] = px * inv_mnorm; + H_z_eval_device[ tid ] = sign * dels_dot_delz * inv_sqsum2; + H_y_eval_device[ tid ] = sign * dels_dot_dely * inv_sqsum2; + H_x_eval_device[ tid ] = sign * dels_dot_delx * inv_sqsum2; + } + else { + mnorm = (1. / 3.) * (px + py + pz); + K_z_eval_device[ tid ] = 1. / 3.; + K_y_eval_device[ tid ] = 1. / 3.; + K_x_eval_device[ tid ] = 1. 
/ 3.; + + H_z_eval_device[ tid ] = sign / 3.; + H_y_eval_device[ tid ] = sign / 3.; + H_x_eval_device[ tid ] = sign / 3.; + } + + gamma_pp_eval_device[ tid ] = 0.25*(dels_dot_dels + sum) + 0.5*sign*sqsum2; + gamma_pm_eval_device[ tid ] = 0.25*(dels_dot_dels - sum); + gamma_mm_eval_device[ tid ] = 0.25*(dels_dot_dels + sum) - 0.5*sign*sqsum2; + + den_s_eval_device[ tid ] = 0.5*(ps + mnorm); + den_z_eval_device[ tid ] = 0.5*(ps - mnorm); + + } + +} + +} // namespace GauXC diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars_lda.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars_lda.hpp new file mode 100644 index 00000000..54dc5043 --- /dev/null +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars_lda.hpp @@ -0,0 +1,208 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#pragma once +#include "device_specific/cuda_device_constants.hpp" +#include "device_specific/cuda_util.hpp" +#include "device/xc_device_data.hpp" + +namespace GauXC { + +template +__global__ void eval_vvar_lda_kern( size_t ntasks, + XCDeviceTask* tasks_device) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + + const auto npts = task.npts; + const auto nbf = task.bfn_screening.nbe; + + double* den_eval_device = nullptr; + // use the "U" variable (+/- for UKS) even though at this point the density (S/Z) is stored + if constexpr (trial){ + if constexpr (den_select == DEN_S) den_eval_device = task.tden_s; + if constexpr (den_select == DEN_Z) den_eval_device = task.tden_z; + if constexpr (den_select == DEN_Y) den_eval_device = task.tden_y; + if constexpr (den_select == DEN_X) den_eval_device = task.tden_x; + }else{ + if constexpr (den_select == DEN_S) den_eval_device = task.den_s; + if constexpr (den_select == DEN_Z) den_eval_device = task.den_z; + if constexpr (den_select == DEN_Y) den_eval_device = task.den_y; + if constexpr (den_select == DEN_X) den_eval_device = task.den_x; + } + + const auto* basis_eval_device = task.bf; + + const auto* den_basis_prod_device = task.zmat; + + const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + + register double den_reg = 0.; + + if( tid_x < nbf and tid_y < npts ) { + + const double* bf_col = basis_eval_device + tid_x*npts; + const double* db_col = den_basis_prod_device + tid_x*npts; + + den_reg = bf_col[ tid_y ] * db_col[ tid_y ]; + + } + + // Warp blocks are stored col major + constexpr auto warp_size = cuda::warp_size; + //constexpr auto max_warps_per_thread_block = cuda::max_warps_per_thread_block; + den_reg = cuda::warp_reduce_sum( den_reg ); + + + if( threadIdx.x == 0 and tid_y < npts ) { + atomicAdd( den_eval_device + tid_y, den_reg ); + } + +} + +__global__ void eval_uvars_lda_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { + // eval_vvars populated uvar storage already in the case of LDA+RKS + return; +} +__global__ void eval_tmat_lda_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + const auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + + const auto* v2rho2_device = task.v2rho2; + const auto* 
weight_device = task.weights; + auto* tden_s_eval_device = task.tden_s; + auto* FXC_A_device = task.FXC_A_s; + + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + if( tid < npts ) { + FXC_A_device[ tid ] = v2rho2_device[ tid ] * tden_s_eval_device[ tid ] * weight_device[ tid ]; + } + + return; +} + + +__global__ void eval_uvars_lda_uks_kernel( size_t ntasks, + XCDeviceTask* tasks_device ) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + + const auto npts = task.npts; + + auto* den_pos_eval_device = task.den_s; + auto* den_neg_eval_device = task.den_z; + + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if( tid < npts ) { + const auto ps = den_pos_eval_device[ tid ]; + const auto pz = den_neg_eval_device[ tid ]; + den_pos_eval_device[ tid ] = 0.5*(ps + pz); + den_neg_eval_device[ tid ] = 0.5*(ps - pz); + } +} + +__global__ void eval_tmat_lda_uks_kernel( size_t ntasks, + XCDeviceTask* tasks_device ) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + + const auto npts = task.npts; + + auto* tden_s_device = task.tden_s; + auto* tden_z_device = task.tden_z; + auto* FXC_A_s_device = task.FXC_A_s; + auto* FXC_A_z_device = task.FXC_A_z; + const auto* weight_device = task.weights; + + const auto* v2rho2_a_a_device = task.v2rho2_a_a; + const auto* v2rho2_a_b_device = task.v2rho2_a_b; + const auto* v2rho2_b_b_device = task.v2rho2_b_b; + + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if( tid < npts ) { + const auto ps = tden_s_device[ tid ]; + const auto pz = tden_z_device[ tid ]; + const auto trho_a_device = 0.5*(ps + pz); + const auto trho_b_device = 0.5*(ps - pz); + const auto A_a = v2rho2_a_a_device[tid] * trho_a_device + v2rho2_a_b_device[tid] * trho_b_device; + const auto A_b = v2rho2_b_b_device[tid] * trho_b_device + v2rho2_a_b_device[tid] * trho_a_device; + FXC_A_s_device[ tid ] = 0.5 * weight_device[ tid ] * (A_a + A_b); + FXC_A_z_device[ tid ] = 0.5 * weight_device[ tid ] * (A_a - A_b); + } +} + +__global__ void eval_uvars_lda_gks_kernel( size_t ntasks, + XCDeviceTask* tasks_device ) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + + const auto npts = task.npts; + + auto* den_z_eval_device = task.den_s; + auto* den_s_eval_device = task.den_z; + auto* den_y_eval_device = task.den_y; + auto* den_x_eval_device = task.den_x; + auto* K_z_eval_device = task.K_z; + auto* K_y_eval_device = task.K_y; + auto* K_x_eval_device = task.K_x; + const double dtolsq = 1e-24; // TODO: make variable + + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + + + if( tid < npts ) { + const auto ps = den_s_eval_device[ tid ]; + const auto pz = den_z_eval_device[ tid ]; + const auto py = den_y_eval_device[ tid ]; + const auto px = den_x_eval_device[ tid ]; + const auto mtemp = pz*pz + px*px + py*py; + double mnorm = 0.; + + if (mtemp > dtolsq) { + const double inv_mnorm = rsqrt(mtemp); + mnorm = 1./inv_mnorm; + K_z_eval_device[ tid ] = pz * inv_mnorm; + K_y_eval_device[ tid ] = py * inv_mnorm; + K_x_eval_device[ tid ] = px * inv_mnorm; + } + else { + mnorm = (1. / 3.) * (px + py + pz); + K_z_eval_device[ tid ] = 1. / 3.; + K_y_eval_device[ tid ] = 1. / 3.; + K_x_eval_device[ tid ] = 1. 
/ 3.; + } + + den_s_eval_device[ tid ] = 0.5*(ps + mnorm); + den_z_eval_device[ tid ] = 0.5*(ps - mnorm); + + } +} + +} diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars_mgga.hpp b/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars_mgga.hpp new file mode 100644 index 00000000..82b5207e --- /dev/null +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars_mgga.hpp @@ -0,0 +1,455 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#pragma once +#include "device_specific/cuda_device_constants.hpp" +#include "device_specific/cuda_util.hpp" +#include "device/xc_device_data.hpp" + +#define MGGA_KERNEL_SM_BLOCK 32 + +namespace GauXC { + + + +template +__global__ void eval_vvar_mgga_kern( size_t ntasks, + XCDeviceTask* tasks_device) { + + constexpr auto warp_size = cuda::warp_size; + //constexpr auto max_warps_per_thread_block = cuda::max_warps_per_thread_block; + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + + const auto npts = task.npts; + const auto nbf = task.bfn_screening.nbe; + double* tau_eval_device = nullptr; + double* lapl_eval_device = nullptr; + + if constexpr (trial){ + if constexpr (den_select == DEN_S) { + tau_eval_device = task.ttau_s; + if constexpr (need_lapl) { + lapl_eval_device = task.tlapl_s; + } + } + if constexpr (den_select == DEN_Z) { + tau_eval_device = task.ttau_z; + if constexpr (need_lapl) { + lapl_eval_device = task.tlapl_z; + } + } + } else{ + if constexpr (den_select == DEN_S) { + tau_eval_device = task.tau_s; + if constexpr (need_lapl) { + lapl_eval_device = task.lapl_s; + } + } + if constexpr (den_select == DEN_Z) { + tau_eval_device = task.tau_z; + if constexpr (need_lapl) { + lapl_eval_device = task.lapl_z; + } + } + } + + //const auto* basis_eval_device = task.bf; + const auto* dbasis_x_eval_device = task.dbfx; + const auto* dbasis_y_eval_device = task.dbfy; + const auto* dbasis_z_eval_device = task.dbfz; + decltype(dbasis_x_eval_device) basis_lapl_eval_device = nullptr; + if constexpr (need_lapl) { + basis_lapl_eval_device = task.d2bflapl; + } + + //const auto* den_basis_prod_device = task.zmat; + const auto* den_basis_dx_prod_device = task.xmat_x; + const auto* den_basis_dy_prod_device = task.xmat_y; + const auto* den_basis_dz_prod_device = task.xmat_z; + decltype(den_basis_dx_prod_device) den_basis_prod_device = nullptr; + if constexpr (need_lapl) { + den_basis_prod_device = task.zmat; + } + + __shared__ double den_shared[3+!!need_lapl][warp_size][MGGA_KERNEL_SM_BLOCK+1]; + + for ( int bid_x = blockIdx.x * blockDim.x; + bid_x < nbf; + bid_x += blockDim.x * gridDim.x ) { + + for ( int bid_y = blockIdx.y * MGGA_KERNEL_SM_BLOCK; + bid_y < npts; + bid_y += MGGA_KERNEL_SM_BLOCK * gridDim.y ) { + + for (int sm_y = threadIdx.y; sm_y < MGGA_KERNEL_SM_BLOCK; sm_y += blockDim.y) { + den_shared[0][threadIdx.x][sm_y] = 0.; + den_shared[1][threadIdx.x][sm_y] = 0.; + den_shared[2][threadIdx.x][sm_y] = 0.; + if constexpr (need_lapl) + den_shared[3][threadIdx.x][sm_y] = 0.; + + if (bid_y + threadIdx.x < npts and bid_x + sm_y < nbf) { + const double* db_x_col = den_basis_dx_prod_device + (bid_x + sm_y)*npts; + const double* db_y_col = den_basis_dy_prod_device + (bid_x + 
sm_y)*npts; + const double* db_z_col = den_basis_dz_prod_device + (bid_x + sm_y)*npts; + + const double* bf_x_col = dbasis_x_eval_device + (bid_x + sm_y)*npts; + const double* bf_y_col = dbasis_y_eval_device + (bid_x + sm_y)*npts; + const double* bf_z_col = dbasis_z_eval_device + (bid_x + sm_y)*npts; + + + den_shared[0][threadIdx.x][sm_y] = bf_x_col[ bid_y + threadIdx.x ] * db_x_col[ bid_y + threadIdx.x ]; + den_shared[1][threadIdx.x][sm_y] = bf_y_col[ bid_y + threadIdx.x ] * db_y_col[ bid_y + threadIdx.x ]; + den_shared[2][threadIdx.x][sm_y] = bf_z_col[ bid_y + threadIdx.x ] * db_z_col[ bid_y + threadIdx.x ]; + + + if constexpr (need_lapl) { + const double* db_col = den_basis_prod_device + (bid_x + sm_y)*npts; + const double* bf_l_col = basis_lapl_eval_device + (bid_x + sm_y)*npts; + den_shared[3][threadIdx.x][sm_y] = bf_l_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ]; + } + } + } + __syncthreads(); + + + for (int sm_y = threadIdx.y; sm_y < MGGA_KERNEL_SM_BLOCK; sm_y += blockDim.y) { + const int tid_y = bid_y + sm_y; + + register double tx_reg = den_shared[0][sm_y][threadIdx.x]; + register double ty_reg = den_shared[1][sm_y][threadIdx.x]; + register double tz_reg = den_shared[2][sm_y][threadIdx.x]; + // Warp blocks are stored col major + register double tau_reg = 0.0; + tau_reg = 0.5 * cuda::warp_reduce_sum( tx_reg ); + tau_reg += 0.5 * cuda::warp_reduce_sum( ty_reg ); + tau_reg += 0.5 * cuda::warp_reduce_sum( tz_reg ); + + register double lapl_reg = 0.0; + if constexpr (need_lapl) { + lapl_reg = den_shared[3][sm_y][threadIdx.x]; + lapl_reg = cuda::warp_reduce_sum(lapl_reg); + lapl_reg = 2. * lapl_reg + 4. * tau_reg; + } + + if( threadIdx.x == 0 and tid_y < npts ) { + atomicAdd( tau_eval_device + tid_y, tau_reg ); + if constexpr (need_lapl) { + atomicAdd( lapl_eval_device + tid_y, lapl_reg ); + } + } + } + __syncthreads(); + } + } +} + + + + +template +__global__ void eval_uvars_mgga_uks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + const auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + + auto* tau_pos_eval_device = task.tau_s; + auto* tau_neg_eval_device = task.tau_z; + + double* lapl_pos_eval_device = nullptr; + double* lapl_neg_eval_device = nullptr; + if constexpr (need_lapl) { + lapl_pos_eval_device = task.lapl_s; + lapl_neg_eval_device = task.lapl_z; + } + + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if( tid < npts ) { + const double ts = tau_pos_eval_device[ tid ]; + const double tz = tau_neg_eval_device[ tid ]; + tau_pos_eval_device[ tid ] = 0.5*(ts + tz); + tau_neg_eval_device[ tid ] = 0.5*(ts - tz); + + if constexpr (need_lapl) { + const double ls = lapl_pos_eval_device[ tid ]; + const double lz = lapl_neg_eval_device[ tid ]; + lapl_pos_eval_device[ tid ] = 0.5*(ls + lz); + lapl_neg_eval_device[ tid ] = 0.5*(ls - lz); + } + } + +} + + +__global__ void eval_tmat_mgga_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + const auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + + const auto* dden_sx_eval_device = task.dden_sx; + const auto* dden_sy_eval_device = task.dden_sy; + const auto* dden_sz_eval_device = task.dden_sz; + const auto* tdden_sx_eval_device = task.tdden_sx; + const auto* tdden_sy_eval_device = task.tdden_sy; + const auto* tdden_sz_eval_device = task.tdden_sz; + + const auto* weight_device = task.weights; + const auto* 
vgamma_device = task.vgamma; + const auto* v2rho2_device = task.v2rho2; + const auto* v2rhogamma_device = task.v2rhogamma; + const auto* v2gamma2_device = task.v2gamma2; + const auto* v2rhotau_device = task.v2rhotau; + const auto* v2tau2_device = task.v2tau2; + const auto* v2gammatau_device = task.v2gammatau; + const auto* trho_device = task.tden_s; + const auto* ttau_device = task.ttau_s; + + auto* FXC_A_device = task.FXC_A_s; + auto* FXC_Bx_device = task.FXC_Bx_s; + auto* FXC_By_device = task.FXC_By_s; + auto* FXC_Bz_device = task.FXC_Bz_s; + auto* FXC_C_device = task.FXC_C_s; + + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + + if( tid < npts ) { + const auto dx = dden_sx_eval_device[ tid ]; + const auto dy = dden_sy_eval_device[ tid ]; + const auto dz = dden_sz_eval_device[ tid ]; + const auto tdx = tdden_sx_eval_device[ tid ]; + const auto tdy = tdden_sy_eval_device[ tid ]; + const auto tdz = tdden_sz_eval_device[ tid ]; + const auto tgamma = tdx*dx + tdy*dy + tdz*dz; + + const auto FXC_A = v2rho2_device[ tid ] * trho_device[ tid ] + 2.0 * v2rhogamma_device[tid] * tgamma + + v2rhotau_device[ tid ] * ttau_device[ tid ]; + FXC_A_device[ tid ] = weight_device[ tid ] * FXC_A; + + const auto FXC_C = v2rhotau_device[ tid ] * trho_device[ tid ] + 2.0 * v2gammatau_device[ tid ] * tgamma + + v2tau2_device[ tid ] * ttau_device[ tid ]; + FXC_C_device[ tid ] = weight_device[ tid ] * FXC_C; + + const auto B_coef = v2rhogamma_device[tid] * trho_device[tid] + 2.0 * v2gamma2_device[tid] * tgamma + + v2gammatau_device[ tid ] * ttau_device[ tid ]; + FXC_Bx_device[ tid ] = 2.0 * weight_device[ tid ] * ( B_coef * dx + vgamma_device[ tid ] * tdx ); + FXC_By_device[ tid ] = 2.0 * weight_device[ tid ] * ( B_coef * dy + vgamma_device[ tid ] * tdy ); + FXC_Bz_device[ tid ] = 2.0 * weight_device[ tid ] * ( B_coef * dz + vgamma_device[ tid ] * tdz ); + } + +} + + + +__global__ void eval_tmat_mgga_uks_kernel( size_t ntasks, XCDeviceTask* tasks_device) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + const auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + + const auto* tden_s_device = task.tden_s; + const auto* tden_z_device = task.tden_z; + const auto* ttau_s_device = task.ttau_s; + const auto* ttau_z_device = task.ttau_z; + const auto* weight_device = task.weights; + + const auto* tden_pos_x_eval_device = task.tdden_sx; + const auto* tden_pos_y_eval_device = task.tdden_sy; + const auto* tden_pos_z_eval_device = task.tdden_sz; + const auto* den_pos_x_eval_device = task.dden_sx; + const auto* den_pos_y_eval_device = task.dden_sy; + const auto* den_pos_z_eval_device = task.dden_sz; + + const auto* tden_neg_x_eval_device = task.tdden_zx; + const auto* tden_neg_y_eval_device = task.tdden_zy; + const auto* tden_neg_z_eval_device = task.tdden_zz; + const auto* den_neg_x_eval_device = task.dden_zx; + const auto* den_neg_y_eval_device = task.dden_zy; + const auto* den_neg_z_eval_device = task.dden_zz; + + const double* vgamma_aa_device = task.vgamma_pp; + const double* vgamma_ab_device = task.vgamma_pm; + const double* vgamma_bb_device = task.vgamma_mm; + const double* v2rho2_a_a_device = task.v2rho2_a_a; + const double* v2rho2_a_b_device = task.v2rho2_a_b; + const double* v2rho2_b_b_device = task.v2rho2_b_b; + const double* v2rhogamma_a_aa_device = task.v2rhogamma_a_aa; + const double* v2rhogamma_a_ab_device = task.v2rhogamma_a_ab; + const double* v2rhogamma_a_bb_device = task.v2rhogamma_a_bb; + const double* v2rhogamma_b_aa_device = task.v2rhogamma_b_aa; 
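+    // The second functional derivatives gathered above and below (v2rho2, v2rhogamma,
+    // v2gamma2, v2rhotau, v2gammatau, v2tau2, in the alpha/beta spin basis) are
+    // contracted with the trial densities, gradients, and taus to form per-point
+    // coefficients, e.g. for the alpha density channel
+    //   A_a = v2rho2_a_a*trho_a + 2*v2rhogamma_a_aa*tgamma_pp + v2rhogamma_a_ab*tgamma_pm
+    //       + 2*v2rhogamma_a_bb*tgamma_mm + v2rho2_a_b*trho_b
+    //       + v2rhotau_a_a*tau_a + v2rhotau_a_b*tau_b
+    // The resulting A/B/C terms are quadrature-weighted and stored as s/z
+    // (sum/difference) combinations in the FXC_* arrays consumed by the
+    // downstream zmat/mmat FXC contraction kernels.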
+ const double* v2rhogamma_b_ab_device = task.v2rhogamma_b_ab; + const double* v2rhogamma_b_bb_device = task.v2rhogamma_b_bb; + const double* v2gamma2_aa_aa_device = task.v2gamma2_aa_aa; + const double* v2gamma2_aa_ab_device = task.v2gamma2_aa_ab; + const double* v2gamma2_aa_bb_device = task.v2gamma2_aa_bb; + const double* v2gamma2_ab_ab_device = task.v2gamma2_ab_ab; + const double* v2gamma2_ab_bb_device = task.v2gamma2_ab_bb; + const double* v2gamma2_bb_bb_device = task.v2gamma2_bb_bb; + const double* v2rhotau_a_a_device = task.v2rhotau_a_a; + const double* v2rhotau_a_b_device = task.v2rhotau_a_b; + const double* v2rhotau_b_a_device = task.v2rhotau_b_a; + const double* v2rhotau_b_b_device = task.v2rhotau_b_b; + const double* v2gammatau_aa_a_device= task.v2gammatau_aa_a; + const double* v2gammatau_aa_b_device= task.v2gammatau_aa_b; + const double* v2gammatau_ab_a_device= task.v2gammatau_ab_a; + const double* v2gammatau_ab_b_device= task.v2gammatau_ab_b; + const double* v2gammatau_bb_a_device= task.v2gammatau_bb_a; + const double* v2gammatau_bb_b_device= task.v2gammatau_bb_b; + const double* v2tau2_a_a_device = task.v2tau2_a_a; + const double* v2tau2_a_b_device = task.v2tau2_a_b; + const double* v2tau2_b_b_device = task.v2tau2_b_b; + + auto* FXC_A_s_device = task.FXC_A_s; + auto* FXC_A_z_device = task.FXC_A_z; + auto* FXC_Bx_s_device = task.FXC_Bx_s; + auto* FXC_Bx_z_device = task.FXC_Bx_z; + auto* FXC_By_s_device = task.FXC_By_s; + auto* FXC_By_z_device = task.FXC_By_z; + auto* FXC_Bz_s_device = task.FXC_Bz_s; + auto* FXC_Bz_z_device = task.FXC_Bz_z; + auto* FXC_C_s_device = task.FXC_C_s; + auto* FXC_C_z_device = task.FXC_C_z; + + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + + if( tid < npts ) { + const auto ps = tden_s_device[ tid ]; + const auto pz = tden_z_device[ tid ]; + const auto trho_a_device = 0.5*(ps + pz); + const auto trho_b_device = 0.5*(ps - pz); + const auto ts = ttau_s_device[ tid ]; + const auto tz = ttau_z_device[ tid ]; + const auto tau_a = 0.5*(ts + tz); + const auto tau_b = 0.5*(ts - tz); + + const auto tdndx = tden_pos_x_eval_device[ tid ]; + const auto tdndy = tden_pos_y_eval_device[ tid ]; + const auto tdndz = tden_pos_z_eval_device[ tid ]; + const auto tdMzdx = tden_neg_x_eval_device[ tid ]; + const auto tdMzdy = tden_neg_y_eval_device[ tid ]; + const auto tdMzdz = tden_neg_z_eval_device[ tid ]; + const auto tdden_a_x = 0.5*(tdndx + tdMzdx); + const auto tdden_a_y = 0.5*(tdndy + tdMzdy); + const auto tdden_a_z = 0.5*(tdndz + tdMzdz); + const auto tdden_b_x = 0.5*(tdndx - tdMzdx); + const auto tdden_b_y = 0.5*(tdndy - tdMzdy); + const auto tdden_b_z = 0.5*(tdndz - tdMzdz); + + const auto dndx = den_pos_x_eval_device[ tid ]; + const auto dndy = den_pos_y_eval_device[ tid ]; + const auto dndz = den_pos_z_eval_device[ tid ]; + const auto dMzdx = den_neg_x_eval_device[ tid ]; + const auto dMzdy = den_neg_y_eval_device[ tid ]; + const auto dMzdz = den_neg_z_eval_device[ tid ]; + const auto dden_a_x = 0.5*(dndx + dMzdx); + const auto dden_a_y = 0.5*(dndy + dMzdy); + const auto dden_a_z = 0.5*(dndz + dMzdz); + const auto dden_b_x = 0.5*(dndx - dMzdx); + const auto dden_b_y = 0.5*(dndy - dMzdy); + const auto dden_b_z = 0.5*(dndz - dMzdz); + + const auto tgamma_pp = tdden_a_x * dden_a_x + tdden_a_y * dden_a_y + tdden_a_z * dden_a_z; + const auto tgamma_pm = tdden_a_x * dden_b_x + tdden_a_y * dden_b_y + tdden_a_z * dden_b_z + + tdden_b_x * dden_a_x + tdden_b_y * dden_a_y + tdden_b_z * dden_a_z; + const auto tgamma_mm = tdden_b_x * dden_b_x + tdden_b_y * dden_b_y + 
tdden_b_z * dden_b_z; + + + const auto A_a = v2rho2_a_a_device[tid] * trho_a_device + 2.0 * v2rhogamma_a_aa_device[tid] * tgamma_pp + + v2rhogamma_a_ab_device[tid] * tgamma_pm + 2.0 * v2rhogamma_a_bb_device[tid] * tgamma_mm + + v2rho2_a_b_device[tid] * trho_b_device + v2rhotau_a_a_device[tid] * tau_a + + v2rhotau_a_b_device[tid] * tau_b; + const auto A_b = v2rho2_b_b_device[tid] * trho_b_device + 2.0 * v2rhogamma_b_bb_device[tid] * tgamma_mm + + v2rhogamma_b_ab_device[tid] * tgamma_pm + 2.0 * v2rhogamma_b_aa_device[tid] * tgamma_pp + + v2rho2_a_b_device[tid] * trho_a_device + v2rhotau_b_b_device[tid] * tau_b + + v2rhotau_b_a_device[tid] * tau_a; + FXC_A_s_device[ tid ] = 0.5 * weight_device[ tid ] * (A_a + A_b); + FXC_A_z_device[ tid ] = 0.5 * weight_device[ tid ] * (A_a - A_b); + + // Compute C coefficients for alpha and beta spin + const auto C_a = v2rhotau_a_a_device[tid] * trho_a_device + v2rhotau_b_a_device[tid] * trho_b_device + + 2.0 * v2gammatau_aa_a_device[tid] * tgamma_pp + v2gammatau_ab_a_device[tid] * tgamma_pm + + 2.0 * v2gammatau_bb_a_device[tid] * tgamma_mm + + v2tau2_a_a_device[tid] * tau_a + v2tau2_a_b_device[tid] * tau_b; + + const auto C_b = v2rhotau_a_b_device[tid] * trho_a_device + v2rhotau_b_b_device[tid] * trho_b_device + + 2.0 * v2gammatau_aa_b_device[tid] * tgamma_pp + v2gammatau_ab_b_device[tid] * tgamma_pm + + 2.0 * v2gammatau_bb_b_device[tid] * tgamma_mm + + v2tau2_a_b_device[tid] * tau_a + v2tau2_b_b_device[tid] * tau_b; + + FXC_C_s_device[tid] = 0.5 * weight_device[tid] * (C_a + C_b); + FXC_C_z_device[tid] = 0.5 * weight_device[tid] * (C_a - C_b); + + // Calculate B coefficients for alpha spin + const double B_coef1_a = v2rhogamma_a_aa_device[tid] * trho_a_device + 2.0 * v2gamma2_aa_aa_device[tid] * tgamma_pp + + v2gamma2_aa_ab_device[tid] * tgamma_pm + 2.0 * v2gamma2_aa_bb_device[tid] * tgamma_mm + + v2rhogamma_b_aa_device[tid] * trho_b_device + v2gammatau_aa_a_device[tid] * tau_a + + v2gammatau_aa_b_device[tid] * tau_b; + + const double B_coef2_a = v2rhogamma_a_ab_device[tid] * trho_a_device + 2.0 * v2gamma2_aa_ab_device[tid] * tgamma_pp + + v2gamma2_ab_ab_device[tid] * tgamma_pm + 2.0 * v2gamma2_ab_bb_device[tid] * tgamma_mm + + v2rhogamma_b_ab_device[tid] * trho_b_device + v2gammatau_ab_a_device[tid] * tau_a + + v2gammatau_ab_b_device[tid] * tau_b; + + // Calculate gradient components for alpha spin + const double Bx_a = 2.0 * B_coef1_a * dden_a_x + B_coef2_a * dden_b_x + + 2.0 * vgamma_aa_device[tid] * tdden_a_x + vgamma_ab_device[tid] * tdden_b_x; + + const double By_a = 2.0 * B_coef1_a * dden_a_y + B_coef2_a * dden_b_y + + 2.0 * vgamma_aa_device[tid] * tdden_a_y + vgamma_ab_device[tid] * tdden_b_y; + + const double Bz_a = 2.0 * B_coef1_a * dden_a_z + B_coef2_a * dden_b_z + + 2.0 * vgamma_aa_device[tid] * tdden_a_z + vgamma_ab_device[tid] * tdden_b_z; + + // Calculate B coefficients for beta spin + const double B_coef1_b = v2rhogamma_b_bb_device[tid] * trho_b_device + 2.0 * v2gamma2_bb_bb_device[tid] * tgamma_mm + + v2gamma2_ab_bb_device[tid] * tgamma_pm + 2.0 * v2gamma2_aa_bb_device[tid] * tgamma_pp + + v2rhogamma_a_bb_device[tid] * trho_a_device + v2gammatau_bb_b_device[tid] * tau_b + + v2gammatau_bb_a_device[tid] * tau_a; + + const double B_coef2_b = v2rhogamma_b_ab_device[tid] * trho_b_device + 2.0 * v2gamma2_ab_bb_device[tid] * tgamma_mm + + v2gamma2_ab_ab_device[tid] * tgamma_pm + 2.0 * v2gamma2_aa_ab_device[tid] * tgamma_pp + + v2rhogamma_a_ab_device[tid] * trho_a_device + v2gammatau_ab_b_device[tid] * tau_b + + v2gammatau_ab_a_device[tid] * 
tau_a; + + const double Bx_b = 2.0 * B_coef1_b * dden_b_x + B_coef2_b * dden_a_x + + 2.0 * vgamma_bb_device[tid] * tdden_b_x + vgamma_ab_device[tid] * tdden_a_x; + + const double By_b = 2.0 * B_coef1_b * dden_b_y + B_coef2_b * dden_a_y + + 2.0 * vgamma_bb_device[tid] * tdden_b_y + vgamma_ab_device[tid] * tdden_a_y; + + const double Bz_b = 2.0 * B_coef1_b * dden_b_z + B_coef2_b * dden_a_z + + 2.0 * vgamma_bb_device[tid] * tdden_b_z + vgamma_ab_device[tid] * tdden_a_z; + + // Store weighted values in output arrays + FXC_Bx_s_device[tid] = 0.5 * weight_device[tid] * (Bx_a + Bx_b); + FXC_By_s_device[tid] = 0.5 * weight_device[tid] * (By_a + By_b); + FXC_Bz_s_device[tid] = 0.5 * weight_device[tid] * (Bz_a + Bz_b); + FXC_Bx_z_device[tid] = 0.5 * weight_device[tid] * (Bx_a - Bx_b); + FXC_By_z_device[tid] = 0.5 * weight_device[tid] * (By_a - By_b); + FXC_Bz_z_device[tid] = 0.5 * weight_device[tid] * (Bz_a - Bz_b); + + } + +} + + +} diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/zmat_fxc.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/zmat_fxc.cu new file mode 100644 index 00000000..36ba9a16 --- /dev/null +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/zmat_fxc.cu @@ -0,0 +1,238 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#include "device/common/zmat_fxc.hpp" +#include +#include "device_specific/cuda_util.hpp" +#include "device_specific/cuda_device_constants.hpp" + +namespace GauXC { + + +template +__global__ void zmat_lda_fxc_kernel( size_t ntasks, + XCDeviceTask* tasks_device ) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + const auto nbf = task.bfn_screening.nbe; + const auto* FXC_A_device = task.FXC_A_s; + if constexpr ( den_selector == DEN_Z ) FXC_A_device = task.FXC_A_z; + + const auto* basis_eval_device = task.bf; + auto* z_matrix_device = task.zmat; + + const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + + if( tid_x < npts and tid_y < nbf ) { + + const size_t ibfoff = tid_y * npts + tid_x; + const double fact = 0.5 * FXC_A_device[tid_x]; + + z_matrix_device[ ibfoff ] = fact * basis_eval_device[ ibfoff ]; + } + +} + + + + + +template +__global__ void zmat_gga_fxc_kernel( size_t ntasks, + XCDeviceTask* tasks_device ) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + const auto nbf = task.bfn_screening.nbe; + + const auto* basis_eval_device = task.bf; + const auto* dbasis_x_eval_device = task.dbfx; + const auto* dbasis_y_eval_device = task.dbfy; + const auto* dbasis_z_eval_device = task.dbfz; + const auto* FXC_A_device = task.FXC_A_s; + const auto* FXC_Bx_device = task.FXC_Bx_s; + const auto* FXC_By_device = task.FXC_By_s; + const auto* FXC_Bz_device = task.FXC_Bz_s; + if constexpr ( den_selector == DEN_Z ) { + FXC_A_device = task.FXC_A_z; + FXC_Bx_device = task.FXC_Bx_z; + FXC_By_device = task.FXC_By_z; + FXC_Bz_device = task.FXC_Bz_z; + } + + auto* z_matrix_device = task.zmat; + + const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + const int tid_y = blockIdx.y * blockDim.y + 
threadIdx.y; + + if( tid_x < npts and tid_y < nbf ) { + + const size_t ibfoff = tid_y * npts + tid_x; + + const double dx = FXC_Bx_device[tid_x] * dbasis_x_eval_device[ ibfoff ]; + const double dy = FXC_By_device[tid_x] * dbasis_y_eval_device[ ibfoff ]; + const double dz = FXC_Bz_device[tid_x] * dbasis_z_eval_device[ ibfoff ]; + + z_matrix_device[ ibfoff ] = + (0.5 * FXC_A_device[tid_x] * basis_eval_device[ ibfoff ] + dx + dy + dz ); + } +} + + + +#define ZMAT_FXC_KERN(xc_approx) \ + cudaStream_t stream = queue.queue_as(); \ + dim3 threads(cuda::warp_size,cuda::max_warps_per_thread_block,1); \ + dim3 blocks( util::div_ceil( max_npts, threads.x ), \ + util::div_ceil( max_nbf, threads.y ), \ + ntasks ); \ + if ( sel == DEN_S ) zmat_##xc_approx##_fxc_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); \ + else if ( sel == DEN_Z ) zmat_##xc_approx##_fxc_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); \ + + + +void zmat_lda_fxc( size_t ntasks, + int32_t max_nbf, + int32_t max_npts, + XCDeviceTask* tasks_device, + density_id sel, + device_queue queue ) { +ZMAT_FXC_KERN(lda) +} + + + +void zmat_gga_fxc( size_t ntasks, + int32_t max_nbf, + int32_t max_npts, + XCDeviceTask* tasks_device, + density_id sel, + device_queue queue ) { +ZMAT_FXC_KERN(gga) +} + + + +void zmat_mgga_fxc( size_t ntasks, + int32_t max_nbf, + int32_t max_npts, + XCDeviceTask* tasks_device, + bool do_lapl, + density_id sel, + device_queue queue ) { + + cudaStream_t stream = queue.queue_as() ; + + + dim3 threads(cuda::warp_size,cuda::max_warps_per_thread_block,1); + dim3 blocks( util::div_ceil( max_npts, threads.x ), + util::div_ceil( max_nbf, threads.y ), + ntasks ); + + if(do_lapl) + GAUXC_GENERIC_EXCEPTION("Fxc contraction + do_lapl NYI"); + + switch(sel) { + case DEN_S: + zmat_gga_fxc_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + break; + case DEN_Z: + zmat_gga_fxc_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + break; + } + +} + + + + + + + + + + +template +__global__ void mmat_mgga_fxc_kernel( size_t ntasks, + XCDeviceTask* tasks_device ) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + const auto nbf = task.bfn_screening.nbe; + auto* FXC_C_s_device = task.FXC_C_s; + if constexpr ( id == DEN_Z ) FXC_C_s_device = task.FXC_C_z; + + const auto* dbasis_x_eval_device = task.dbfx; + const auto* dbasis_y_eval_device = task.dbfy; + const auto* dbasis_z_eval_device = task.dbfz; + + auto* mmat_x = task.xmat_x; + auto* mmat_y = task.xmat_y; + auto* mmat_z = task.xmat_z; + + const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + + if( tid_x < npts and tid_y < nbf ) { + + const size_t ibfoff = tid_y * npts + tid_x; + + const double fact = 0.25 * FXC_C_s_device[tid_x]; + + mmat_x[ ibfoff ] = fact * dbasis_x_eval_device[ ibfoff ]; + mmat_y[ ibfoff ] = fact * dbasis_y_eval_device[ ibfoff ]; + mmat_z[ ibfoff ] = fact * dbasis_z_eval_device[ ibfoff ]; + } +} + +void mmat_mgga_fxc( size_t ntasks, + int32_t max_nbf, + int32_t max_npts, + XCDeviceTask* tasks_device, + bool do_lapl, + density_id sel, + device_queue queue ) { + + cudaStream_t stream = queue.queue_as() ; + + + dim3 threads(cuda::warp_size,cuda::max_warps_per_thread_block,1); + dim3 blocks( util::div_ceil( max_npts, threads.x ), + util::div_ceil( max_nbf, threads.y ), + ntasks ); + + if(do_lapl) + GAUXC_GENERIC_EXCEPTION("Fxc contraction + 
do_lapl NYI"); + + switch(sel) { + case DEN_S: + mmat_mgga_fxc_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + break; + case DEN_Z: + mmat_mgga_fxc_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + break; + } + +} + +} + diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/zmat_vxc.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/zmat_vxc.cu index 616e0bd5..2e695785 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/kernels/zmat_vxc.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/kernels/zmat_vxc.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -393,6 +397,9 @@ __global__ void zmat_gga_vxc_gks_kernel( size_t ntasks, } } + + + template __global__ void zmat_mgga_vxc_rks_kernel( size_t ntasks, XCDeviceTask* tasks_device ) { @@ -444,6 +451,91 @@ __global__ void zmat_mgga_vxc_rks_kernel( size_t ntasks, } } +template +__global__ void zmat_mgga_vxc_uks_kernel( size_t ntasks, + XCDeviceTask* tasks_device ) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + const auto nbf = task.bfn_screening.nbe; + + const double* vrho_pos_device = task.vrho_pos; + const double* vrho_neg_device = task.vrho_neg; + const double* vlapl_pos_device = task.vlapl_pos; + const double* vlapl_neg_device = task.vlapl_neg; + const double* vgamma_pp_device = task.vgamma_pp; + const double* vgamma_pm_device = task.vgamma_pm; + const double* vgamma_mm_device = task.vgamma_mm; + + const auto* den_pos_x_eval_device = task.dden_sx; + const auto* den_pos_y_eval_device = task.dden_sy; + const auto* den_pos_z_eval_device = task.dden_sz; + const auto* den_neg_x_eval_device = task.dden_zx; + const auto* den_neg_y_eval_device = task.dden_zy; + const auto* den_neg_z_eval_device = task.dden_zz; + + + const auto* basis_eval_device = task.bf; + const auto* dbasis_x_eval_device = task.dbfx; + const auto* dbasis_y_eval_device = task.dbfy; + const auto* dbasis_z_eval_device = task.dbfz; + const auto* d2basis_lapl_eval_device = task.d2bflapl; + + auto* z_matrix_device = task.zmat; + + const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + + if( tid_x < npts and tid_y < nbf ) { + + const size_t ibfoff = tid_y * npts + tid_x; + + const double factp = 0.25 * vrho_pos_device[tid_x]; + const double factm = 0.25 * vrho_neg_device[tid_x]; + + const auto gga_fact_pp = vgamma_pp_device[tid_x]; + const auto gga_fact_pm = vgamma_pm_device[tid_x]; + const auto gga_fact_mm = vgamma_mm_device[tid_x]; + + const auto gga_fact_1 = 0.5*(gga_fact_pp + gga_fact_pm + gga_fact_mm); + const auto gga_fact_2 = 0.5*(gga_fact_pp - gga_fact_mm); + const auto gga_fact_3 = 0.5*(gga_fact_pp - gga_fact_pm + gga_fact_mm); + + double sign = 1.0; + + double x_fact, y_fact, z_fact; + + if constexpr ( den_selector == DEN_S ) { + x_fact = gga_fact_1 * den_pos_x_eval_device[ tid_x ] + gga_fact_2 * den_neg_x_eval_device[ tid_x ]; + y_fact = gga_fact_1 * den_pos_y_eval_device[ tid_x ] + gga_fact_2 * den_neg_y_eval_device[ tid_x ]; + z_fact = gga_fact_1 * 
den_pos_z_eval_device[ tid_x ] + gga_fact_2 * den_neg_z_eval_device[ tid_x ]; + } + if constexpr ( den_selector == DEN_Z ) { + sign = -1.0; + x_fact = gga_fact_3 * den_neg_x_eval_device[ tid_x ] + gga_fact_2 * den_pos_x_eval_device[ tid_x ]; + y_fact = gga_fact_3 * den_neg_y_eval_device[ tid_x ] + gga_fact_2 * den_pos_y_eval_device[ tid_x ]; + z_fact = gga_fact_3 * den_neg_z_eval_device[ tid_x ] + gga_fact_2 * den_pos_z_eval_device[ tid_x ]; + } + + auto val = x_fact * dbasis_x_eval_device[ ibfoff ] + + y_fact * dbasis_y_eval_device[ ibfoff ] + + z_fact * dbasis_z_eval_device[ ibfoff ] + + (factp + sign * factm) * basis_eval_device[ ibfoff ]; + + if constexpr (need_lapl) { + const double lfactp = vlapl_pos_device[tid_x]; + const double lfactm = vlapl_neg_device[tid_x]; + + val += 0.5 * (lfactp + sign * lfactm) * d2basis_lapl_eval_device[ ibfoff ]; + } + + z_matrix_device[ ibfoff ] = val; + } +} + #define ZMAT_VXC_KERN(xc_approx) \ @@ -503,6 +595,8 @@ void zmat_mgga_vxc( size_t ntasks, int32_t max_npts, XCDeviceTask* tasks_device, bool do_lapl, + integrator_ks_scheme scheme, + density_id sel, device_queue queue ) { cudaStream_t stream = queue.queue_as() ; @@ -513,10 +607,29 @@ void zmat_mgga_vxc( size_t ntasks, util::div_ceil( max_nbf, threads.y ), ntasks ); - if(do_lapl) - zmat_mgga_vxc_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); - else - zmat_mgga_vxc_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + if(scheme == RKS) { + if(do_lapl) + zmat_mgga_vxc_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + else + zmat_mgga_vxc_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + } else if(scheme == UKS) { + switch(sel) { + case DEN_S: + if(do_lapl) + zmat_mgga_vxc_uks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + else + zmat_mgga_vxc_uks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + break; + case DEN_Z: + if(do_lapl) + zmat_mgga_vxc_uks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + else + zmat_mgga_vxc_uks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + break; + } + } else { + GAUXC_GENERIC_EXCEPTION("MGGA + DEVICE + GKS NYI"); + } } @@ -571,6 +684,55 @@ __global__ void mmat_mgga_vxc_rks_kernel( size_t ntasks, } } +template +__global__ void mmat_mgga_vxc_uks_kernel( size_t ntasks, + XCDeviceTask* tasks_device ) { + + const int batch_idx = blockIdx.z; + if( batch_idx >= ntasks ) return; + + auto& task = tasks_device[ batch_idx ]; + const auto npts = task.npts; + const auto nbf = task.bfn_screening.nbe; + const auto* vtau_pos_device = task.vtau_pos; + const auto* vtau_neg_device = task.vtau_neg; + const double* vlapl_pos_device = need_lapl ? task.vlapl_pos : nullptr; + const double* vlapl_neg_device = need_lapl ? 
task.vlapl_neg : nullptr; + + const auto* dbasis_x_eval_device = task.dbfx; + const auto* dbasis_y_eval_device = task.dbfy; + const auto* dbasis_z_eval_device = task.dbfz; + + auto* mmat_x = task.xmat_x; + auto* mmat_y = task.xmat_y; + auto* mmat_z = task.xmat_z; + + const int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + const int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + + if( tid_x < npts and tid_y < nbf ) { + + double sign = 1.0; + if(id == DEN_Z) sign = -1; + + const size_t ibfoff = tid_y * npts + tid_x; + const auto tfactp = 0.25 * vtau_pos_device[tid_x]; + const auto tfactm = 0.25 * vtau_neg_device[tid_x]; + const double fact_tau = 0.5 * (tfactp + sign * tfactm); + double fact_lapl = 0.0; + if(need_lapl) { + const auto lfactp = vlapl_pos_device[tid_x]; + const auto lfactm = vlapl_neg_device[tid_x]; + fact_lapl = 0.5 * (lfactp + sign * lfactm); + } + const double fact_1 = fact_tau + fact_lapl; + + mmat_x[ ibfoff ] = fact_1 * dbasis_x_eval_device[ ibfoff ]; + mmat_y[ ibfoff ] = fact_1 * dbasis_y_eval_device[ ibfoff ]; + mmat_z[ ibfoff ] = fact_1 * dbasis_z_eval_device[ ibfoff ]; + } +} + //__global__ void print_zmat_stats( size_t ntasks, // XCDeviceTask* tasks_device) { // @@ -597,7 +759,7 @@ __global__ void mmat_mgga_vxc_rks_kernel( size_t ntasks, // const auto* vrho = task.vrho; // const auto* gamma = task.gamma; // const auto* tau = task.tau; -// const auto* lapl = task.denlapl; +// const auto* lapl = task.lapl; // const auto* rho = task.den; // double enrm = 0.0, gnrm = 0.0, tnrm = 0.0, rnrm = 0.0, lnrm = 0.0; // double vgnrm = 0.0, vtnrm = 0.0, vrnrm = 0.0, vlnrm = 0.0; @@ -625,6 +787,8 @@ void mmat_mgga_vxc( size_t ntasks, int32_t max_npts, XCDeviceTask* tasks_device, bool do_lapl, + integrator_ks_scheme scheme, + density_id sel, device_queue queue ) { cudaStream_t stream = queue.queue_as() ; @@ -635,10 +799,30 @@ void mmat_mgga_vxc( size_t ntasks, util::div_ceil( max_nbf, threads.y ), ntasks ); - if(do_lapl) - mmat_mgga_vxc_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); - else - mmat_mgga_vxc_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + if(scheme == RKS) { + if(do_lapl) + mmat_mgga_vxc_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + else + mmat_mgga_vxc_rks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + } else if(scheme == UKS) { + switch(sel) { + case DEN_S: + if(do_lapl) + mmat_mgga_vxc_uks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + else + mmat_mgga_vxc_uks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + break; + case DEN_Z: + if(do_lapl) + mmat_mgga_vxc_uks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + else + mmat_mgga_vxc_uks_kernel<<< blocks, threads, 0, stream >>>( ntasks, tasks_device ); + break; + } + } else { + GAUXC_GENERIC_EXCEPTION("MGGA + DEVICE + GKS NYI"); + } + //print_zmat_stats<<<1,1,0,stream>>>(ntasks,tasks_device); } diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/CMakeLists.txt b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/CMakeLists.txt index 5f18e92f..6bdf66a7 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/CMakeLists.txt +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. 
of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/include/gpu/chebyshev_boys_computation.hpp b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/include/gpu/chebyshev_boys_computation.hpp index 8726eba1..110f76d0 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/include/gpu/chebyshev_boys_computation.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/include/gpu/chebyshev_boys_computation.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/include/gpu/integral_data_types.hpp b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/include/gpu/integral_data_types.hpp index 552656ea..3a2ca466 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/include/gpu/integral_data_types.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/include/gpu/integral_data_types.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/include/gpu/obara_saika_integrals.hpp b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/include/gpu/obara_saika_integrals.hpp index 22c554b7..14ea3134 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/include/gpu/obara_saika_integrals.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/include/gpu/obara_saika_integrals.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/chebyshev_boys_computation.cu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/chebyshev_boys_computation.cu index b6385468..e607d086 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/chebyshev_boys_computation.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/chebyshev_boys_computation.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. 
of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/config_obara_saika.hpp b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/config_obara_saika.hpp index 98f8543b..d8a472f3 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/config_obara_saika.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/config_obara_saika.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_0.cu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_0.cu index 17a74fb1..23eb95ce 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_0.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_0.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_0.hu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_0.hu index a5db5b61..6779a4ea 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_0.hu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_0.hu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_0_0.cu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_0_0.cu index 248977a8..ec51003c 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_0_0.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_0_0.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -355,7 +359,7 @@ struct DeviceTask00 { static constexpr bool use_shared = (primpair_shared_limit > 0) && (primpair_shared_limit <= max_primpair_shared_limit); - static constexpr int num_warps = points_per_subtask / cuda::warp_size; + static constexpr int num_warps = points_per_subtask / GauXC::cuda::warp_size; // Cannot declare shared memory array with length 0 static constexpr int prim_buffer_size = (use_shared) ? num_warps * primpair_shared_limit : 1; @@ -382,8 +386,8 @@ struct DeviceTask00 { double *Gi = param.Gi; double *Gj = param.Gj; - const int laneId = threadIdx.x % cuda::warp_size; - const int warpId __attribute__((unused)) = threadIdx.x / cuda::warp_size; + const int laneId = threadIdx.x % GauXC::cuda::warp_size; + const int warpId __attribute__((unused)) = threadIdx.x / GauXC::cuda::warp_size; __shared__ GauXC::PrimitivePair s_prim_pairs[prim_buffer_size] __attribute__((unused)); @@ -397,7 +401,7 @@ struct DeviceTask00 { for (int i = 0; i < num_warps; i++) { double temp = SCALAR_ZERO(); - const int pointIndex = i * cuda::warp_size + laneId; + const int pointIndex = i * GauXC::cuda::warp_size + laneId; if (pointIndex < npts) { diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_0_0.hu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_0_0.hu index 02d4d40b..62cd3d53 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_0_0.hu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_0_0.hu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1.cu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1.cu index 334fa3bb..667e851b 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1.hu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1.hu index bf9a4841..16f8324e 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1.hu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1.hu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
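// Illustrative sketch (not from the patch): the integral_1_0.cu / integral_1_1.cu
// hunks below thread pure_bra / pure_ket flags through the DeviceTask structs so
// that p (l = 1) shells stored in real solid-harmonic order can feed kernels that
// work internally in Cartesian (x, y, z) order. For p shells the two layouts differ
// only by a permutation: assuming the common (m = -1, 0, +1) <-> (y, z, x) ordering,
// which is consistent with the offsets used in the diff, Cartesian x/y/z are read
// from spherical slots 2/0/1 and results are scattered back the same way. The tiny
// host-side helpers below (names made up for illustration) show just that permutation.
#include <array>
#include <cstdio>

// Spherical storage (p_-1, p_0, p_+1) -> Cartesian working order (x, y, z);
// mirrors the pure_bra load pattern Xik_0/1/2 <- offsets 2/0/1.
static std::array<double,3> p_sph_to_cart(const std::array<double,3>& s) {
  return { s[2], s[0], s[1] };
}

// Cartesian results -> spherical storage; mirrors the pure_bra atomicAdd
// pattern that writes components 0/1/2 to Gik offsets 2/0/1.
static std::array<double,3> p_cart_to_sph(const std::array<double,3>& c) {
  return { c[1], c[2], c[0] };
}

int main() {
  const std::array<double,3> sph{0.1, 0.2, 0.3};   // (p_-1, p_0, p_+1)
  const auto cart = p_sph_to_cart(sph);            // (x, y, z) = (0.3, 0.1, 0.2)
  const auto back = p_cart_to_sph(cart);           // round-trips to the input
  std::printf("%g %g %g\n", back[0], back[1], back[2]);
  return 0;
}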
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1_0.cu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1_0.cu index 9dfcad3f..71313b04 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1_0.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1_0.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -352,7 +356,8 @@ using namespace GauXC; } -template +template struct DeviceTask10 { static constexpr int max_primpair_shared_limit = 32; @@ -365,7 +370,7 @@ struct DeviceTask10 { static constexpr bool use_shared = (primpair_shared_limit > 0) && (primpair_shared_limit <= max_primpair_shared_limit); - static constexpr int num_warps = points_per_subtask / cuda::warp_size; + static constexpr int num_warps = points_per_subtask / GauXC::cuda::warp_size; // Cannot declare shared memory array with length 0 static constexpr int prim_buffer_size = (use_shared) ? num_warps * primpair_shared_limit : 1; @@ -393,12 +398,12 @@ struct DeviceTask10 { double *Gj = param.Gj; static constexpr bool use_shared = (primpair_shared_limit > 0); - static constexpr int num_warps = points_per_subtask / cuda::warp_size; + static constexpr int num_warps = points_per_subtask / GauXC::cuda::warp_size; // Cannot declare shared memory array with length 0 static constexpr int prim_buffer_size = (use_shared) ? 
num_warps * primpair_shared_limit : 1; - const int laneId = threadIdx.x % cuda::warp_size; - const int warpId __attribute__((unused)) = threadIdx.x / cuda::warp_size; + const int laneId = threadIdx.x % GauXC::cuda::warp_size; + const int warpId __attribute__((unused)) = threadIdx.x / GauXC::cuda::warp_size; __shared__ GauXC::PrimitivePair s_prim_pairs[prim_buffer_size] __attribute__((unused)); @@ -414,7 +419,7 @@ struct DeviceTask10 { double temp_1 = SCALAR_ZERO(); double temp_2 = SCALAR_ZERO(); - const int pointIndex = i * cuda::warp_size + laneId; + const int pointIndex = i * GauXC::cuda::warp_size + laneId; if (pointIndex < npts) { const double point_x = s_task_data[pointIndex].x; @@ -490,31 +495,46 @@ struct DeviceTask10 { SCALAR_TYPE const_value_w; SCALAR_TYPE tx, ty, tz, tw, t0, t1, t2; + SCALAR_TYPE Xik_0, Xik_1, Xik_2; + + if constexpr (pure_bra) { + Xik_0 = SCALAR_LOAD((Xik + 2*ldX)); + Xik_1 = SCALAR_LOAD((Xik + 0*ldX)); + Xik_2 = SCALAR_LOAD((Xik + 1*ldX)); + } else { + Xik_0 = SCALAR_LOAD((Xik + 0*ldX)); + Xik_1 = SCALAR_LOAD((Xik + 1*ldX)); + Xik_2 = SCALAR_LOAD((Xik + 2*ldX)); + } + X_ABp = 1.0; comb_m_i = 1.0; Y_ABp = 1.0; comb_n_j = 1.0; Z_ABp = 1.0; comb_p_k = 1.0; const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; ty = SCALAR_LOAD((Xjk + 0 * ldX)); t0 = SCALAR_MUL(temp_0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_MUL(tx, t0); - atomicAdd((Gik + 0 * ldG), tz); + if constexpr (pure_bra) atomicAdd((Gik + 2 * ldG), tz); + else atomicAdd((Gik + 0 * ldG), tz); - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_MUL(temp_1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - atomicAdd((Gik + 1 * ldG), tz); + if constexpr (pure_bra) atomicAdd((Gik + 0 * ldG), tz); + else atomicAdd((Gik + 1 * ldG), tz); - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_MUL(temp_2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - atomicAdd((Gik + 2 * ldG), tz); + if constexpr (pure_bra) atomicAdd((Gik + 1 * ldG), tz); + else atomicAdd((Gik + 2 * ldG), tz); atomicAdd((Gjk + 0 * ldG), tw); } @@ -525,15 +545,28 @@ struct DeviceTask10 { }; template -using AM10_swap = DeviceTask10; +using AM10_swap_cart = DeviceTask10; + +template +using AM10_cart = DeviceTask10; + +template +using AM10_swap_sph = DeviceTask10; template -using AM10 = DeviceTask10; +using AM10_sph = DeviceTask10; void integral_1_0_task_batched( bool swap, + bool sph, size_t ntasks, size_t nsubtask, int max_primpair, size_t max_nsp, GauXC::XCDeviceTask* device_tasks, @@ -554,21 +587,39 @@ using AM10 = DeviceTask10( - nblocks, nthreads, max_primpair, stream, - ntasks, nsubtask, - device_tasks, task2sp, - (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, - sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, - boys_table ); + if(sph) + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); + else + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); } else { - dev_integral_task_map_dispatcher( - nblocks, nthreads, max_primpair, stream, - ntasks, 
nsubtask, - device_tasks, task2sp, - (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, - sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, - boys_table ); + if(sph) + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); + else + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); } } } diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1_0.hu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1_0.hu index 3851a865..21273e23 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1_0.hu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1_0.hu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -33,6 +37,7 @@ namespace XGPU { void integral_1_0_task_batched( bool swap, + bool sph, size_t ntasks, size_t nsubtasks, int max_primpairs, size_t max_nsp, diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1_1.cu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1_1.cu index f60591c4..fae49afc 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1_1.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1_1.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -780,7 +784,8 @@ using namespace GauXC; } -template +template struct DeviceTask11 { static constexpr int max_primpair_shared_limit = 32; @@ -794,7 +799,7 @@ struct DeviceTask11 { static constexpr bool use_shared = (primpair_shared_limit > 0) && (primpair_shared_limit <= max_primpair_shared_limit); - static constexpr int num_warps = points_per_subtask / cuda::warp_size; + static constexpr int num_warps = points_per_subtask / GauXC::cuda::warp_size; // Cannot declare shared memory array with length 0 static constexpr int prim_buffer_size = (use_shared) ? num_warps * primpair_shared_limit : 1; @@ -825,12 +830,12 @@ struct DeviceTask11 { const double Z_AB = param.Z_AB; static constexpr bool use_shared = (primpair_shared_limit > 0); - static constexpr int num_warps = points_per_subtask / cuda::warp_size; + static constexpr int num_warps = points_per_subtask / GauXC::cuda::warp_size; // Cannot declare shared memory array with length 0 static constexpr int prim_buffer_size = (use_shared) ? 
num_warps * primpair_shared_limit : 1; - const int laneId = threadIdx.x % cuda::warp_size; - const int warpId __attribute__((unused)) = threadIdx.x / cuda::warp_size; + const int laneId = threadIdx.x % GauXC::cuda::warp_size; + const int warpId __attribute__((unused)) = threadIdx.x / GauXC::cuda::warp_size; __shared__ GauXC::PrimitivePair s_prim_pairs[prim_buffer_size] __attribute__((unused)); @@ -859,7 +864,7 @@ struct DeviceTask11 { temp_7 = SCALAR_ZERO(); temp_8 = SCALAR_ZERO(); - const int pointIndex = i * cuda::warp_size + laneId; + const int pointIndex = i * GauXC::cuda::warp_size + laneId; if (pointIndex < npts) { const double point_x = s_task_data[pointIndex].x; @@ -990,6 +995,34 @@ struct DeviceTask11 { SCALAR_TYPE const_value_w; SCALAR_TYPE tx, ty, tz, tw, t0, t1, t2; + SCALAR_TYPE Xik_0, Xik_1, Xik_2; + SCALAR_TYPE Xjk_0, Xjk_1, Xjk_2; + SCALAR_TYPE Gjk_0, Gjk_1, Gjk_2; + + if constexpr (pure_bra) { + Xik_0 = SCALAR_LOAD((Xik + 2*ldX)); + Xik_1 = SCALAR_LOAD((Xik + 0*ldX)); + Xik_2 = SCALAR_LOAD((Xik + 1*ldX)); + } else { + Xik_0 = SCALAR_LOAD((Xik + 0*ldX)); + Xik_1 = SCALAR_LOAD((Xik + 1*ldX)); + Xik_2 = SCALAR_LOAD((Xik + 2*ldX)); + } + + if constexpr (pure_ket) { + Xjk_0 = SCALAR_LOAD((Xjk + 2*ldX)); + Xjk_1 = SCALAR_LOAD((Xjk + 0*ldX)); + Xjk_2 = SCALAR_LOAD((Xjk + 1*ldX)); + } else { + Xjk_0 = SCALAR_LOAD((Xjk + 0*ldX)); + Xjk_1 = SCALAR_LOAD((Xjk + 1*ldX)); + Xjk_2 = SCALAR_LOAD((Xjk + 2*ldX)); + } + + Gjk_0 = 0; + Gjk_1 = 0; + Gjk_2 = 0; + /**** j = 0 ****/ X_ABp = 1.0; comb_m_i = 1.0; Y_ABp = 1.0; comb_n_j = 1.0; @@ -997,20 +1030,20 @@ struct DeviceTask11 { const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); - ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tx = Xik_0; + ty = Xjk_0; t0 = SCALAR_MUL(temp_3, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_MUL(tx, t0); outBuffer[threadIdx.x][0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_MUL(temp_4, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); outBuffer[threadIdx.x][1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_MUL(temp_5, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); @@ -1022,24 +1055,24 @@ struct DeviceTask11 { const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_MUL(temp_0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); outBuffer[threadIdx.x][0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_MUL(temp_1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); outBuffer[threadIdx.x][1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_MUL(temp_2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); outBuffer[threadIdx.x][2] += tz; - if constexpr (!diag) atomicAdd((Gjk + 0 * ldG), tw); + if constexpr (!diag) Gjk_0 = tw; /**** j = 1 ****/ X_ABp = 1.0; comb_m_i = 1.0; @@ -1048,20 +1081,20 @@ struct DeviceTask11 { const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); - ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tx = Xik_0; + ty = Xjk_1; t0 = SCALAR_MUL(temp_4, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_MUL(tx, t0); outBuffer[threadIdx.x][0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); 
+ tx = Xik_1; t1 = SCALAR_MUL(temp_6, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); outBuffer[threadIdx.x][1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_MUL(temp_7, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); @@ -1072,24 +1105,24 @@ struct DeviceTask11 { const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_MUL(temp_0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); outBuffer[threadIdx.x][0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_MUL(temp_1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); outBuffer[threadIdx.x][1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_MUL(temp_2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); outBuffer[threadIdx.x][2] += tz; - if constexpr (!diag) atomicAdd((Gjk + 1 * ldG), tw); + if constexpr (!diag) Gjk_1 = tw; /**** j = 2 ****/ X_ABp = 1.0; comb_m_i = 1.0; @@ -1098,20 +1131,20 @@ struct DeviceTask11 { const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); - ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tx = Xik_0; + ty = Xjk_2; t0 = SCALAR_MUL(temp_5, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_MUL(tx, t0); outBuffer[threadIdx.x][0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_MUL(temp_7, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); outBuffer[threadIdx.x][1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_MUL(temp_8, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); @@ -1121,28 +1154,46 @@ struct DeviceTask11 { const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_MUL(temp_0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); outBuffer[threadIdx.x][0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_MUL(temp_1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); outBuffer[threadIdx.x][1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_MUL(temp_2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); outBuffer[threadIdx.x][2] += tz; - if constexpr (!diag) atomicAdd((Gjk + 2 * ldG), tw); - - atomicAdd((Gik + 0 * ldG), outBuffer[threadIdx.x][0]); - atomicAdd((Gik + 1 * ldG), outBuffer[threadIdx.x][1]); - atomicAdd((Gik + 2 * ldG), outBuffer[threadIdx.x][2]); + if constexpr (!diag) Gjk_2 = tw; + + if constexpr (!diag) { + if constexpr (pure_ket) { + atomicAdd((Gjk + 2 * ldG), Gjk_0); + atomicAdd((Gjk + 0 * ldG), Gjk_1); + atomicAdd((Gjk + 1 * ldG), Gjk_2); + } else { + atomicAdd((Gjk + 0 * ldG), Gjk_0); + atomicAdd((Gjk + 1 * ldG), Gjk_1); + atomicAdd((Gjk + 2 * ldG), Gjk_2); + } + } + + if constexpr (pure_bra) { + atomicAdd((Gik + 2 * ldG), outBuffer[threadIdx.x][0]); + atomicAdd((Gik + 0 * ldG), outBuffer[threadIdx.x][1]); + atomicAdd((Gik + 1 * ldG), outBuffer[threadIdx.x][2]); + } else { + atomicAdd((Gik + 0 * ldG), outBuffer[threadIdx.x][0]); + atomicAdd((Gik + 1 * ldG), outBuffer[threadIdx.x][1]); + atomicAdd((Gik + 2 * ldG), outBuffer[threadIdx.x][2]); + } } } @@ -1152,14 +1203,26 @@ struct DeviceTask11 { }; 
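// Illustrative sketch (not from the patch): the alias templates and *_task_batched
// wrappers that follow take the new runtime `bool sph` flag (alongside the existing
// `bool swap`) and fan out to DeviceTask instantiations whose pure_bra / pure_ket
// flags are compile-time template parameters, so the per-point branches inside the
// kernels stay `if constexpr` and carry no runtime cost. A minimal version of that
// runtime-flag-to-template dispatch, with made-up names (Task, launch_task) standing
// in for the GauXC machinery:
#include <cstdio>

template <bool PureBra, bool PureKet>
struct Task {
  static void run() {
    // In the real kernels this selects between Cartesian and solid-harmonic
    // load/store orderings via `if constexpr (PureBra) ...`.
    std::printf("PureBra=%d PureKet=%d\n", int(PureBra), int(PureKet));
  }
};

// Host-side dispatcher: converts the runtime flags into template arguments once,
// mirroring the nested if(swap)/if(sph) blocks added in integral_*_task_batched.
inline void launch_task(bool pure_bra, bool pure_ket) {
  if (pure_bra) {
    if (pure_ket) Task<true , true >::run();
    else          Task<true , false>::run();
  } else {
    if (pure_ket) Task<false, true >::run();
    else          Task<false, false>::run();
  }
}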
template -using AM11 = DeviceTask11; - +using AM11_cart = DeviceTask11; template -using AM1 = DeviceTask11; +using AM1_cart = DeviceTask11; +template +using AM11_sph = DeviceTask11; +template +using AM1_sph = DeviceTask11; + + void integral_1_1_task_batched( + bool sph, size_t ntasks, size_t nsubtask, int max_primpair, size_t max_nsp, GauXC::XCDeviceTask* device_tasks, @@ -1179,16 +1242,26 @@ using AM1 = DeviceTask11( - nblocks, nthreads, max_primpair, stream, - ntasks, nsubtask, - device_tasks, task2sp, - (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, - sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, - boys_table ); + if(sph) + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); + else + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); } void integral_1_task_batched( + bool sph, size_t ntasks, size_t nsubtask, int max_primpair, size_t max_nsp, GauXC::XCDeviceTask* device_tasks, @@ -1208,12 +1281,21 @@ using AM1 = DeviceTask11( - nblocks, nthreads, max_primpair, stream, - ntasks, nsubtask, - device_tasks, task2sp, - (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, - sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, - boys_table ); + if(sph) + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); + else + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); } } diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1_1.hu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1_1.hu index f19342e4..222765fd 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1_1.hu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_1_1.hu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -38,6 +42,7 @@ namespace XGPU { cudaStream_t stream); void integral_1_1_task_batched( + bool sph, size_t ntasks, size_t nsubtasks, int max_primpairs, size_t max_nsp, @@ -53,6 +58,7 @@ namespace XGPU { cudaStream_t stream); void integral_1_task_batched( + bool sph, size_t ntasks, size_t nsubtask, int max_primpairs, size_t max_nsp, GauXC::XCDeviceTask* device_tasks, diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2.cu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2.cu index 7c555bd6..e8318b2a 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2.hu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2.hu index 5c250e4b..09e63bd4 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2.hu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2.hu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_0.cu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_0.cu index 70631f8c..ecbfa6e3 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_0.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_0.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -450,7 +454,8 @@ using namespace GauXC; nsp, sp2task, device_tasks, boys_table ); } -template +template struct DeviceTask20 { static constexpr int max_primpair_shared_limit = 32; @@ -463,7 +468,7 @@ struct DeviceTask20 { static constexpr bool use_shared = (primpair_shared_limit > 0) && (primpair_shared_limit <= max_primpair_shared_limit); - static constexpr int num_warps = points_per_subtask / cuda::warp_size; + static constexpr int num_warps = points_per_subtask / GauXC::cuda::warp_size; // Cannot declare shared memory array with length 0 static constexpr int prim_buffer_size = (use_shared) ? 
num_warps * primpair_shared_limit : 1; @@ -491,12 +496,12 @@ struct DeviceTask20 { double *Gj = param.Gj; static constexpr bool use_shared = (primpair_shared_limit > 0); - static constexpr int num_warps = points_per_subtask / cuda::warp_size; + static constexpr int num_warps = points_per_subtask / GauXC::cuda::warp_size; // Cannot declare shared memory array with length 0 static constexpr int prim_buffer_size = (use_shared) ? num_warps * primpair_shared_limit : 1; - const int laneId = threadIdx.x % cuda::warp_size; - const int warpId __attribute__((unused)) = threadIdx.x / cuda::warp_size; + const int laneId = threadIdx.x % GauXC::cuda::warp_size; + const int warpId __attribute__((unused)) = threadIdx.x / GauXC::cuda::warp_size; __shared__ GauXC::PrimitivePair s_prim_pairs[prim_buffer_size] __attribute__((unused)); @@ -516,7 +521,7 @@ struct DeviceTask20 { double temp_4 = SCALAR_ZERO(); double temp_5 = SCALAR_ZERO(); - const int pointIndex = i * cuda::warp_size + laneId; + const int pointIndex = i * GauXC::cuda::warp_size + laneId; if (pointIndex < npts) { @@ -649,47 +654,90 @@ struct DeviceTask20 { SCALAR_TYPE const_value_w; SCALAR_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5; + SCALAR_TYPE Xik_0, Xik_1, Xik_2, Xik_3, Xik_4, Xik_5; + SCALAR_TYPE Xjk_0; + SCALAR_TYPE Gik_0, Gik_1, Gik_2, Gik_3, Gik_4, Gik_5; + + if constexpr (pure_bra) { + SCALAR_TYPE Xik_m2 = SCALAR_LOAD((Xik + 0*ldX)); + SCALAR_TYPE Xik_m1 = SCALAR_LOAD((Xik + 1*ldX)); + SCALAR_TYPE Xik_z0 = SCALAR_LOAD((Xik + 2*ldX)); + SCALAR_TYPE Xik_p1 = SCALAR_LOAD((Xik + 3*ldX)); + SCALAR_TYPE Xik_p2 = SCALAR_LOAD((Xik + 4*ldX)); + + ::cuda::std::tie(Xik_0, Xik_1, Xik_2, Xik_3, Xik_4, Xik_5) = + sph::itform_l2(Xik_m2, Xik_m1, Xik_z0, Xik_p1, Xik_p2); + } else { + Xik_0 = SCALAR_LOAD((Xik + 0*ldX)); + Xik_1 = SCALAR_LOAD((Xik + 1*ldX)); + Xik_2 = SCALAR_LOAD((Xik + 2*ldX)); + Xik_3 = SCALAR_LOAD((Xik + 3*ldX)); + Xik_4 = SCALAR_LOAD((Xik + 4*ldX)); + Xik_5 = SCALAR_LOAD((Xik + 5*ldX)); + } + + Xjk_0 = SCALAR_LOAD((Xjk + 0*ldX)); + X_ABp = 1.0; comb_m_i = 1.0; Y_ABp = 1.0; comb_n_j = 1.0; Z_ABp = 1.0; comb_p_k = 1.0; const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); - ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tx = Xik_0; + ty = Xjk_0; t0 = SCALAR_MUL(temp_0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_MUL(tx, t0); - atomicAdd((Gik + 0 * ldG), tz); + Gik_0 = tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_MUL(temp_1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - atomicAdd((Gik + 1 * ldG), tz); + Gik_1 = tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_MUL(temp_2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - atomicAdd((Gik + 2 * ldG), tz); + Gik_2 = tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_MUL(temp_3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - atomicAdd((Gik + 3 * ldG), tz); + Gik_3 = tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_MUL(temp_4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - atomicAdd((Gik + 4 * ldG), tz); + Gik_4 = tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_MUL(temp_5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - atomicAdd((Gik + 5 * ldG), tz); + Gik_5 = tz; + + if constexpr (pure_bra) { + SCALAR_TYPE Gik_m2, Gik_m1, Gik_z0, Gik_p1, Gik_p2; + + ::cuda::std::tie(Gik_m2, 
Gik_m1, Gik_z0, Gik_p1, Gik_p2) = + sph::tform_l2(Gik_0, Gik_1, Gik_2, Gik_3, Gik_4, Gik_5); + atomicAdd((Gik + 0 * ldG), Gik_m2); + atomicAdd((Gik + 1 * ldG), Gik_m1); + atomicAdd((Gik + 2 * ldG), Gik_z0); + atomicAdd((Gik + 3 * ldG), Gik_p1); + atomicAdd((Gik + 4 * ldG), Gik_p2); + } else { + atomicAdd((Gik + 0 * ldG), Gik_0); + atomicAdd((Gik + 1 * ldG), Gik_1); + atomicAdd((Gik + 2 * ldG), Gik_2); + atomicAdd((Gik + 3 * ldG), Gik_3); + atomicAdd((Gik + 4 * ldG), Gik_4); + atomicAdd((Gik + 5 * ldG), Gik_5); + } atomicAdd((Gjk + 0 * ldG), tw); } @@ -700,16 +748,28 @@ struct DeviceTask20 { }; template -using AM20_swap = DeviceTask20; +using AM20_swap_cart = DeviceTask20; template -using AM20 = DeviceTask20; +using AM20_cart = DeviceTask20; +template +using AM20_swap_sph = DeviceTask20; + +template +using AM20_sph = DeviceTask20; void integral_2_0_task_batched( bool swap, + bool sph, size_t ntasks, size_t nsubtask, int max_primpair, size_t max_nsp, GauXC::XCDeviceTask* device_tasks, @@ -731,21 +791,39 @@ using AM20 = DeviceTask20( - nblocks, nthreads, max_primpair, stream, - ntasks, nsubtask, - device_tasks, task2sp, - (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, - sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, - boys_table ); + if(sph) + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); + else + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); } else { - dev_integral_task_map_dispatcher( - nblocks, nthreads, max_primpair, stream, - ntasks, nsubtask, - device_tasks, task2sp, - (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, - sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, - boys_table ); + if(sph) + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); + else + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); } } } diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_0.hu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_0.hu index 7b261336..47b6cec0 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_0.hu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_0.hu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
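// Illustrative sketch (not from the patch): the d-shell (l = 2) paths above and
// below route solid-harmonic data through sph::itform_l2 / sph::tform_l2 so the
// Obara-Saika kernels can keep working on the six Cartesian components
// (xx, xy, xz, yy, yz, zz). Conceptually this is a 5x6 linear map and a matching
// 6x5 map back. The coefficients below follow one common real-solid-harmonic
// convention and sketch the reverse map as the plain transpose; the actual
// ordering/normalization baked into GauXC's sph:: helpers may differ, and the
// function names here are made up, so treat this purely as an illustration.
#include <array>
#include <cmath>

using Cart6 = std::array<double, 6>; // (xx, xy, xz, yy, yz, zz)
using Sph5  = std::array<double, 5>; // (m = -2, -1, 0, +1, +2)

// Cartesian -> spherical (the "tform"-like direction used when writing Gik back).
inline Sph5 d_cart_to_sph(const Cart6& c) {
  const double s3 = std::sqrt(3.0);
  return { s3 * c[1],                      // d_{-2} ~ sqrt(3) * xy
           s3 * c[4],                      // d_{-1} ~ sqrt(3) * yz
           c[5] - 0.5 * (c[0] + c[3]),     // d_{0}  ~ zz - (xx + yy)/2
           s3 * c[2],                      // d_{+1} ~ sqrt(3) * xz
           0.5 * s3 * (c[0] - c[3]) };     // d_{+2} ~ (sqrt(3)/2)(xx - yy)
}

// Spherical -> Cartesian via the transpose of the same matrix (the "itform"-like
// direction, used here to feed a spherical X block into a Cartesian kernel).
inline Cart6 d_sph_to_cart(const Sph5& s) {
  const double s3 = std::sqrt(3.0);
  return { -0.5 * s[2] + 0.5 * s3 * s[4],  // xx
            s3 * s[0],                     // xy
            s3 * s[3],                     // xz
           -0.5 * s[2] - 0.5 * s3 * s[4],  // yy
            s3 * s[1],                     // yz
            s[2] };                        // zz
}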
* * See LICENSE.txt for details */ @@ -33,6 +37,7 @@ namespace XGPU { void integral_2_0_task_batched( bool swap, + bool sph, size_t ntasks, size_t nsubtasks, int max_primpairs, size_t max_nsp, diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_1.cu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_1.cu index e5e84a01..153bcf7f 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_1.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_1.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -1197,7 +1201,8 @@ using namespace GauXC; } -template +template struct DeviceTask21 { static constexpr int max_primpair_shared_limit = 8; @@ -1210,7 +1215,7 @@ struct DeviceTask21 { static constexpr bool use_shared = (primpair_shared_limit > 0) && (primpair_shared_limit <= max_primpair_shared_limit); - static constexpr int num_warps = points_per_subtask / cuda::warp_size; + static constexpr int num_warps = points_per_subtask / GauXC::cuda::warp_size; // Cannot declare shared memory array with length 0 static constexpr int prim_buffer_size = (use_shared) ? num_warps * primpair_shared_limit : 1; @@ -1240,8 +1245,8 @@ struct DeviceTask21 { const double Y_AB = param.Y_AB; const double Z_AB = param.Z_AB; - const int laneId = threadIdx.x % cuda::warp_size; - const int warpId __attribute__((unused)) = threadIdx.x / cuda::warp_size; + const int laneId = threadIdx.x % GauXC::cuda::warp_size; + const int warpId __attribute__((unused)) = threadIdx.x / GauXC::cuda::warp_size; __shared__ GauXC::PrimitivePair s_prim_pairs[prim_buffer_size] __attribute__((unused)); @@ -1263,7 +1268,7 @@ struct DeviceTask21 { } for(int j = 0; j < 16; ++j) SCALAR_STORE((temp + j * blockDim.x + threadIdx.x), SCALAR_ZERO()); - const int pointIndex = i * cuda::warp_size + laneId; + const int pointIndex = i * GauXC::cuda::warp_size + laneId; if (pointIndex < npts) { const double point_x = s_task_data[pointIndex].x; @@ -1491,6 +1496,43 @@ struct DeviceTask21 { SCALAR_TYPE const_value_w; SCALAR_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5; + SCALAR_TYPE Xik_0, Xik_1, Xik_2, Xik_3, Xik_4, Xik_5; + SCALAR_TYPE Xjk_0, Xjk_1, Xjk_2; + SCALAR_TYPE Gjk_0, Gjk_1, Gjk_2; + + if constexpr (pure_bra) { + SCALAR_TYPE Xik_m2 = SCALAR_LOAD((Xik + 0*ldX)); + SCALAR_TYPE Xik_m1 = SCALAR_LOAD((Xik + 1*ldX)); + SCALAR_TYPE Xik_z0 = SCALAR_LOAD((Xik + 2*ldX)); + SCALAR_TYPE Xik_p1 = SCALAR_LOAD((Xik + 3*ldX)); + SCALAR_TYPE Xik_p2 = SCALAR_LOAD((Xik + 4*ldX)); + + ::cuda::std::tie(Xik_0, Xik_1, Xik_2, Xik_3, Xik_4, Xik_5) = + sph::itform_l2(Xik_m2, Xik_m1, Xik_z0, Xik_p1, Xik_p2); + } else { + Xik_0 = SCALAR_LOAD((Xik + 0*ldX)); + Xik_1 = SCALAR_LOAD((Xik + 1*ldX)); + Xik_2 = SCALAR_LOAD((Xik + 2*ldX)); + Xik_3 = SCALAR_LOAD((Xik + 3*ldX)); + Xik_4 = SCALAR_LOAD((Xik + 4*ldX)); + Xik_5 = SCALAR_LOAD((Xik + 5*ldX)); + } + + + if constexpr (pure_ket) { + Xjk_0 = SCALAR_LOAD((Xjk + 2*ldX)); + Xjk_1 = SCALAR_LOAD((Xjk + 0*ldX)); + Xjk_2 = SCALAR_LOAD((Xjk + 1*ldX)); + } else { + Xjk_0 = SCALAR_LOAD((Xjk + 0*ldX)); + Xjk_1 = SCALAR_LOAD((Xjk + 1*ldX)); + Xjk_2 = SCALAR_LOAD((Xjk 
+ 2*ldX)); + } + + Gjk_0 = 0; + Gjk_1 = 0; + Gjk_2 = 0; + X_ABp = 1.0; comb_m_i = 1.0; Y_ABp = 1.0; comb_n_j = 1.0; Z_ABp = 1.0; comb_p_k = 1.0; @@ -1499,53 +1541,47 @@ struct DeviceTask21 { /*** j = 0 ***/ - tx = SCALAR_LOAD((Xik + 0 * ldX)); - ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tx = Xik_0; + ty = Xjk_0; t0 = SCALAR_LOAD((temp + 6 * blockDim.x + threadIdx.x)); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_MUL(tx, t0); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 7 * blockDim.x + threadIdx.x)); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 8 * blockDim.x + threadIdx.x)); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 9 * blockDim.x + threadIdx.x)); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 10 * blockDim.x + threadIdx.x)); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 11 * blockDim.x + threadIdx.x)); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); @@ -1554,54 +1590,48 @@ struct DeviceTask21 { const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_LOAD((temp + 0 * blockDim.x + threadIdx.x)); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 1 * blockDim.x + threadIdx.x)); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 2 * blockDim.x + threadIdx.x)); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 3 * blockDim.x + threadIdx.x)); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 4 * blockDim.x + threadIdx.x)); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 5 * blockDim.x + threadIdx.x)); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); 
outBuffer[5] += tz; - atomicAdd((Gjk + 0 * ldG), tw); + Gjk_0 += tw; /*** j = 1 ***/ @@ -1610,53 +1640,47 @@ struct DeviceTask21 { Z_ABp = 1.0; comb_p_k = 1.0; const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); - ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tx = Xik_0; + ty = Xjk_1; t0 = SCALAR_LOAD((temp + 7 * blockDim.x + threadIdx.x)); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_MUL(tx, t0); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 9 * blockDim.x + threadIdx.x)); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 10 * blockDim.x + threadIdx.x)); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 12 * blockDim.x + threadIdx.x)); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 13 * blockDim.x + threadIdx.x)); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 14 * blockDim.x + threadIdx.x)); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); @@ -1664,54 +1688,48 @@ struct DeviceTask21 { const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_LOAD((temp + 0 * blockDim.x + threadIdx.x)); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 1 * blockDim.x + threadIdx.x)); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 2 * blockDim.x + threadIdx.x)); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 3 * blockDim.x + threadIdx.x)); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 4 * blockDim.x + threadIdx.x)); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 5 * blockDim.x + threadIdx.x)); t5 = SCALAR_MUL(t5, const_value_w); tz = 
SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; - atomicAdd((Gjk + 1 * ldG), tw); + Gjk_1 += tw; /*** j = 2 ***/ X_ABp = 1.0; comb_m_i = 1.0; @@ -1720,114 +1738,126 @@ struct DeviceTask21 { const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); - ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tx = Xik_0; + ty = Xjk_2; t0 = SCALAR_LOAD((temp + 8 * blockDim.x + threadIdx.x)); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_MUL(tx, t0); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 10 * blockDim.x + threadIdx.x)); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 11 * blockDim.x + threadIdx.x)); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 13 * blockDim.x + threadIdx.x)); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 14 * blockDim.x + threadIdx.x)); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 15 * blockDim.x + threadIdx.x)); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_LOAD((temp + 0 * blockDim.x + threadIdx.x)); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 1 * blockDim.x + threadIdx.x)); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 2 * blockDim.x + threadIdx.x)); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 3 * blockDim.x + threadIdx.x)); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 4 * blockDim.x + threadIdx.x)); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 5 * blockDim.x + threadIdx.x)); t5 = 
SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; - atomicAdd((Gjk + 2 * ldG), tw); - - atomicAdd((Gik + 0 * ldG), outBuffer[0]); - atomicAdd((Gik + 1 * ldG), outBuffer[1]); - atomicAdd((Gik + 2 * ldG), outBuffer[2]); - atomicAdd((Gik + 3 * ldG), outBuffer[3]); - atomicAdd((Gik + 4 * ldG), outBuffer[4]); - atomicAdd((Gik + 5 * ldG), outBuffer[5]); + //atomicAdd((Gjk + 2 * ldG), tw); + Gjk_2 += tw; + + if constexpr (pure_ket) { + atomicAdd((Gjk + 2 * ldG), Gjk_0); + atomicAdd((Gjk + 0 * ldG), Gjk_1); + atomicAdd((Gjk + 1 * ldG), Gjk_2); + } else { + atomicAdd((Gjk + 0 * ldG), Gjk_0); + atomicAdd((Gjk + 1 * ldG), Gjk_1); + atomicAdd((Gjk + 2 * ldG), Gjk_2); + } + + if constexpr (pure_bra) { + SCALAR_TYPE Gik_m2, Gik_m1, Gik_z0, Gik_p1, Gik_p2; + + ::cuda::std::tie(Gik_m2, Gik_m1, Gik_z0, Gik_p1, Gik_p2) = + sph::tform_l2(outBuffer[0], outBuffer[1], outBuffer[2], + outBuffer[3], outBuffer[4], outBuffer[5]); + atomicAdd((Gik + 0 * ldG), Gik_m2); + atomicAdd((Gik + 1 * ldG), Gik_m1); + atomicAdd((Gik + 2 * ldG), Gik_z0); + atomicAdd((Gik + 3 * ldG), Gik_p1); + atomicAdd((Gik + 4 * ldG), Gik_p2); + } else { + atomicAdd((Gik + 0 * ldG), outBuffer[0]); + atomicAdd((Gik + 1 * ldG), outBuffer[1]); + atomicAdd((Gik + 2 * ldG), outBuffer[2]); + atomicAdd((Gik + 3 * ldG), outBuffer[3]); + atomicAdd((Gik + 4 * ldG), outBuffer[4]); + atomicAdd((Gik + 5 * ldG), outBuffer[5]); + } } } } @@ -1836,15 +1866,38 @@ struct DeviceTask21 { }; template -using AM21_swap = DeviceTask21; +using AM21_swap_cart = DeviceTask21; + +template +using AM21_cart = DeviceTask21; + +template +using AM21_swap_sc = DeviceTask21; + +template +using AM21_sc = DeviceTask21; + +template +using AM21_swap_sph = DeviceTask21; template -using AM21 = DeviceTask21; +using AM21_sph = DeviceTask21; void integral_2_1_task_batched( bool swap, + bool sph_2, bool sph_1, size_t ntasks, size_t nsubtask, int max_primpair, size_t max_nsp, GauXC::XCDeviceTask* device_tasks, @@ -1865,21 +1918,55 @@ using AM21 = DeviceTask21( - nblocks, nthreads, max_primpair, stream, - ntasks, nsubtask, - device_tasks, task2sp, - (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, - sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, - boys_table ); + if(sph_2 and sph_1) + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); + else if(sph_2) + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); + else + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); } else { - dev_integral_task_map_dispatcher( - nblocks, nthreads, max_primpair, stream, - ntasks, nsubtask, - device_tasks, task2sp, - (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, - sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, - boys_table ); + if(sph_2 and sph_1) + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, 
prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); + else if(sph_2) + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); + else + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); } } } diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_1.hu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_1.hu index e501329c..e0038e32 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_1.hu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_1.hu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -39,6 +43,7 @@ namespace XGPU { void integral_2_1_task_batched( bool swap, + bool sph_2, bool sph_1, size_t ntasks, size_t nsubtasks, int max_primpairs, size_t max_nsp, diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_2.cu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_2.cu index 2216ede1..88c18b71 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_2.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_2.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -3106,7 +3110,8 @@ using namespace GauXC; } -template +template struct DeviceTask22 { static constexpr int max_primpair_shared_limit = 8; @@ -3120,7 +3125,7 @@ struct DeviceTask22 { static constexpr bool use_shared = (primpair_shared_limit > 0) && (primpair_shared_limit <= max_primpair_shared_limit); - static constexpr int num_warps = points_per_subtask / cuda::warp_size; + static constexpr int num_warps = points_per_subtask / GauXC::cuda::warp_size; // Cannot declare shared memory array with length 0 static constexpr int prim_buffer_size = (use_shared) ? 
num_warps * primpair_shared_limit : 1; @@ -3150,8 +3155,8 @@ struct DeviceTask22 { const double Y_AB = param.Y_AB; const double Z_AB = param.Z_AB; - const int laneId = threadIdx.x % cuda::warp_size; - const int warpId __attribute__((unused)) = threadIdx.x / cuda::warp_size; + const int laneId = threadIdx.x % GauXC::cuda::warp_size; + const int warpId __attribute__((unused)) = threadIdx.x / GauXC::cuda::warp_size; __shared__ GauXC::PrimitivePair s_prim_pairs[prim_buffer_size] __attribute__((unused)); @@ -3173,7 +3178,7 @@ struct DeviceTask22 { for(int j = 0; j < 31; ++j) SCALAR_STORE((temp + j), SCALAR_ZERO()); - const int pointIndex = i * cuda::warp_size + laneId; + const int pointIndex = i * GauXC::cuda::warp_size + laneId; if (pointIndex < npts) { const double point_x = s_task_data[pointIndex].x; @@ -3592,58 +3597,99 @@ struct DeviceTask22 { SCALAR_TYPE const_value_w; SCALAR_TYPE tx, ty, tz, tw, t0, t1, t2, t3, t4, t5; + SCALAR_TYPE Xik_0, Xik_1, Xik_2, Xik_3, Xik_4, Xik_5; + SCALAR_TYPE Xjk_0, Xjk_1, Xjk_2, Xjk_3, Xjk_4, Xjk_5; + SCALAR_TYPE Gjk_0, Gjk_1, Gjk_2, Gjk_3, Gjk_4, Gjk_5; + + if constexpr (pure_bra) { + SCALAR_TYPE Xik_m2 = SCALAR_LOAD((Xik + 0*ldX)); + SCALAR_TYPE Xik_m1 = SCALAR_LOAD((Xik + 1*ldX)); + SCALAR_TYPE Xik_z0 = SCALAR_LOAD((Xik + 2*ldX)); + SCALAR_TYPE Xik_p1 = SCALAR_LOAD((Xik + 3*ldX)); + SCALAR_TYPE Xik_p2 = SCALAR_LOAD((Xik + 4*ldX)); + + ::cuda::std::tie(Xik_0, Xik_1, Xik_2, Xik_3, Xik_4, Xik_5) = + sph::itform_l2(Xik_m2, Xik_m1, Xik_z0, Xik_p1, Xik_p2); + } else { + Xik_0 = SCALAR_LOAD((Xik + 0*ldX)); + Xik_1 = SCALAR_LOAD((Xik + 1*ldX)); + Xik_2 = SCALAR_LOAD((Xik + 2*ldX)); + Xik_3 = SCALAR_LOAD((Xik + 3*ldX)); + Xik_4 = SCALAR_LOAD((Xik + 4*ldX)); + Xik_5 = SCALAR_LOAD((Xik + 5*ldX)); + } + + if constexpr (pure_ket) { + SCALAR_TYPE Xjk_m2 = SCALAR_LOAD((Xjk + 0*ldX)); + SCALAR_TYPE Xjk_m1 = SCALAR_LOAD((Xjk + 1*ldX)); + SCALAR_TYPE Xjk_z0 = SCALAR_LOAD((Xjk + 2*ldX)); + SCALAR_TYPE Xjk_p1 = SCALAR_LOAD((Xjk + 3*ldX)); + SCALAR_TYPE Xjk_p2 = SCALAR_LOAD((Xjk + 4*ldX)); + + ::cuda::std::tie(Xjk_0, Xjk_1, Xjk_2, Xjk_3, Xjk_4, Xjk_5) = + sph::itform_l2(Xjk_m2, Xjk_m1, Xjk_z0, Xjk_p1, Xjk_p2); + } else { + Xjk_0 = SCALAR_LOAD((Xjk + 0*ldX)); + Xjk_1 = SCALAR_LOAD((Xjk + 1*ldX)); + Xjk_2 = SCALAR_LOAD((Xjk + 2*ldX)); + Xjk_3 = SCALAR_LOAD((Xjk + 3*ldX)); + Xjk_4 = SCALAR_LOAD((Xjk + 4*ldX)); + Xjk_5 = SCALAR_LOAD((Xjk + 5*ldX)); + } + + Gjk_0 = 0; + Gjk_1 = 0; + Gjk_2 = 0; + Gjk_3 = 0; + Gjk_4 = 0; + Gjk_5 = 0; + X_ABp = 1.0; comb_m_i = 1.0; Y_ABp = 1.0; comb_n_j = 1.0; Z_ABp = 1.0; comb_p_k = 1.0; const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); - ty = SCALAR_LOAD((Xjk + 0 * ldX)); + tx = Xik_0; + ty = Xjk_0; t0 = SCALAR_LOAD((temp + 16 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_MUL(tx, t0); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 17 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 18 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 19 )); t3 = SCALAR_MUL(t3, const_value_w); 
tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 20 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 21 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 2, SCALAR_RECIPROCAL(1)); @@ -3651,52 +3697,46 @@ struct DeviceTask22 { Z_ABp = 1.0; comb_p_k = 1.0; const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_LOAD((temp + 6 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 7 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 8 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 9 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 10 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 11 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(2)); @@ -3704,54 +3744,49 @@ struct DeviceTask22 { Z_ABp = 1.0; comb_p_k = 1.0; const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_LOAD((temp + 0 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 1 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 2 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 3 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 4 )); t4 = SCALAR_MUL(t4, const_value_w); 
tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 5 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; - if constexpr (!diag) atomicAdd((Gjk + 0 * ldG), tw); + //if constexpr (!diag) atomicAdd((Gjk + 0 * ldG), tw); + if constexpr (!diag) Gjk_0 += tw; @@ -3760,105 +3795,93 @@ struct DeviceTask22 { Z_ABp = 1.0; comb_p_k = 1.0; const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); - ty = SCALAR_LOAD((Xjk + 1 * ldX)); + tx = Xik_0; + ty = Xjk_1; t0 = SCALAR_LOAD((temp + 17 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_MUL(tx, t0); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 19 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 20 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 22 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 23 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 24 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); Z_ABp = 1.0; comb_p_k = 1.0; const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_LOAD((temp + 6 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 7 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 8 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 9 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 10 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = 
SCALAR_LOAD((temp + 11 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); @@ -3866,106 +3889,95 @@ struct DeviceTask22 { Z_ABp = 1.0; comb_p_k = 1.0; const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_LOAD((temp + 7 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 9 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 10 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 12 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 13 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 14 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); Z_ABp = 1.0; comb_p_k = 1.0; const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_LOAD((temp + 0 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 1 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 2 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 3 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 4 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 5 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; - if constexpr (!diag) atomicAdd((Gjk + 1 * ldG), tw); + //if constexpr (!diag) atomicAdd((Gjk + 1 * 
ldG), tw); + if constexpr (!diag) Gjk_1 += tw; @@ -3975,104 +3987,92 @@ struct DeviceTask22 { Z_ABp = 1.0; comb_p_k = 1.0; const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); - ty = SCALAR_LOAD((Xjk + 2 * ldX)); + tx = Xik_0; + ty = Xjk_2; t0 = SCALAR_LOAD((temp + 18 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_MUL(tx, t0); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 20 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 21 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 23 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 24 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 25 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_LOAD((temp + 6 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 7 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 8 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 9 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 10 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 11 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; X_ABp = SCALAR_MUL(X_ABp, X_AB); comb_m_i = SCALAR_MUL(comb_m_i * 1, SCALAR_RECIPROCAL(1)); @@ -4080,105 +4080,94 @@ struct DeviceTask22 { Z_ABp = 1.0; comb_p_k = 1.0; const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - 
tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_LOAD((temp + 8 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 10 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 11 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 13 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 14 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 15 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_LOAD((temp + 0 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 1 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 2 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 3 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 4 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 5 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; - if constexpr (!diag) atomicAdd((Gjk + 2 * ldG), tw); + //if constexpr (!diag) atomicAdd((Gjk + 2 * ldG), tw); + if constexpr (!diag) Gjk_2 += tw; @@ -4189,159 +4178,142 @@ struct DeviceTask22 { Z_ABp = 1.0; comb_p_k = 1.0; const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); - ty = SCALAR_LOAD((Xjk + 3 * ldX)); + tx = Xik_0; + ty = Xjk_3; t0 = SCALAR_LOAD((temp + 19 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_MUL(tx, t0); - //atomicAdd((Gik + 0 * 
ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 22 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 23 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 26 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 27 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 28 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 2, SCALAR_RECIPROCAL(1)); Z_ABp = 1.0; comb_p_k = 1.0; const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_LOAD((temp + 7 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 9 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 10 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 12 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 13 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 14 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(2)); Z_ABp = 1.0; comb_p_k = 1.0; const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_LOAD((temp + 0 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 1 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = 
SCALAR_LOAD((temp + 2 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 3 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 4 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 5 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; - if constexpr (!diag) atomicAdd((Gjk + 3 * ldG), tw); + //if constexpr (!diag) atomicAdd((Gjk + 3 * ldG), tw); + if constexpr (!diag) Gjk_3 += tw; @@ -4350,209 +4322,186 @@ struct DeviceTask22 { Z_ABp = 1.0; comb_p_k = 1.0; const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); - ty = SCALAR_LOAD((Xjk + 4 * ldX)); + tx = Xik_0; + ty = Xjk_4; t0 = SCALAR_LOAD((temp + 20 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_MUL(tx, t0); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 23 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 24 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 27 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 28 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 29 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_LOAD((temp + 7 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 9 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 10 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik 
+ 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 12 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 13 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 14 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; Y_ABp = SCALAR_MUL(Y_ABp, Y_AB); comb_n_j = SCALAR_MUL(comb_n_j * 1, SCALAR_RECIPROCAL(1)); Z_ABp = 1.0; comb_p_k = 1.0; const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_LOAD((temp + 8 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 10 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 11 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 13 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 14 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 15 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(1)); const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_LOAD((temp + 0 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 1 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 2 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 3 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 4 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, 
t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 5 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; - if constexpr (!diag) atomicAdd((Gjk + 4 * ldG), tw); + //if constexpr (!diag) atomicAdd((Gjk + 4 * ldG), tw); + if constexpr (!diag) Gjk_4 += tw; @@ -4562,164 +4511,181 @@ struct DeviceTask22 { Z_ABp = 1.0; comb_p_k = 1.0; const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); - ty = SCALAR_LOAD((Xjk + 5 * ldX)); + tx = Xik_0; + ty = Xjk_5; t0 = SCALAR_LOAD((temp + 21 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_MUL(tx, t0); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 24 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 25 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 28 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 29 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 30 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 2, SCALAR_RECIPROCAL(1)); const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_LOAD((temp + 8 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 10 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 11 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 13 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 14 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 15 )); t5 = SCALAR_MUL(t5, 
const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; Z_ABp = SCALAR_MUL(Z_ABp, Z_AB); comb_p_k = SCALAR_MUL(comb_p_k * 1, SCALAR_RECIPROCAL(2)); const_value = comb_m_i * comb_n_j * comb_p_k * X_ABp * Y_ABp * Z_ABp; const_value_w = SCALAR_MUL(const_value_v, const_value); - tx = SCALAR_LOAD((Xik + 0 * ldX)); + tx = Xik_0; t0 = SCALAR_LOAD((temp + 0 )); t0 = SCALAR_MUL(t0, const_value_w); tz = SCALAR_MUL(ty, t0); tw = SCALAR_FMA(tx, t0, tw); - //atomicAdd((Gik + 0 * ldG), tz); outBuffer[0] += tz; - tx = SCALAR_LOAD((Xik + 1 * ldX)); + tx = Xik_1; t1 = SCALAR_LOAD((temp + 1 )); t1 = SCALAR_MUL(t1, const_value_w); tz = SCALAR_MUL(ty, t1); tw = SCALAR_FMA(tx, t1, tw); - //atomicAdd((Gik + 1 * ldG), tz); outBuffer[1] += tz; - tx = SCALAR_LOAD((Xik + 2 * ldX)); + tx = Xik_2; t2 = SCALAR_LOAD((temp + 2 )); t2 = SCALAR_MUL(t2, const_value_w); tz = SCALAR_MUL(ty, t2); tw = SCALAR_FMA(tx, t2, tw); - //atomicAdd((Gik + 2 * ldG), tz); outBuffer[2] += tz; - tx = SCALAR_LOAD((Xik + 3 * ldX)); + tx = Xik_3; t3 = SCALAR_LOAD((temp + 3 )); t3 = SCALAR_MUL(t3, const_value_w); tz = SCALAR_MUL(ty, t3); tw = SCALAR_FMA(tx, t3, tw); - //atomicAdd((Gik + 3 * ldG), tz); outBuffer[3] += tz; - tx = SCALAR_LOAD((Xik + 4 * ldX)); + tx = Xik_4; t4 = SCALAR_LOAD((temp + 4 )); t4 = SCALAR_MUL(t4, const_value_w); tz = SCALAR_MUL(ty, t4); tw = SCALAR_FMA(tx, t4, tw); - //atomicAdd((Gik + 4 * ldG), tz); outBuffer[4] += tz; - tx = SCALAR_LOAD((Xik + 5 * ldX)); + tx = Xik_5; t5 = SCALAR_LOAD((temp + 5 )); t5 = SCALAR_MUL(t5, const_value_w); tz = SCALAR_MUL(ty, t5); tw = SCALAR_FMA(tx, t5, tw); - //atomicAdd((Gik + 5 * ldG), tz); outBuffer[5] += tz; - if constexpr (!diag) atomicAdd((Gjk + 5 * ldG), tw); - - atomicAdd((Gik + 0 * ldG), outBuffer[0]); - atomicAdd((Gik + 1 * ldG), outBuffer[1]); - atomicAdd((Gik + 2 * ldG), outBuffer[2]); - atomicAdd((Gik + 3 * ldG), outBuffer[3]); - atomicAdd((Gik + 4 * ldG), outBuffer[4]); - atomicAdd((Gik + 5 * ldG), outBuffer[5]); + //if constexpr (!diag) atomicAdd((Gjk + 5 * ldG), tw); + if constexpr (!diag) Gjk_5 += tw; + + if constexpr (!diag) { + if constexpr (pure_ket) { + SCALAR_TYPE Gjk_m2, Gjk_m1, Gjk_z0, Gjk_p1, Gjk_p2; + + ::cuda::std::tie(Gjk_m2, Gjk_m1, Gjk_z0, Gjk_p1, Gjk_p2) = + sph::tform_l2(Gjk_0, Gjk_1, Gjk_2, Gjk_3, Gjk_4, Gjk_5); + atomicAdd((Gjk + 0 * ldG), Gjk_m2); + atomicAdd((Gjk + 1 * ldG), Gjk_m1); + atomicAdd((Gjk + 2 * ldG), Gjk_z0); + atomicAdd((Gjk + 3 * ldG), Gjk_p1); + atomicAdd((Gjk + 4 * ldG), Gjk_p2); + } else { + atomicAdd((Gjk + 0 * ldG), Gjk_0); + atomicAdd((Gjk + 1 * ldG), Gjk_1); + atomicAdd((Gjk + 2 * ldG), Gjk_2); + atomicAdd((Gjk + 3 * ldG), Gjk_3); + atomicAdd((Gjk + 4 * ldG), Gjk_4); + atomicAdd((Gjk + 5 * ldG), Gjk_5); + } + } + + if constexpr (pure_bra) { + SCALAR_TYPE Gik_m2, Gik_m1, Gik_z0, Gik_p1, Gik_p2; + + ::cuda::std::tie(Gik_m2, Gik_m1, Gik_z0, Gik_p1, Gik_p2) = + sph::tform_l2(outBuffer[0], outBuffer[1], outBuffer[2], + outBuffer[3], outBuffer[4], outBuffer[5]); + atomicAdd((Gik + 0 * ldG), Gik_m2); + atomicAdd((Gik + 1 * ldG), Gik_m1); + atomicAdd((Gik + 2 * ldG), Gik_z0); + atomicAdd((Gik + 3 * ldG), Gik_p1); + atomicAdd((Gik + 4 * ldG), Gik_p2); + } else { + atomicAdd((Gik + 0 * ldG), outBuffer[0]); + atomicAdd((Gik + 1 * ldG), outBuffer[1]); + atomicAdd((Gik + 2 * ldG), outBuffer[2]); + atomicAdd((Gik + 3 * ldG), outBuffer[3]); + atomicAdd((Gik + 4 * ldG), outBuffer[4]); + atomicAdd((Gik + 5 * ldG), outBuffer[5]); + } } } } @@ -4728,14 +4694,27 @@ struct 
DeviceTask22 { }; template -using AM22 = DeviceTask22; +using AM22_cart = DeviceTask22; + +template +using AM2_cart = DeviceTask22; + +template +using AM22_sph = DeviceTask22; template -using AM2 = DeviceTask22; +using AM2_sph = DeviceTask22; void integral_2_2_task_batched( + bool sph, size_t ntasks, size_t nsubtask, int max_primpair, size_t max_nsp, GauXC::XCDeviceTask* device_tasks, @@ -4755,16 +4734,26 @@ using AM2 = DeviceTask22( - nblocks, nthreads, max_primpair, stream, - ntasks, nsubtask, - device_tasks, task2sp, - (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, - sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, - boys_table ); + if(sph) + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); + else + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); } void integral_2_task_batched( + bool sph, size_t ntasks, size_t nsubtask, int max_primpair, size_t max_nsp, GauXC::XCDeviceTask* device_tasks, @@ -4784,13 +4773,22 @@ using AM2 = DeviceTask22( - nblocks, nthreads, max_primpair, stream, - ntasks, nsubtask, - device_tasks, task2sp, - (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, - sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, - boys_table ); + if(sph) + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); + else + dev_integral_task_map_dispatcher( + nblocks, nthreads, max_primpair, stream, + ntasks, nsubtask, + device_tasks, task2sp, + (int4*) subtasks, nprim_pairs_device, prim_pair_ptr_device, + sp_X_AB_device, sp_Y_AB_device, sp_Z_AB_device, + boys_table ); } } diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_2.hu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_2.hu index 4c0c42a6..12fe23e3 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_2.hu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/integral_2_2.hu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -38,6 +42,7 @@ namespace XGPU { cudaStream_t stream); void integral_2_2_task_batched( + bool sph, size_t ntasks, size_t nsubtasks, int max_primpairs, size_t max_nsp, @@ -53,6 +58,7 @@ namespace XGPU { cudaStream_t stream); void integral_2_task_batched( + bool sph, size_t ntasks, size_t nsubtask, int max_primpairs, size_t max_nsp, GauXC::XCDeviceTask* device_tasks, diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/obara_saika_integrals.cu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/obara_saika_integrals.cu index dc975f55..267c195a 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/obara_saika_integrals.cu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/obara_saika_integrals.cu @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/task_map_base.hu b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/task_map_base.hu index ff88fb9f..df85fa52 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/task_map_base.hu +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/src/task_map_base.hu @@ -1,8 +1,47 @@ #include "device_specific/cuda_device_constants.hpp" #include "../../cuda_aos_scheme1.hpp" +#include +#include namespace XGPU { +namespace constants { + constexpr double sqrt_3 = 1.7320508075688772; +} + +namespace sph { + +__inline__ __device__ auto tform_l2( + double xx, double xy, double xz, double yy, double yz, double zz +) { + + double m2 = constants::sqrt_3 * xy; + double m1 = constants::sqrt_3 * yz; + double z0 = zz - 0.5 * (xx + yy); + double p1 = constants::sqrt_3 * xz; + double p2 = constants::sqrt_3 * 0.5 * (xx - yy); + + return cuda::std::make_tuple(m2, m1, z0, p1, p2); + +} + +__inline__ __device__ auto itform_l2( + double m2, double m1, double z0, double p1, double p2 +) { + + double xx = 0.5 * (-z0 + constants::sqrt_3 * p2); + double xy = constants::sqrt_3 * m2; + double xz = constants::sqrt_3 * p1; + double yy = -0.5 * (z0 + constants::sqrt_3 * p2); + double yz = constants::sqrt_3 * m1; + double zz = z0; + + return cuda::std::make_tuple(xx,xy,xz,yy,yz,zz); + +} + +} + using namespace GauXC; @@ -15,7 +54,7 @@ __inline__ __device__ void load_primpair_shared( int32_t* dst = (int32_t*) dst_t; const int num_transfers = n * sizeof(GauXC::PrimitivePair) / sizeof(int32_t); - for (int i = laneId; i < num_transfers; i += cuda::warp_size) { + for (int i = laneId; i < num_transfers; i += GauXC::cuda::warp_size) { dst[i] = src[i]; } } @@ -110,7 +149,7 @@ void task_map_kernel( __shared__ double4 s_task_data[points_per_subtask]; - const int warpId = threadIdx.x / cuda::warp_size; + const int warpId = threadIdx.x / GauXC::cuda::warp_size; const int i_subtask = blockIdx.x; const int i_task = subtasks[i_subtask].x; diff --git a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/test/CMakeLists.txt b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/test/CMakeLists.txt index bd78bb84..93b1b589 100644 --- 
a/src/xc_integrator/local_work_driver/device/cuda/obara_saika/test/CMakeLists.txt +++ b/src/xc_integrator/local_work_driver/device/cuda/obara_saika/test/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_base.cxx b/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_base.cxx index 3c11a5fe..a51e6bcd 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_base.cxx +++ b/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_base.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -16,12 +20,10 @@ namespace GauXC { -void AoSScheme1CUTLASSBase::eval_xmat(double fac, XCDeviceData* _data, bool do_grad, density_id den_id ){ - - if( do_grad ) GAUXC_GENERIC_EXCEPTION("CUTLASS + X Gradient NYI"); - if( den_id != DEN_S ) GAUXC_GENERIC_EXCEPTION("CUTLASS + U/GKS NYI"); - - auto* data = dynamic_cast(_data); +// Common implementation for eval_xmat and eval_xmat_trial +template +void AoSScheme1CUTLASSBase::eval_xmat_impl(double fac, XCDeviceData* _data, bool do_grad, density_id den_id) { + auto* data = dynamic_cast(_data); if( !data ) GAUXC_BAD_LWD_DATA_CAST(); if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); @@ -34,33 +36,91 @@ void AoSScheme1CUTLASSBase::eval_xmat(double fac, XCDeviceData* _data, bool do_g const auto submat_block_size = data->get_submat_chunk_size( nbf, 0 ); auto static_stack = data->static_stack; auto aos_stack = data->aos_stack; - sym_pack_submat( ntasks, aos_stack.device_tasks, static_stack.dmat_s_device, + + double* dmat_ptr; + if constexpr (is_trial) { + dmat_ptr = static_stack.tden_selector(den_id); + // now screened trial density matrix is stored in aos_stack.device_tasks[itask].nbe_scr + } else { + dmat_ptr = static_stack.den_selector(den_id); + } + + sym_pack_submat( ntasks, aos_stack.device_tasks, dmat_ptr, nbf, submat_block_size, data->device_backend_->queue() ); auto cutlass_stack = data->cutlass_stack; + double** dmat_array; + if constexpr (is_trial) { + dmat_array = cutlass_stack.tdmat_array(den_id); + } else { + dmat_array = cutlass_stack.dmat_array(den_id); + } cutlass_gemm( cutlass_stack.problem_sizes_device, data->problem_sizes_host.data(), ntasks, - cutlass_stack.bf_array_device, cutlass_stack.dmat_array_device, + cutlass_stack.bf_array_device, dmat_array, cutlass_stack.zmat_array_device, cutlass_stack.zmat_array_device, cutlass_stack.ld64_bf_array_device, cutlass_stack.ld64_dmat_array_device, cutlass_stack.ld64_zmat_array_device, cutlass_stack.ld64_zmat_array_device, fac, 0.0, data->device_backend_->queue() ); + + if(do_grad) { + cutlass_gemm( + cutlass_stack.problem_sizes_device, + data->problem_sizes_host.data(), + ntasks, + cutlass_stack.bfx_array_device, dmat_array, + 
cutlass_stack.xmat_x_array_device, cutlass_stack.xmat_x_array_device, + cutlass_stack.ld64_bf_array_device, cutlass_stack.ld64_dmat_array_device, + cutlass_stack.ld64_zmat_array_device, cutlass_stack.ld64_zmat_array_device, + fac, 0.0, + data->device_backend_->queue() + ); + cutlass_gemm( + cutlass_stack.problem_sizes_device, + data->problem_sizes_host.data(), + ntasks, + cutlass_stack.bfy_array_device, dmat_array, + cutlass_stack.xmat_y_array_device, cutlass_stack.xmat_y_array_device, + cutlass_stack.ld64_bf_array_device, cutlass_stack.ld64_dmat_array_device, + cutlass_stack.ld64_zmat_array_device, cutlass_stack.ld64_zmat_array_device, + fac, 0.0, + data->device_backend_->queue() + ); + cutlass_gemm( + cutlass_stack.problem_sizes_device, + data->problem_sizes_host.data(), + ntasks, + cutlass_stack.bfz_array_device, dmat_array, + cutlass_stack.xmat_z_array_device, cutlass_stack.xmat_z_array_device, + cutlass_stack.ld64_bf_array_device, cutlass_stack.ld64_dmat_array_device, + cutlass_stack.ld64_zmat_array_device, cutlass_stack.ld64_zmat_array_device, + fac, 0.0, + data->device_backend_->queue() + ); + } +} + +void AoSScheme1CUTLASSBase::eval_xmat(double fac, XCDeviceData* _data, bool do_grad, density_id den_id ) { + eval_xmat_impl(fac, _data, do_grad, den_id); } -void AoSScheme1CUTLASSBase::inc_vxc( XCDeviceData* _data, density_id den_id, bool do_m){ +void AoSScheme1CUTLASSBase::eval_xmat_trial(double fac, XCDeviceData* _data, bool do_grad, density_id den_id ) { + eval_xmat_impl(fac, _data, do_grad, den_id); +} - auto* data = dynamic_cast(_data); + +// Common implementation for inc_vxc and inc_fxc +template +void AoSScheme1CUTLASSBase::inc_potential_impl(XCDeviceData* _data, density_id den_id, bool do_m) { + auto* data = dynamic_cast(_data); if( !data ) GAUXC_BAD_LWD_DATA_CAST(); if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); - if(do_m) GAUXC_GENERIC_EXCEPTION("CUTLASS + MGGA NYI"); - if( den_id != DEN_S ) GAUXC_GENERIC_EXCEPTION("CUTLASS + U/GKS NYI"); - auto& tasks = data->host_device_tasks; const auto ntasks = tasks.size(); @@ -76,15 +136,66 @@ void AoSScheme1CUTLASSBase::inc_vxc( XCDeviceData* _data, density_id den_id, bo 1.0, 0.0, data->device_backend_->queue() ); - - // Increment global VXC + if(do_m) { + cutlass_syr2k( + cutlass_stack.syr2k_sizes_device, + data->syr2k_sizes_host.data(), + ntasks, + cutlass_stack.bfx_array_device, cutlass_stack.xmat_x_array_device, + cutlass_stack.vmat_array_device, cutlass_stack.vmat_array_device, + cutlass_stack.ld64_bf_array_device, cutlass_stack.ld64_zmat_array_device, + cutlass_stack.ld64_vmat_array_device, cutlass_stack.ld64_vmat_array_device, + 1.0, 1.0, + data->device_backend_->queue() + ); + cutlass_syr2k( + cutlass_stack.syr2k_sizes_device, + data->syr2k_sizes_host.data(), + ntasks, + cutlass_stack.bfy_array_device, cutlass_stack.xmat_y_array_device, + cutlass_stack.vmat_array_device, cutlass_stack.vmat_array_device, + cutlass_stack.ld64_bf_array_device, cutlass_stack.ld64_zmat_array_device, + cutlass_stack.ld64_vmat_array_device, cutlass_stack.ld64_vmat_array_device, + 1.0, 1.0, + data->device_backend_->queue() + ); + cutlass_syr2k( + cutlass_stack.syr2k_sizes_device, + data->syr2k_sizes_host.data(), + ntasks, + cutlass_stack.bfz_array_device, cutlass_stack.xmat_z_array_device, + cutlass_stack.vmat_array_device, cutlass_stack.vmat_array_device, + cutlass_stack.ld64_bf_array_device, cutlass_stack.ld64_zmat_array_device, + cutlass_stack.ld64_vmat_array_device, cutlass_stack.ld64_vmat_array_device, + 1.0, 1.0, + 
data->device_backend_->queue() + ); + } + + // Increment global VXC/FXC const auto nbf = data->global_dims.nbf; const auto submat_block_size = data->get_submat_chunk_size( nbf, 0 ); auto static_stack = data->static_stack; auto aos_stack = data->aos_stack; - sym_task_inc_potential( ntasks, aos_stack.device_tasks, - static_stack.vxc_s_device, nbf, submat_block_size, - data->device_backend_->queue() ); + + double* potential_ptr; + if constexpr (is_fxc) { + potential_ptr = static_stack.fxc_selector(den_id); + // cutlass_stack.vmat_array_device points to aos_stack.device_tasks[itask].nbe_scr + } else { + potential_ptr = static_stack.vxc_selector(den_id); + } + + sym_task_inc_potential( ntasks, aos_stack.device_tasks, potential_ptr, nbf, + submat_block_size, data->device_backend_->queue() ); +} + +void AoSScheme1CUTLASSBase::inc_vxc( XCDeviceData* _data, density_id den_id, bool do_m ) { + inc_potential_impl(_data, den_id, do_m); +} + +void AoSScheme1CUTLASSBase::inc_fxc( XCDeviceData* _data, density_id den_id, bool do_m ) { + inc_potential_impl(_data, den_id, do_m); } } diff --git a/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_base.hpp b/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_base.hpp index 0f3ec69e..80b99116 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_base.hpp +++ b/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_base.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
 *
 * See LICENSE.txt for details
 */
@@ -17,8 +21,16 @@ namespace GauXC {
 
 struct AoSScheme1CUTLASSBase : public AoSScheme1Base {
 
+  template <bool is_trial>
+  void eval_xmat_impl(double fac, XCDeviceData*, bool do_grad, density_id );
+  template <bool is_fxc>
+  void inc_potential_impl(XCDeviceData*, density_id, bool do_m);
+
+
   void eval_xmat(double fac, XCDeviceData*, bool do_grad, density_id ) override final;
+  void eval_xmat_trial(double fac, XCDeviceData*, bool do_grad, density_id ) override final;
   void inc_vxc( XCDeviceData*, density_id, bool ) override final;
+  void inc_fxc( XCDeviceData*, density_id, bool ) override final;
 
   struct Data;
 
@@ -32,11 +44,44 @@ struct AoSScheme1CUTLASSBase::Data : public AoSScheme1Base::Data {
   using base_type::device_buffer_t;
 
   struct cutlass_data {
-    double** dmat_array_device = nullptr;
+    double** dmat_s_array_device = nullptr;
+    double** dmat_z_array_device = nullptr;
+    double** dmat_y_array_device = nullptr;
+    double** dmat_x_array_device = nullptr;
     double** vmat_array_device = nullptr;
     double** zmat_array_device = nullptr;
     double** bf_array_device = nullptr;
-
+    double** bfx_array_device = nullptr;
+    double** bfy_array_device = nullptr;
+    double** bfz_array_device = nullptr;
+    double** xmat_x_array_device = nullptr;
+    double** xmat_y_array_device = nullptr;
+    double** xmat_z_array_device = nullptr;
+
+    double** tdmat_s_array_device = nullptr;
+    double** tdmat_z_array_device = nullptr;
+    double** tdmat_y_array_device = nullptr;
+    double** tdmat_x_array_device = nullptr;
+
+    inline double** dmat_array(density_id id) {
+      switch(id) {
+        case DEN_S: return dmat_s_array_device;
+        case DEN_Z: return dmat_z_array_device;
+        case DEN_Y: return dmat_y_array_device;
+        case DEN_X: return dmat_x_array_device;
+        default: GAUXC_GENERIC_EXCEPTION("dmat_array: density_id not recognized");
+      }
+    }
+
+    inline double** tdmat_array(density_id id) {
+      switch(id) {
+        case DEN_S: return tdmat_s_array_device;
+        case DEN_Z: return tdmat_z_array_device;
+        case DEN_Y: return tdmat_y_array_device;
+        case DEN_X: return tdmat_x_array_device;
+        default: GAUXC_GENERIC_EXCEPTION("tdmat_array: density_id not recognized");
+      }
+    }
 
     cutlass::gemm::GemmCoord* problem_sizes_device = nullptr;
     cutlass::gemm::GemmCoord* syr2k_sizes_device = nullptr;
diff --git a/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_data_base.cxx b/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_data_base.cxx
index 6bf35c75..3e5ee555 100644
--- a/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_data_base.cxx
+++ b/src/xc_integrator/local_work_driver/device/cuda/scheme1_cutlass_data_base.cxx
@@ -1,7 +1,11 @@
 /**
  * GauXC Copyright (c) 2020-2024, The Regents of the University of California,
  * through Lawrence Berkeley National Laboratory (subject to receipt of
- * any required approvals from the U.S. Dept. of Energy). All rights reserved.
+ * any required approvals from the U.S. Dept. of Energy).
+ *
+ * (c) 2024-2025, Microsoft Corporation
+ *
+ * All rights reserved.
* * See LICENSE.txt for details */ @@ -28,6 +32,8 @@ size_t AoSScheme1CUTLASSBase::Data::get_static_mem_requirement() { size_t AoSScheme1CUTLASSBase::Data::get_mem_req( integrator_term_tracker terms, const host_task_type& task ) { + auto is_uks = terms.ks_scheme == UKS; + auto is_gks = terms.ks_scheme == GKS; size_t base_size = base_type::get_mem_req(terms, task); @@ -35,10 +41,30 @@ size_t AoSScheme1CUTLASSBase::Data::get_mem_req( integrator_term_tracker terms, required_term_storage reqt(terms); if( reqt.task_nbe_scr ) { base_size += - 4*sizeof(double*) + // batch device pointers + 4*sizeof(double*) + // batch device pointers (containg trial ones) 4*sizeof(int64_t) + 2*sizeof(cutlass::gemm::GemmCoord); // Dimensions + leading dimensions // (extra handled by get_static_mem_requirement) + if(reqt.task_xmat_grad) { + base_size += 6 * sizeof(double*); + } + + if(is_uks or is_gks) { + base_size += sizeof(double*); // z dmat + } + if(is_gks) { + base_size += 2*sizeof(double*); // x/y dmat + } + + if(terms.fxc_contraction) { + base_size += sizeof(double*); // s tdmat + if(is_uks or is_gks) + base_size += sizeof(double*); // z tdmat + if(is_gks) { + base_size += 2*sizeof(double*); // x/y tdmat + } + } + } return base_size; @@ -57,23 +83,53 @@ AoSScheme1CUTLASSBase::Data::device_buffer_t required_term_storage reqt(terms); if( not reqt.task_nbe_scr ) return buf; + auto is_uks = terms.ks_scheme == UKS; + auto is_gks = terms.ks_scheme == GKS; + // Allocate additional device memory auto [ ptr, sz ] = buf; buffer_adaptor mem( ptr, sz ); const auto ntask = std::distance( task_begin, task_end ); - cutlass_stack.dmat_array_device = mem.aligned_alloc( ntask, csl ); - cutlass_stack.vmat_array_device = mem.aligned_alloc( ntask, csl ); - cutlass_stack.zmat_array_device = mem.aligned_alloc( ntask, csl ); - cutlass_stack.bf_array_device = mem.aligned_alloc( ntask, csl ); + cutlass_stack.dmat_s_array_device = mem.aligned_alloc( ntask, csl ); + cutlass_stack.vmat_array_device = mem.aligned_alloc( ntask, csl ); + cutlass_stack.zmat_array_device = mem.aligned_alloc( ntask, csl ); + cutlass_stack.bf_array_device = mem.aligned_alloc( ntask, csl ); + if(reqt.task_xmat_grad) { + cutlass_stack.bfx_array_device = mem.aligned_alloc( ntask, csl ); + cutlass_stack.bfy_array_device = mem.aligned_alloc( ntask, csl ); + cutlass_stack.bfz_array_device = mem.aligned_alloc( ntask, csl ); + cutlass_stack.xmat_x_array_device = mem.aligned_alloc( ntask, csl ); + cutlass_stack.xmat_y_array_device = mem.aligned_alloc( ntask, csl ); + cutlass_stack.xmat_z_array_device = mem.aligned_alloc( ntask, csl ); + } + + if(is_uks or is_gks) { + cutlass_stack.dmat_z_array_device = mem.aligned_alloc( ntask, csl ); + } + + if(is_gks) { + cutlass_stack.dmat_y_array_device = mem.aligned_alloc( ntask, csl ); + cutlass_stack.dmat_x_array_device = mem.aligned_alloc( ntask, csl ); + } + + if(terms.fxc_contraction) { + cutlass_stack.tdmat_s_array_device = mem.aligned_alloc( ntask, csl ); + if(is_uks or is_gks) + cutlass_stack.tdmat_z_array_device = mem.aligned_alloc( ntask, csl ); + if(is_gks){ + cutlass_stack.tdmat_y_array_device = mem.aligned_alloc( ntask, csl ); + cutlass_stack.tdmat_x_array_device = mem.aligned_alloc( ntask, csl ); + } + } cutlass_stack.ld64_dmat_array_device = mem.aligned_alloc( ntask + 1, csl ); cutlass_stack.ld64_zmat_array_device = mem.aligned_alloc( ntask + 1, csl ); cutlass_stack.ld64_vmat_array_device = mem.aligned_alloc( ntask + 1, csl ); cutlass_stack.ld64_bf_array_device = mem.aligned_alloc( ntask + 1, csl ); - 
cutlass_stack.problem_sizes_device = mem.aligned_alloc( ntask + 1, csl ); - cutlass_stack.syr2k_sizes_device = mem.aligned_alloc( ntask + 1, csl ); + cutlass_stack.problem_sizes_device = mem.aligned_alloc( ntask + 1, csl ); + cutlass_stack.syr2k_sizes_device = mem.aligned_alloc( ntask + 1, csl ); // Update dynmem data for derived impls return device_buffer_t{ mem.stack(), mem.nleft() }; @@ -88,15 +144,17 @@ void AoSScheme1CUTLASSBase::Data::pack_and_send( required_term_storage reqt(terms); if( not reqt.task_nbe_scr ) return; + auto is_uks = terms.ks_scheme == UKS; + auto is_gks = terms.ks_scheme == GKS; + const auto ntask = std::distance( task_begin, task_end ); std::vector dmat_host( ntask ), zmat_host( ntask ), bf_host( ntask ), - vmat_host( ntask ); + vmat_host( ntask ), tdmat_host( ntask ); problem_sizes_host.resize(ntask); syr2k_sizes_host.resize(ntask); std::vector ld64_dmat_host( ntask ), ld64_zmat_host( ntask ), ld64_vmat_host( ntask ), ld64_bf_host( ntask ); - double* static_dmat = static_stack.dmat_s_device; const auto nbf = global_dims.nbf; // host_device_tasks should be populated by parent impl called at top @@ -109,7 +167,7 @@ void AoSScheme1CUTLASSBase::Data::pack_and_send( dmat_host[i] = task.nbe_scr; ld64_dmat_host[i] = task.bfn_screening.nbe; } else { - dmat_host[i] = static_dmat + task.bfn_screening.ibf_begin*(nbf+1); + dmat_host[i] = static_stack.dmat_s_device + task.bfn_screening.ibf_begin*(nbf+1); ld64_dmat_host[i] = nbf; } @@ -118,12 +176,11 @@ void AoSScheme1CUTLASSBase::Data::pack_and_send( cutlass::gemm::GemmCoord problem2(task.bfn_screening.nbe, task.bfn_screening.nbe, task.npts); syr2k_sizes_host[i] = problem2; - } // Send to device device_backend_->copy_async( ntask, dmat_host.data(), - cutlass_stack.dmat_array_device, "send dmat array" ); + cutlass_stack.dmat_s_array_device, "send dmat_s array" ); device_backend_->copy_async( ntask, zmat_host.data(), cutlass_stack.zmat_array_device, "send zmat array" ); device_backend_->copy_async( ntask, vmat_host.data(), @@ -144,6 +201,109 @@ void AoSScheme1CUTLASSBase::Data::pack_and_send( device_backend_->copy_async( ntask, ld64_bf_host.data(), cutlass_stack.ld64_bf_array_device, "send ld bf array" ); + if(is_uks or is_gks) { + std::vector dmat_z_host( ntask ); + for( auto i = 0; i < ntask; ++i ) { + auto& task = host_device_tasks[i]; + if( task.bfn_screening.ncut > 1 ) { + dmat_z_host[i] = task.nbe_scr; + } else { + dmat_z_host[i] = static_stack.dmat_z_device + task.bfn_screening.ibf_begin*(nbf+1); + } + } + device_backend_->copy_async( ntask, dmat_z_host.data(), + cutlass_stack.dmat_z_array_device, "send dmat_z array" ); + } + + if(is_gks) { + std::vector dmat_y_host( ntask ); + std::vector dmat_x_host( ntask ); + for( auto i = 0; i < ntask; ++i ) { + auto& task = host_device_tasks[i]; + if( task.bfn_screening.ncut > 1 ) { + dmat_y_host[i] = task.nbe_scr; + dmat_x_host[i] = task.nbe_scr; + } else { + dmat_y_host[i] = static_stack.dmat_y_device + task.bfn_screening.ibf_begin*(nbf+1); + dmat_x_host[i] = static_stack.dmat_x_device + task.bfn_screening.ibf_begin*(nbf+1); + } + } + device_backend_->copy_async( ntask, dmat_x_host.data(), + cutlass_stack.dmat_x_array_device, "send dmat_x array" ); + device_backend_->copy_async( ntask, dmat_y_host.data(), + cutlass_stack.dmat_y_array_device, "send dmat_y array" ); + } + + if(reqt.task_xmat_grad) { + std::vector xmat_x_host( ntask ), bfx_host( ntask ); + std::vector xmat_y_host( ntask ), bfy_host( ntask ); + std::vector xmat_z_host( ntask ), bfz_host( ntask ); + for( auto i = 0; i 
< ntask; ++i ) { + auto& task = host_device_tasks[i]; + xmat_x_host[i] = task.xmat_x; + xmat_y_host[i] = task.xmat_y; + xmat_z_host[i] = task.xmat_z; + bfx_host[i] = task.dbfx; + bfy_host[i] = task.dbfy; + bfz_host[i] = task.dbfz; + } + device_backend_->copy_async( ntask, xmat_x_host.data(), + cutlass_stack.xmat_x_array_device, "send xmat_x array" ); + device_backend_->copy_async( ntask, xmat_y_host.data(), + cutlass_stack.xmat_y_array_device, "send xmat_y array" ); + device_backend_->copy_async( ntask, xmat_z_host.data(), + cutlass_stack.xmat_z_array_device, "send xmat_z array" ); + device_backend_->copy_async( ntask, bfx_host.data(), + cutlass_stack.bfx_array_device, "send bfx array" ); + device_backend_->copy_async( ntask, bfy_host.data(), + cutlass_stack.bfy_array_device, "send bfy array" ); + device_backend_->copy_async( ntask, bfz_host.data(), + cutlass_stack.bfz_array_device, "send bfz array" ); + } + + if(terms.fxc_contraction) { + std::vector tdmat_host( ntask ); + for( auto i = 0; i < ntask; ++i ) { + auto& task = host_device_tasks[i]; + if( task.bfn_screening.ncut > 1 ) + tdmat_host[i] = task.nbe_scr; + else + tdmat_host[i] = static_stack.tdmat_s_device + task.bfn_screening.ibf_begin*(nbf+1); + } + device_backend_->copy_async( ntask, tdmat_host.data(), + cutlass_stack.tdmat_s_array_device, "send tdmat_s array" ); + if(is_uks or is_gks) { + std::vector tdmat_z_host( ntask ); + for( auto i = 0; i < ntask; ++i ) { + auto& task = host_device_tasks[i]; + if( task.bfn_screening.ncut > 1 ) + tdmat_z_host[i] = task.nbe_scr; + else + tdmat_z_host[i] = static_stack.tdmat_z_device + task.bfn_screening.ibf_begin*(nbf+1); + } + device_backend_->copy_async( ntask, tdmat_z_host.data(), + cutlass_stack.tdmat_z_array_device, "send tdmat_z array" ); + } + if(is_gks) { + std::vector tdmat_y_host( ntask ); + std::vector tdmat_x_host( ntask ); + for( auto i = 0; i < ntask; ++i ) { + auto& task = host_device_tasks[i]; + if( task.bfn_screening.ncut > 1 ) { + tdmat_y_host[i] = task.nbe_scr; + tdmat_x_host[i] = task.nbe_scr; + } else { + tdmat_y_host[i] = static_stack.tdmat_y_device + task.bfn_screening.ibf_begin*(nbf+1); + tdmat_x_host[i] = static_stack.tdmat_x_device + task.bfn_screening.ibf_begin*(nbf+1); + } + } + device_backend_->copy_async( ntask, tdmat_x_host.data(), + cutlass_stack.tdmat_x_array_device, "send tdmat_x array" ); + device_backend_->copy_async( ntask, tdmat_y_host.data(), + cutlass_stack.tdmat_y_array_device, "send tdmat_y array" ); + } + } + device_backend_->master_queue_synchronize(); } diff --git a/src/xc_integrator/local_work_driver/device/cuda/xc_functional_eval_wrapper.cxx b/src/xc_integrator/local_work_driver/device/cuda/xc_functional_eval_wrapper.cxx index a77f134a..838078cf 100644 --- a/src/xc_integrator/local_work_driver/device/cuda/xc_functional_eval_wrapper.cxx +++ b/src/xc_integrator/local_work_driver/device/cuda/xc_functional_eval_wrapper.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -37,4 +41,35 @@ void eval_kern_exc_vxc_mgga( const functional_type& func, size_t npts, } + + +void eval_kern_vxc_fxc_lda( const functional_type& func, size_t npts, + const double* rho, double* vrho, double* v2rho2, device_queue queue ) { + + cudaStream_t stream = queue.queue_as(); + func.eval_vxc_fxc_device( npts, rho, vrho, v2rho2, stream ); +} + +void eval_kern_vxc_fxc_gga( const functional_type& func, size_t npts, + const double* rho, const double* gamma, double* vrho, double* vgamma, + double* v2rho2, double* v2rhogamma, double* v2gamma2, device_queue queue ) { + + cudaStream_t stream = queue.queue_as(); + func.eval_vxc_fxc_device( npts, rho, gamma, vrho, vgamma, v2rho2, v2rhogamma, v2gamma2, stream ); +} + +void eval_kern_vxc_fxc_mgga( const functional_type& func, size_t npts, + const double* rho, const double* gamma, const double* lapl, const double* tau, + double* vrho, double* vgamma, double* vlapl, double* vtau, + double* v2rho2, double* v2rhogamma, double* v2rholapl, double* v2rhotau, + double* v2gamma2, double* v2gammalapl, double* v2gammatau, double* v2lapl2, + double* v2lapltau, double* v2tau2, device_queue queue ){ + + cudaStream_t stream = queue.queue_as(); + func.eval_vxc_fxc_device( npts, rho, gamma, lapl, tau, vrho, vgamma, vlapl, vtau, + v2rho2, v2rhogamma, v2rholapl, v2rhotau, v2gamma2, v2gammalapl, v2gammatau, + v2lapl2, v2lapltau, v2tau2, stream ); +} + + } diff --git a/src/xc_integrator/local_work_driver/device/hip/CMakeLists.txt b/src/xc_integrator/local_work_driver/device/hip/CMakeLists.txt index 3c6bddbd..a7b14ce4 100644 --- a/src/xc_integrator/local_work_driver/device/hip/CMakeLists.txt +++ b/src/xc_integrator/local_work_driver/device/hip/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/xc_integrator/local_work_driver/device/hip/hip_aos_scheme1.cxx b/src/xc_integrator/local_work_driver/device/hip/hip_aos_scheme1.cxx index 5a1d4c78..b5d6f499 100644 --- a/src/xc_integrator/local_work_driver/device/hip/hip_aos_scheme1.cxx +++ b/src/xc_integrator/local_work_driver/device/hip/hip_aos_scheme1.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/hip_aos_scheme1.hpp b/src/xc_integrator/local_work_driver/device/hip/hip_aos_scheme1.hpp index a6551d5c..00da2e16 100644 --- a/src/xc_integrator/local_work_driver/device/hip/hip_aos_scheme1.hpp +++ b/src/xc_integrator/local_work_driver/device/hip/hip_aos_scheme1.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). 
+ * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/hip_aos_scheme1_data.cxx b/src/xc_integrator/local_work_driver/device/hip/hip_aos_scheme1_data.cxx index 3fb77bcc..c80c9a61 100644 --- a/src/xc_integrator/local_work_driver/device/hip/hip_aos_scheme1_data.cxx +++ b/src/xc_integrator/local_work_driver/device/hip/hip_aos_scheme1_data.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_angular_cartesian.hpp b/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_angular_cartesian.hpp index 3bcb5b57..70008f8d 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_angular_cartesian.hpp +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_angular_cartesian.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_angular_spherical_unnorm.hpp b/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_angular_spherical_unnorm.hpp index 5a5e78a5..987a13df 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_angular_spherical_unnorm.hpp +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_angular_spherical_unnorm.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_device_constants.hpp b/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_device_constants.hpp index c7405df7..ae8c43e7 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_device_constants.hpp +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_device_constants.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_spherical_unnorm.hpp b/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_spherical_unnorm.hpp index 47b8ef55..102fb8b8 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_spherical_unnorm.hpp +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/collocation/collocation_spherical_unnorm.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/collocation_device.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/collocation_device.hip index dc2cffea..4af37bbd 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/collocation_device.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/collocation_device.hip @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/collocation_masked_combined_kernels.hpp b/src/xc_integrator/local_work_driver/device/hip/kernels/collocation_masked_combined_kernels.hpp index c8020819..fa24862b 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/collocation_masked_combined_kernels.hpp +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/collocation_masked_combined_kernels.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/collocation_masked_kernels.hpp b/src/xc_integrator/local_work_driver/device/hip/kernels/collocation_masked_kernels.hpp index 527eda7b..cf14c269 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/collocation_masked_kernels.hpp +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/collocation_masked_kernels.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/grid_to_center.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/grid_to_center.hip index 953bc00f..f830596c 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/grid_to_center.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/grid_to_center.hip @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/grid_to_center.hpp b/src/xc_integrator/local_work_driver/device/hip/kernels/grid_to_center.hpp index e2a3d579..efbb9ad3 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/grid_to_center.hpp +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/grid_to_center.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_extensions.hpp b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_extensions.hpp index 7ba02d5a..2d3e537c 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_extensions.hpp +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_extensions.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_inc_potential.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_inc_potential.hip index 953bbc34..1e9044a7 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_inc_potential.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_inc_potential.hip @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssf_1d.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssf_1d.hip index 67af3a41..4c6d5874 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssf_1d.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssf_1d.hip @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssf_1d.hpp b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssf_1d.hpp index ed31ed12..66e91b8a 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssf_1d.hpp +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssf_1d.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hip index d4f6eda9..385e0160 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hip @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hpp b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hpp index 09371726..788f94d1 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hpp +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/hip_ssh_2d.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/hipblas_extensions.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/hipblas_extensions.hip index 3aa45dee..8848ed38 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/hipblas_extensions.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/hipblas_extensions.hip @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/pack_submat.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/pack_submat.hip index a2a69d24..d415139b 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/pack_submat.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/pack_submat.hip @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/symmetrize_mat.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/symmetrize_mat.hip index 946097cb..c418d0a5 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/symmetrize_mat.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/symmetrize_mat.hip @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip index 8e93d33f..0d8f2d04 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/zmat_vxc.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/zmat_vxc.hip index d188e9e6..673d5a5f 100644 --- a/src/xc_integrator/local_work_driver/device/hip/kernels/zmat_vxc.hip +++ b/src/xc_integrator/local_work_driver/device/hip/kernels/zmat_vxc.hip @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/hip/xc_functional_eval_wrapper.cxx b/src/xc_integrator/local_work_driver/device/hip/xc_functional_eval_wrapper.cxx index da8544ce..dccc9bf9 100644 --- a/src/xc_integrator/local_work_driver/device/hip/xc_functional_eval_wrapper.cxx +++ b/src/xc_integrator/local_work_driver/device/hip/xc_functional_eval_wrapper.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/local_device_work_driver.cxx b/src/xc_integrator/local_work_driver/device/local_device_work_driver.cxx index 2a83e76c..89626b46 100644 --- a/src/xc_integrator/local_work_driver/device/local_device_work_driver.cxx +++ b/src/xc_integrator/local_work_driver/device/local_device_work_driver.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -48,16 +52,37 @@ void LocalDeviceWorkDriver::NAME( XCDeviceData* device_data, density_id den, boo pimpl_->NAME(device_data, den, b); \ } +#define FWD_TO_PIMPL_BOOL_DEN_ID(NAME) \ +void LocalDeviceWorkDriver::NAME( XCDeviceData* device_data, bool b, density_id den ) { \ + throw_if_invalid_pimpl(pimpl_); \ + pimpl_->NAME(device_data, b, den); \ +} + #define FWD_TO_PIMPL_KS_SCHEME(NAME) \ void LocalDeviceWorkDriver::NAME( XCDeviceData* device_data, integrator_ks_scheme track ) { \ throw_if_invalid_pimpl(pimpl_); \ pimpl_->NAME(device_data, track); \ } +#define FWD_TO_PIMPL_KS_SCHEME_BOOL(NAME) \ +void LocalDeviceWorkDriver::NAME( XCDeviceData* device_data, integrator_ks_scheme track, bool b ) { \ + throw_if_invalid_pimpl(pimpl_); \ + pimpl_->NAME(device_data, track, b); \ +} +#define FWD_TO_PIMPL_KS_SCHEME_BOOL_BOOL(NAME) \ +void LocalDeviceWorkDriver::NAME( XCDeviceData* device_data, integrator_ks_scheme track, bool b1, bool b2 ) { \ + throw_if_invalid_pimpl(pimpl_); \ + pimpl_->NAME(device_data, track, b1, b2); \ +} #define FWD_TO_PIMPL_KS_SCHEME_DEN_ID(NAME) \ void LocalDeviceWorkDriver::NAME( XCDeviceData* device_data, integrator_ks_scheme track, density_id den ) { \ throw_if_invalid_pimpl(pimpl_); \ pimpl_->NAME(device_data, track, den); \ } +#define FWD_TO_PIMPL_KS_SCHEME_BOOL_DEN_ID(NAME) \ +void LocalDeviceWorkDriver::NAME( XCDeviceData* device_data, integrator_ks_scheme track, bool b, density_id den ) { \ + throw_if_invalid_pimpl(pimpl_); \ + pimpl_->NAME(device_data, track, b, den); \ +} FWD_TO_PIMPL(partition_weights) // Partition weights @@ -65,31 +90,50 @@ FWD_TO_PIMPL(eval_collocation) // Collocation FWD_TO_PIMPL(eval_collocation_gradient) // Collocation Gradient FWD_TO_PIMPL(eval_collocation_hessian) // Collocation Hessian FWD_TO_PIMPL(eval_collocation_laplacian) // Collocation Laplacian +FWD_TO_PIMPL(eval_collocation_lapgrad) // Collocation Laplacian gradient FWD_TO_PIMPL_KS_SCHEME(eval_uvars_lda) // U variables LDA (rho) FWD_TO_PIMPL_KS_SCHEME(eval_uvars_gga) // U variables GGA (gamma) -FWD_TO_PIMPL_BOOL(eval_uvars_mgga) // U variables MGGA (tau, lapl) -FWD_TO_PIMPL_DEN_ID_BOOL(eval_vvar) // V variable (density + grad) +FWD_TO_PIMPL_KS_SCHEME_BOOL(eval_uvars_mgga) // U variables MGGA (tau, lapl) +FWD_TO_PIMPL_DEN_ID(eval_vvars_lda) // V variables LDA (density) +FWD_TO_PIMPL_DEN_ID(eval_vvars_gga) // V variables GGA (density + grad) +FWD_TO_PIMPL_DEN_ID_BOOL(eval_vvars_mgga) // V variables MGGA (density + grad + tau + lapl) + +FWD_TO_PIMPL_KS_SCHEME(eval_tmat_lda) // T variables LDA (trho) +FWD_TO_PIMPL_KS_SCHEME(eval_tmat_gga) // T variables GGA (tgamma) +FWD_TO_PIMPL_KS_SCHEME_BOOL(eval_tmat_mgga) // T variables MGGA (ttau, tlapl) +FWD_TO_PIMPL_DEN_ID(eval_vvars_lda_trial) // V variables LDA (trial density) +FWD_TO_PIMPL_DEN_ID(eval_vvars_gga_trial) // V variables GGA (trial density + grad) +FWD_TO_PIMPL_DEN_ID_BOOL(eval_vvars_mgga_trial) // V variables MGGA (trial density + grad + tau + lapl) FWD_TO_PIMPL_KS_SCHEME_DEN_ID(eval_zmat_lda_vxc) // Eval Z Matrix LDA VXC FWD_TO_PIMPL_KS_SCHEME_DEN_ID(eval_zmat_gga_vxc) // Eval Z Matrix GGA VXC -FWD_TO_PIMPL_BOOL(eval_zmat_mgga_vxc) // Eval Z Matrix mGGA VXC -FWD_TO_PIMPL_BOOL(eval_mmat_mgga_vxc) // Eval M Matrix mGGA VXC +FWD_TO_PIMPL_KS_SCHEME_BOOL_DEN_ID(eval_zmat_mgga_vxc) // Eval Z Matrix mGGA VXC +FWD_TO_PIMPL_KS_SCHEME_BOOL_DEN_ID(eval_mmat_mgga_vxc) // Eval M Matrix mGGA VXC + +FWD_TO_PIMPL_DEN_ID(eval_zmat_lda_fxc) // Eval Z Matrix LDA FXC +FWD_TO_PIMPL_DEN_ID(eval_zmat_gga_fxc) // 
Eval Z Matrix GGA FXC +FWD_TO_PIMPL_BOOL_DEN_ID(eval_zmat_mgga_fxc) // Eval Z Matrix mGGA FXC +FWD_TO_PIMPL_BOOL_DEN_ID(eval_mmat_mgga_fxc) // Eval M Matrix mGGA FXC + FWD_TO_PIMPL(eval_exx_fmat) // Eval EXX F Matrix -//FWD_TO_PIMPL(eval_exx_gmat) // Eval EXX G Matrix +//FWD_TO_PIMPL(eval_exx_gmat) // Eval EXX G Matrix FWD_TO_PIMPL(inc_exc) FWD_TO_PIMPL(inc_nel) FWD_TO_PIMPL_DEN_ID_BOOL(inc_vxc) // Increment VXC_I by Z +FWD_TO_PIMPL_DEN_ID_BOOL(inc_fxc) // Increment FXC_I by Z FWD_TO_PIMPL(inc_exx_k) -FWD_TO_PIMPL(inc_exc_grad_lda) -FWD_TO_PIMPL(inc_exc_grad_gga) +FWD_TO_PIMPL_KS_SCHEME_BOOL(inc_exc_grad_lda) +FWD_TO_PIMPL_KS_SCHEME_BOOL(inc_exc_grad_gga) +FWD_TO_PIMPL_KS_SCHEME_BOOL_BOOL(inc_exc_grad_mgga) FWD_TO_PIMPL_DEN_ID(symmetrize_vxc) +FWD_TO_PIMPL_DEN_ID(symmetrize_fxc) // Added FXC function FWD_TO_PIMPL(symmetrize_exx_k) FWD_TO_PIMPL(eval_exx_ek_screening_bfn_stats) @@ -100,7 +144,14 @@ void LocalDeviceWorkDriver::eval_xmat( double fac, XCDeviceData* device_data, bo throw_if_invalid_pimpl(pimpl_); pimpl_->eval_xmat(fac, device_data, do_grad, den); } - +void LocalDeviceWorkDriver::save_xmat( XCDeviceData* device_data, bool do_grad, density_id den ) { + throw_if_invalid_pimpl(pimpl_); + pimpl_->save_xmat(device_data, do_grad, den); +} +void LocalDeviceWorkDriver::eval_xmat_trial( double fac, XCDeviceData* device_data, bool do_grad, density_id den ) { + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_xmat_trial(fac, device_data, do_grad, den); +} void LocalDeviceWorkDriver::eval_exx_gmat( XCDeviceData* device_data, const BasisSetMap& basis_map) { @@ -126,6 +177,23 @@ void LocalDeviceWorkDriver::eval_kern_exc_vxc_mgga( const functional_type& func, pimpl_->eval_kern_exc_vxc_mgga(func,data); } +void LocalDeviceWorkDriver::eval_kern_vxc_fxc_lda( const functional_type& func, + XCDeviceData* data) { + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_kern_vxc_fxc_lda(func,data); +} + +void LocalDeviceWorkDriver::eval_kern_vxc_fxc_gga( const functional_type& func, + XCDeviceData* data) { + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_kern_vxc_fxc_gga(func,data); +} + +void LocalDeviceWorkDriver::eval_kern_vxc_fxc_mgga( const functional_type& func, + XCDeviceData* data) { + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_kern_vxc_fxc_mgga(func,data); +} std::unique_ptr LocalDeviceWorkDriver::create_device_data(const DeviceRuntimeEnvironment& rt) { throw_if_invalid_pimpl(pimpl_); @@ -139,4 +207,9 @@ void LocalDeviceWorkDriver::exx_ek_shellpair_collision( double eps_E, double eps pimpl_->exx_ek_shellpair_collision( eps_E, eps_K, device_data, tb, te, shpairs ); } +void LocalDeviceWorkDriver::eval_weight_1st_deriv_contracted( XCDeviceData* device_data, XCWeightAlg alg ) { + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_weight_1st_deriv_contracted(device_data, alg); +} + } diff --git a/src/xc_integrator/local_work_driver/device/local_device_work_driver.hpp b/src/xc_integrator/local_work_driver/device/local_device_work_driver.hpp index 604f0739..8c65c075 100644 --- a/src/xc_integrator/local_work_driver/device/local_device_work_driver.hpp +++ b/src/xc_integrator/local_work_driver/device/local_device_work_driver.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -56,28 +60,49 @@ class LocalDeviceWorkDriver : public LocalWorkDriver { // Public APIs void partition_weights( XCDeviceData* ); + void eval_weight_1st_deriv_contracted( XCDeviceData*, XCWeightAlg); void eval_collocation( XCDeviceData* ); void eval_collocation_gradient( XCDeviceData* ); void eval_collocation_hessian( XCDeviceData* ); void eval_collocation_laplacian( XCDeviceData* ); + void eval_collocation_lapgrad( XCDeviceData* ); void eval_xmat( double fac, XCDeviceData*, bool do_grad, density_id den ); + void eval_xmat_trial( double fac, XCDeviceData*, bool do_grad, density_id den ); + void save_xmat( XCDeviceData*, bool grad, density_id den ); - void eval_uvars_lda( XCDeviceData*, integrator_ks_scheme ) ; - void eval_uvars_gga( XCDeviceData*, integrator_ks_scheme ) ; - void eval_uvars_mgga( XCDeviceData*, bool ) ; - void eval_vvar( XCDeviceData*, density_id, bool ) ; + void eval_uvars_lda ( XCDeviceData*, integrator_ks_scheme ) ; + void eval_uvars_gga ( XCDeviceData*, integrator_ks_scheme ) ; + void eval_uvars_mgga( XCDeviceData*, integrator_ks_scheme, bool ) ; + void eval_vvars_lda ( XCDeviceData*, density_id ) ; + void eval_vvars_gga ( XCDeviceData*, density_id ) ; + void eval_vvars_mgga( XCDeviceData*, density_id, bool ) ; + + void eval_tmat_lda ( XCDeviceData*, integrator_ks_scheme ) ; + void eval_tmat_gga ( XCDeviceData*, integrator_ks_scheme ) ; + void eval_tmat_mgga( XCDeviceData*, integrator_ks_scheme, bool ) ; + void eval_vvars_lda_trial ( XCDeviceData*, density_id ) ; + void eval_vvars_gga_trial ( XCDeviceData*, density_id ) ; + void eval_vvars_mgga_trial( XCDeviceData*, density_id, bool ) ; void eval_kern_exc_vxc_lda( const functional_type&, XCDeviceData* ); void eval_kern_exc_vxc_gga( const functional_type&, XCDeviceData* ); void eval_kern_exc_vxc_mgga( const functional_type&, XCDeviceData* ); + void eval_kern_vxc_fxc_lda( const functional_type&, XCDeviceData* ); + void eval_kern_vxc_fxc_gga( const functional_type&, XCDeviceData* ); + void eval_kern_vxc_fxc_mgga( const functional_type&, XCDeviceData* ); void eval_zmat_lda_vxc( XCDeviceData*, integrator_ks_scheme, density_id ) ; void eval_zmat_gga_vxc( XCDeviceData*, integrator_ks_scheme, density_id ) ; - void eval_zmat_mgga_vxc( XCDeviceData*, bool ) ; - void eval_mmat_mgga_vxc( XCDeviceData*, bool ); + void eval_zmat_mgga_vxc( XCDeviceData*, integrator_ks_scheme, bool, density_id ) ; + void eval_mmat_mgga_vxc( XCDeviceData*, integrator_ks_scheme, bool, density_id ); + + void eval_zmat_lda_fxc( XCDeviceData*, density_id ) ; + void eval_zmat_gga_fxc( XCDeviceData*, density_id ) ; + void eval_zmat_mgga_fxc( XCDeviceData*, bool, density_id ) ; + void eval_mmat_mgga_fxc( XCDeviceData*, bool, density_id ); void eval_exx_fmat( XCDeviceData* ); void eval_exx_gmat( XCDeviceData*, const BasisSetMap& ); @@ -85,8 +110,10 @@ class LocalDeviceWorkDriver : public LocalWorkDriver { void inc_exc( XCDeviceData* ); void inc_nel( XCDeviceData* ); void inc_vxc( XCDeviceData*, density_id, bool do_m = false ); - void inc_exc_grad_lda( XCDeviceData* ); - void inc_exc_grad_gga( XCDeviceData* ); + void inc_fxc( XCDeviceData*, density_id, bool do_m = false ); + void inc_exc_grad_lda( XCDeviceData*, integrator_ks_scheme, bool ); + void inc_exc_grad_gga( XCDeviceData*, integrator_ks_scheme, bool ); + void inc_exc_grad_mgga( XCDeviceData*, integrator_ks_scheme , bool, bool ); void inc_exx_k( XCDeviceData* ); void eval_exx_ek_screening_bfn_stats( XCDeviceData* ); @@ -94,6 +121,7 @@ class LocalDeviceWorkDriver : 
public LocalWorkDriver { host_task_iterator, host_task_iterator, const ShellPairCollection& ); void symmetrize_vxc( XCDeviceData*, density_id ); + void symmetrize_fxc( XCDeviceData*, density_id ); void symmetrize_exx_k( XCDeviceData* ); std::unique_ptr create_device_data(const DeviceRuntimeEnvironment&); diff --git a/src/xc_integrator/local_work_driver/device/local_device_work_driver_pimpl.cxx b/src/xc_integrator/local_work_driver/device/local_device_work_driver_pimpl.cxx index dc5c0e04..26620277 100644 --- a/src/xc_integrator/local_work_driver/device/local_device_work_driver_pimpl.cxx +++ b/src/xc_integrator/local_work_driver/device/local_device_work_driver_pimpl.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/local_device_work_driver_pimpl.hpp b/src/xc_integrator/local_work_driver/device/local_device_work_driver_pimpl.hpp index f43dd12c..f7178a8f 100644 --- a/src/xc_integrator/local_work_driver/device/local_device_work_driver_pimpl.hpp +++ b/src/xc_integrator/local_work_driver/device/local_device_work_driver_pimpl.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -30,34 +34,58 @@ struct LocalDeviceWorkDriverPIMPL { // Public APIs virtual void partition_weights( XCDeviceData* ) = 0; + virtual void eval_weight_1st_deriv_contracted( XCDeviceData*, XCWeightAlg ) = 0; virtual void eval_collocation( XCDeviceData* ) = 0; virtual void eval_collocation_gradient( XCDeviceData* ) = 0; virtual void eval_collocation_hessian( XCDeviceData* ) = 0; virtual void eval_collocation_laplacian( XCDeviceData* ) = 0; + virtual void eval_collocation_lapgrad( XCDeviceData* ) = 0; virtual void eval_xmat( double fac, XCDeviceData*, bool do_grad, density_id den ) = 0; + virtual void save_xmat( XCDeviceData*, bool do_grad, density_id den ) = 0; virtual void eval_exx_fmat( XCDeviceData* ) = 0; //virtual void eval_exx_gmat( XCDeviceData* ) = 0; virtual void eval_exx_gmat( XCDeviceData*, const BasisSetMap& ) = 0; virtual void eval_uvars_lda( XCDeviceData*, integrator_ks_scheme ) = 0; virtual void eval_uvars_gga( XCDeviceData*, integrator_ks_scheme ) = 0; - virtual void eval_uvars_mgga( XCDeviceData*, bool ) = 0; - virtual void eval_vvar( XCDeviceData*, density_id, bool ) = 0; + virtual void eval_uvars_mgga( XCDeviceData*, integrator_ks_scheme, bool ) = 0; + virtual void eval_vvars_lda ( XCDeviceData*, density_id ) = 0; + virtual void eval_vvars_gga ( XCDeviceData*, density_id ) = 0; + virtual void eval_vvars_mgga( XCDeviceData*, density_id, bool ) = 0; virtual void eval_kern_exc_vxc_lda( const functional_type&, XCDeviceData* ) = 0; virtual void eval_kern_exc_vxc_gga( const functional_type&, XCDeviceData* ) = 0; virtual void eval_kern_exc_vxc_mgga( const functional_type&, XCDeviceData* ) = 0; + virtual void eval_kern_vxc_fxc_lda( const functional_type&, XCDeviceData* ) = 0; + virtual void eval_kern_vxc_fxc_gga( const functional_type&, XCDeviceData* ) = 0; + virtual void eval_kern_vxc_fxc_mgga( const functional_type&, XCDeviceData* ) = 0; virtual void eval_zmat_lda_vxc( XCDeviceData*, integrator_ks_scheme, density_id ) = 0; virtual void eval_zmat_gga_vxc( XCDeviceData*, integrator_ks_scheme, density_id ) = 0; - virtual void eval_zmat_mgga_vxc( XCDeviceData*, bool ) = 0; - virtual void eval_mmat_mgga_vxc( XCDeviceData*, bool ) = 0; + virtual void eval_zmat_mgga_vxc( XCDeviceData*, integrator_ks_scheme, bool, density_id ) = 0; + virtual void eval_mmat_mgga_vxc( XCDeviceData*, integrator_ks_scheme, bool, density_id ) = 0; + virtual void eval_zmat_lda_fxc( XCDeviceData*, density_id ) = 0; + virtual void eval_zmat_gga_fxc( XCDeviceData*, density_id ) = 0; + virtual void eval_zmat_mgga_fxc( XCDeviceData*, bool, density_id ) = 0; + virtual void eval_mmat_mgga_fxc( XCDeviceData*, bool, density_id ) = 0; virtual void inc_exc( XCDeviceData* ) = 0; virtual void inc_nel( XCDeviceData* ) = 0; virtual void inc_vxc( XCDeviceData* , density_id, bool) = 0; - virtual void inc_exc_grad_lda( XCDeviceData* ) = 0; - virtual void inc_exc_grad_gga( XCDeviceData* ) = 0; + virtual void inc_fxc( XCDeviceData* , density_id, bool) = 0; + virtual void inc_exc_grad_lda( XCDeviceData*, integrator_ks_scheme, bool ) = 0; + virtual void inc_exc_grad_gga( XCDeviceData*, integrator_ks_scheme, bool ) = 0; + virtual void inc_exc_grad_mgga( XCDeviceData*, integrator_ks_scheme , bool, bool ) = 0; virtual void inc_exx_k( XCDeviceData* ) = 0; virtual void symmetrize_vxc( XCDeviceData*, density_id ) = 0; + virtual void symmetrize_fxc( XCDeviceData*, density_id ) = 0; virtual void symmetrize_exx_k( XCDeviceData* ) = 0; + //second derivative + virtual void eval_xmat_trial( double fac, 
XCDeviceData*, bool do_grad, density_id den ) = 0; + virtual void eval_tmat_lda( XCDeviceData*, integrator_ks_scheme ) = 0; + virtual void eval_tmat_gga( XCDeviceData*, integrator_ks_scheme ) = 0; + virtual void eval_tmat_mgga( XCDeviceData*, integrator_ks_scheme, bool ) = 0; + virtual void eval_vvars_lda_trial ( XCDeviceData*, density_id ) = 0; + virtual void eval_vvars_gga_trial ( XCDeviceData*, density_id ) = 0; + virtual void eval_vvars_mgga_trial( XCDeviceData*, density_id, bool ) = 0; + virtual void eval_exx_ek_screening_bfn_stats( XCDeviceData* ) = 0; virtual void exx_ek_shellpair_collision( double eps_E, double eps_K, XCDeviceData*, host_task_iterator, host_task_iterator, diff --git a/src/xc_integrator/local_work_driver/device/scheme1_base.cxx b/src/xc_integrator/local_work_driver/device/scheme1_base.cxx index 5ffa8443..d2801307 100644 --- a/src/xc_integrator/local_work_driver/device/scheme1_base.cxx +++ b/src/xc_integrator/local_work_driver/device/scheme1_base.cxx @@ -1,12 +1,17 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ #include "scheme1_base.hpp" #include "device/common/zmat_vxc.hpp" +#include "device/common/zmat_fxc.hpp" #include "device/common/collocation_device.hpp" #include "device/common/device_blas.hpp" #include "device/common/xc_functional_eval_wrapper.hpp" @@ -46,6 +51,7 @@ namespace XGPU { cudaStream_t stream); void integral_1_task_batched( + bool sph, size_t ntasks, size_t nsubtask, int max_primpairs, size_t max_nsp, GauXC::XCDeviceTask* device_tasks, @@ -60,6 +66,7 @@ namespace XGPU { cudaStream_t stream); void integral_2_task_batched( + bool sph, size_t ntasks, size_t nsubtask, int max_primpairs, size_t max_nsp, GauXC::XCDeviceTask* device_tasks, @@ -98,6 +105,7 @@ namespace XGPU { cudaStream_t stream); void integral_1_1_task_batched( + bool sph, size_t ntasks, size_t nsubtasks, int max_primpairs, size_t max_nsp, @@ -121,6 +129,7 @@ namespace XGPU { cudaStream_t stream); void integral_2_2_task_batched( + bool sph, size_t ntasks, size_t nsubtasks, int max_primpairs, size_t max_nsp, @@ -145,6 +154,7 @@ namespace XGPU { void integral_1_0_task_batched( bool swap, + bool sph, size_t ntasks, size_t nsubtasks, int max_primpairs, size_t max_nsp, @@ -170,6 +180,7 @@ namespace XGPU { void integral_2_0_task_batched( bool swap, + bool sph, size_t ntasks, size_t nsubtasks, int max_primpairs, size_t max_nsp, @@ -195,6 +206,7 @@ namespace XGPU { void integral_2_1_task_batched( bool swap, + bool sph_2, bool sph_1, size_t ntasks, size_t nsubtasks, int max_primpairs, size_t max_nsp, @@ -279,7 +291,7 @@ void AoSScheme1Base::eval_zmat_gga_vxc( XCDeviceData* _data, integrator_ks_schem data->device_backend_->check_error("zmat_gga" __FILE__ ": " + std::to_string(__LINE__)); } -void AoSScheme1Base::eval_zmat_mgga_vxc( XCDeviceData* _data, bool do_lapl){ +void AoSScheme1Base::eval_zmat_mgga_vxc( XCDeviceData* _data, integrator_ks_scheme scheme, bool do_lapl, density_id id){ auto* data = dynamic_cast(_data); if( !data ) GAUXC_BAD_LWD_DATA_CAST(); @@ -296,14 +308,80 @@ void AoSScheme1Base::eval_zmat_mgga_vxc( XCDeviceData* _data, bool do_lapl){ auto aos_stack = data->aos_stack; zmat_mgga_vxc( ntasks, nbe_max, npts_max, 
aos_stack.device_tasks, - do_lapl, data->device_backend_->queue() ); + do_lapl, scheme, id, data->device_backend_->queue() ); data->device_backend_->check_error("zmat_mgga" __FILE__ ": " + std::to_string(__LINE__)); } +void AoSScheme1Base::eval_zmat_lda_fxc( XCDeviceData* _data, density_id den ) { + + auto* data = dynamic_cast(_data); + if( !data ) GAUXC_BAD_LWD_DATA_CAST(); + + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); + + auto& tasks = data->host_device_tasks; + const auto ntasks = tasks.size(); + size_t nbe_max = 0, npts_max = 0; + for( auto& task : tasks ) { + nbe_max = std::max( nbe_max, task.bfn_screening.nbe ); + npts_max = std::max( npts_max, task.npts ); + } + + auto aos_stack = data->aos_stack; + zmat_lda_fxc( ntasks, nbe_max, npts_max, aos_stack.device_tasks, den, + data->device_backend_->queue() ); + + data->device_backend_->check_error("zmat_lda_fxc" __FILE__ ": " + std::to_string(__LINE__)); +} + +void AoSScheme1Base::eval_zmat_gga_fxc( XCDeviceData* _data, density_id den ) { + + auto* data = dynamic_cast(_data); + if( !data ) GAUXC_BAD_LWD_DATA_CAST(); + + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); + + auto& tasks = data->host_device_tasks; + const auto ntasks = tasks.size(); + size_t nbe_max = 0, npts_max = 0; + for( auto& task : tasks ) { + nbe_max = std::max( nbe_max, task.bfn_screening.nbe ); + npts_max = std::max( npts_max, task.npts ); + } + + auto aos_stack = data->aos_stack; + zmat_gga_fxc( ntasks, nbe_max, npts_max, aos_stack.device_tasks, den, + data->device_backend_->queue() ); + + data->device_backend_->check_error("zmat_gga_fxc" __FILE__ ": " + std::to_string(__LINE__)); +} + +void AoSScheme1Base::eval_zmat_mgga_fxc( XCDeviceData* _data, bool do_lapl, density_id id){ + + auto* data = dynamic_cast(_data); + if( !data ) GAUXC_BAD_LWD_DATA_CAST(); + + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); + + auto& tasks = data->host_device_tasks; + const auto ntasks = tasks.size(); + size_t nbe_max = 0, npts_max = 0; + for( auto& task : tasks ) { + nbe_max = std::max( nbe_max, task.bfn_screening.nbe ); + npts_max = std::max( npts_max, task.npts ); + } + + auto aos_stack = data->aos_stack; + zmat_mgga_fxc( ntasks, nbe_max, npts_max, aos_stack.device_tasks, + do_lapl, id, data->device_backend_->queue() ); + -void AoSScheme1Base::eval_mmat_mgga_vxc( XCDeviceData* _data, bool do_lapl){ + data->device_backend_->check_error("zmat_mgga_fxc" __FILE__ ": " + std::to_string(__LINE__)); +} + +void AoSScheme1Base::eval_mmat_mgga_vxc( XCDeviceData* _data, integrator_ks_scheme scheme, bool do_lapl, density_id id){ auto* data = dynamic_cast(_data); if( !data ) GAUXC_BAD_LWD_DATA_CAST(); @@ -320,12 +398,35 @@ void AoSScheme1Base::eval_mmat_mgga_vxc( XCDeviceData* _data, bool do_lapl){ auto aos_stack = data->aos_stack; mmat_mgga_vxc( ntasks, nbe_max, npts_max, aos_stack.device_tasks, - do_lapl, data->device_backend_->queue() ); + do_lapl, scheme, id, data->device_backend_->queue() ); data->device_backend_->check_error("mmat_mgga" __FILE__ ": " + std::to_string(__LINE__)); } +void AoSScheme1Base::eval_mmat_mgga_fxc( XCDeviceData* _data, bool do_lapl, density_id id){ + + auto* data = dynamic_cast(_data); + if( !data ) GAUXC_BAD_LWD_DATA_CAST(); + + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); + + auto& tasks = data->host_device_tasks; + const auto ntasks = tasks.size(); + size_t nbe_max = 0, npts_max = 0; + for( auto& task : tasks ) { + nbe_max = std::max( nbe_max, task.bfn_screening.nbe 
); + npts_max = std::max( npts_max, task.npts ); + } + + auto aos_stack = data->aos_stack; + mmat_mgga_fxc( ntasks, nbe_max, npts_max, aos_stack.device_tasks, + do_lapl, id, data->device_backend_->queue() ); + + + data->device_backend_->check_error("mmat_mgga_fxc" __FILE__ ": " + std::to_string(__LINE__)); +} + void AoSScheme1Base::eval_collocation( XCDeviceData* _data ) { auto* data = dynamic_cast(_data); @@ -387,7 +488,7 @@ void AoSScheme1Base::eval_collocation_gradient( XCDeviceData* _data ) { data->device_backend_->queue() ); #endif - data->device_backend_->check_error("collocation grad" __FILE__ ": " + std::to_string(__LINE__)); + data->device_backend_->check_error("collocation grad " __FILE__ ": " + std::to_string(__LINE__)); } void AoSScheme1Base::eval_collocation_hessian( XCDeviceData* _data ) { @@ -430,6 +531,26 @@ void AoSScheme1Base::eval_collocation_laplacian( XCDeviceData* _data ) { data->device_backend_->check_error("collocation lapl" __FILE__ ": " + std::to_string(__LINE__)); } +void AoSScheme1Base::eval_collocation_lapgrad( XCDeviceData* _data ) { +#ifdef GAUXC_HAS_HIP + GAUXC_GENERIC_EXCEPTION("Laplacian Gradient NYI for HIP Backends"); +#else + auto* data = dynamic_cast(_data); + if( !data ) GAUXC_BAD_LWD_DATA_CAST(); + + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); + + auto aos_stack = data->aos_stack; + + auto max_l = data->l_batched_shell_to_task.size() - 1; + eval_collocation_shell_to_task_lapgrad( max_l, + data->l_batched_shell_to_task.data(), aos_stack.device_tasks, + data->device_backend_->queue() ); +#endif + + data->device_backend_->check_error("collocation lap grad " __FILE__ ": " + std::to_string(__LINE__)); +} + @@ -537,7 +658,7 @@ void AoSScheme1Base::eval_uvars_gga( XCDeviceData* _data, integrator_ks_scheme k data->device_backend_->check_error("uvvar gga" __FILE__ ": " + std::to_string(__LINE__)); } -void AoSScheme1Base::eval_uvars_mgga( XCDeviceData* _data, bool do_lapl ){ +void AoSScheme1Base::eval_uvars_mgga( XCDeviceData* _data, integrator_ks_scheme scheme, bool do_lapl ){ auto* data = dynamic_cast(_data); if( !data ) GAUXC_BAD_LWD_DATA_CAST(); @@ -548,29 +669,21 @@ void AoSScheme1Base::eval_uvars_mgga( XCDeviceData* _data, bool do_lapl ){ const auto ntasks = tasks.size(); size_t nbe_max = 0, npts_max = 0; for( auto& task : tasks ) { - nbe_max = std::max( nbe_max, task.bfn_screening.nbe ); npts_max = std::max( npts_max, task.npts ); } - // Zero tau auto base_stack = data->base_stack; - data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, base_stack.tau_eval_device, "Tau Zero" ); - if(do_lapl) { - data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, base_stack.den_lapl_eval_device, "DenLapl Zero" ); - } - - - // Evaluate U variables + + // Evaluate U variable auto aos_stack = data->aos_stack; - GauXC::eval_uvars_mgga( ntasks, data->total_npts_task_batch, nbe_max, npts_max, do_lapl, + GauXC::eval_uvars_mgga( ntasks, npts_max, scheme, do_lapl, aos_stack.device_tasks, data->device_backend_->queue() ); data->device_backend_->check_error("uvvar mgga" __FILE__ ": " + std::to_string(__LINE__)); } - -void AoSScheme1Base::eval_vvar( XCDeviceData* _data, density_id den_select, bool do_grad){ +void AoSScheme1Base::eval_vvars_lda( XCDeviceData* _data, density_id den_select){ auto* data = dynamic_cast(_data); if ( !data ) GAUXC_BAD_LWD_DATA_CAST(); @@ -587,254 +700,908 @@ void AoSScheme1Base::eval_vvar( XCDeviceData* _data, density_id den_select, bool // Zero density auto base_stack = 
data->base_stack; double* den_eval_ptr = nullptr; - double* den_x_eval_ptr = nullptr; - double* den_y_eval_ptr = nullptr; - double* den_z_eval_ptr = nullptr; switch ( den_select ) { case DEN_S: den_eval_ptr = base_stack.den_s_eval_device; - if (do_grad) { den_x_eval_ptr = base_stack.dden_sx_eval_device; - den_y_eval_ptr = base_stack.dden_sy_eval_device; - den_z_eval_ptr = base_stack.dden_sz_eval_device; } break; case DEN_Z: den_eval_ptr = base_stack.den_z_eval_device; - if (do_grad) { den_x_eval_ptr = base_stack.dden_zx_eval_device; - den_y_eval_ptr = base_stack.dden_zy_eval_device; - den_z_eval_ptr = base_stack.dden_zz_eval_device; } break; case DEN_Y: den_eval_ptr = base_stack.den_y_eval_device; - if (do_grad) { den_x_eval_ptr = base_stack.dden_yx_eval_device; - den_y_eval_ptr = base_stack.dden_yy_eval_device; - den_z_eval_ptr = base_stack.dden_yz_eval_device; } break; case DEN_X: den_eval_ptr = base_stack.den_x_eval_device; - if (do_grad) { den_x_eval_ptr = base_stack.dden_xx_eval_device; - den_y_eval_ptr = base_stack.dden_xy_eval_device; - den_z_eval_ptr = base_stack.dden_xz_eval_device; } break; default: - GAUXC_GENERIC_EXCEPTION( "eval_vvar called with invalid density selected!" ); + GAUXC_GENERIC_EXCEPTION( "eval_vvars_lda called with invalid density selected!" ); } data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_eval_ptr, "Den Zero" ); - if (do_grad) { - data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_x_eval_ptr, "Den Zero" ); - data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_y_eval_ptr, "Den Zero" ); - data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_z_eval_ptr, "Den Zero" ); - } - // Evaluate V variable auto aos_stack = data->aos_stack; - GauXC::eval_vvar( ntasks, nbe_max, npts_max, do_grad, den_select, + GauXC::eval_vvars_lda( ntasks, nbe_max, npts_max, den_select, aos_stack.device_tasks, data->device_backend_->queue() ); } - -void AoSScheme1Base::eval_kern_exc_vxc_lda( const functional_type& func, - XCDeviceData* _data ) { - +void AoSScheme1Base::eval_vvars_gga( XCDeviceData* _data, density_id den_select){ auto* data = dynamic_cast(_data); - if( !data ) GAUXC_BAD_LWD_DATA_CAST(); + if ( !data ) GAUXC_BAD_LWD_DATA_CAST(); if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); - if( !func.is_lda() ) GAUXC_GENERIC_EXCEPTION("XC Kernel not LDA!"); + auto& tasks = data->host_device_tasks; + const auto ntasks = tasks.size(); + size_t nbe_max = 0, npts_max = 0; + for( auto& task : tasks ) { + nbe_max = std::max( nbe_max, task.bfn_screening.nbe ); + npts_max = std::max( npts_max, task.npts ); + } + // Zero density auto base_stack = data->base_stack; + double* den_eval_ptr = nullptr; + double* den_x_eval_ptr = nullptr; + double* den_y_eval_ptr = nullptr; + double* den_z_eval_ptr = nullptr; + switch ( den_select ) { + case DEN_S: + den_eval_ptr = base_stack.den_s_eval_device; + den_x_eval_ptr = base_stack.dden_sx_eval_device; + den_y_eval_ptr = base_stack.dden_sy_eval_device; + den_z_eval_ptr = base_stack.dden_sz_eval_device; + break; + case DEN_Z: + den_eval_ptr = base_stack.den_z_eval_device; + den_x_eval_ptr = base_stack.dden_zx_eval_device; + den_y_eval_ptr = base_stack.dden_zy_eval_device; + den_z_eval_ptr = base_stack.dden_zz_eval_device; + break; + case DEN_Y: + den_eval_ptr = base_stack.den_y_eval_device; + den_x_eval_ptr = base_stack.dden_yx_eval_device; + den_y_eval_ptr = base_stack.dden_yy_eval_device; + 
den_z_eval_ptr = base_stack.dden_yz_eval_device; + break; + case DEN_X: + den_eval_ptr = base_stack.den_x_eval_device; + den_x_eval_ptr = base_stack.dden_xx_eval_device; + den_y_eval_ptr = base_stack.dden_xy_eval_device; + den_z_eval_ptr = base_stack.dden_xz_eval_device; + break; + default: + GAUXC_GENERIC_EXCEPTION( "eval_vvars_gga called with invalid density selected!" ); + } - const bool is_RKS = data->allocated_terms.ks_scheme == RKS; - const bool is_UKS = data->allocated_terms.ks_scheme == UKS; - const bool is_GKS = data->allocated_terms.ks_scheme == GKS; - const bool is_pol = is_UKS or is_GKS; - const bool is_excgrad = data->allocated_terms.exc_grad; - - const size_t npts = data->total_npts_task_batch ; + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_eval_ptr, "Den Zero" ); + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_x_eval_ptr, "Den Zero" ); + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_y_eval_ptr, "Den Zero" ); + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_z_eval_ptr, "Den Zero" ); - auto* dep = base_stack.den_s_eval_device; + // Evaluate V variable + auto aos_stack = data->aos_stack; + GauXC::eval_vvars_gga( ntasks, nbe_max, npts_max, den_select, + aos_stack.device_tasks, data->device_backend_->queue() ); - if ( is_pol ) { - dep = base_stack.den_eval_device; - // Interleave pos/neg densities before passing it to ExchCXX - data->device_backend_-> - copy_async_2d( 1, npts, base_stack.den_s_eval_device, 1, base_stack.den_eval_device , 2, "den_s -> den_eval" ); - data->device_backend_-> - copy_async_2d( 1, npts, base_stack.den_z_eval_device, 1, base_stack.den_eval_device+1, 2, "den_z -> den_eval" ); - } +} - GauXC::eval_kern_exc_vxc_lda( func, npts, - dep, base_stack.eps_eval_device, - base_stack.vrho_eval_device, data->device_backend_->queue() ); +void AoSScheme1Base::eval_vvars_mgga( XCDeviceData* _data, density_id den_select, bool need_lapl){ + auto* data = dynamic_cast(_data); + if ( !data ) GAUXC_BAD_LWD_DATA_CAST(); - hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, - base_stack.weights_device, 1, base_stack.eps_eval_device, 1 ); + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); - if( not is_pol ) { - hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, - base_stack.weights_device, 1, base_stack.vrho_eval_device, 1 ); + auto& tasks = data->host_device_tasks; + const auto ntasks = tasks.size(); + size_t nbe_max = 0, npts_max = 0; + for( auto& task : tasks ) { + nbe_max = std::max( nbe_max, task.bfn_screening.nbe ); + npts_max = std::max( npts_max, task.npts ); } - else if( is_pol ) { - // De-interleave pos/neg densities - data->device_backend_-> - copy_async_2d( 1, npts, base_stack.vrho_eval_device , 2, base_stack.vrho_pos_eval_device, 1, "vrho->vrho_pos" ); - data->device_backend_-> - copy_async_2d( 1, npts, base_stack.vrho_eval_device+1, 2, base_stack.vrho_neg_eval_device, 1, "vrho->vrho_pos" ); - - // Weight results point-by-point - hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, - base_stack.weights_device, 1, base_stack.vrho_pos_eval_device, 1 ); - hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, - base_stack.weights_device, 1, base_stack.vrho_neg_eval_device, 1 ); + + // Zero density + auto base_stack = 
data->base_stack; + double* den_eval_ptr = nullptr; + double* den_x_eval_ptr = nullptr; + double* den_y_eval_ptr = nullptr; + double* den_z_eval_ptr = nullptr; + double* tau_eval_ptr = nullptr; + double* lapl_eval_ptr = nullptr; + switch ( den_select ) { + case DEN_S: + den_eval_ptr = base_stack.den_s_eval_device; + den_x_eval_ptr = base_stack.dden_sx_eval_device; + den_y_eval_ptr = base_stack.dden_sy_eval_device; + den_z_eval_ptr = base_stack.dden_sz_eval_device; + tau_eval_ptr = base_stack.tau_s_eval_device; + lapl_eval_ptr = base_stack.lapl_s_eval_device; + break; + case DEN_Z: + den_eval_ptr = base_stack.den_z_eval_device; + den_x_eval_ptr = base_stack.dden_zx_eval_device; + den_y_eval_ptr = base_stack.dden_zy_eval_device; + den_z_eval_ptr = base_stack.dden_zz_eval_device; + tau_eval_ptr = base_stack.tau_z_eval_device; + lapl_eval_ptr = base_stack.lapl_z_eval_device; + break; + case DEN_Y: + den_eval_ptr = base_stack.den_y_eval_device; + den_x_eval_ptr = base_stack.dden_yx_eval_device; + den_y_eval_ptr = base_stack.dden_yy_eval_device; + den_z_eval_ptr = base_stack.dden_yz_eval_device; + break; + case DEN_X: + den_eval_ptr = base_stack.den_x_eval_device; + den_x_eval_ptr = base_stack.dden_xx_eval_device; + den_y_eval_ptr = base_stack.dden_xy_eval_device; + den_z_eval_ptr = base_stack.dden_xz_eval_device; + break; + default: + GAUXC_GENERIC_EXCEPTION( "eval_vvars_gga called with invalid density selected!" ); } + + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_eval_ptr, "Den Zero" ); + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_x_eval_ptr, "Den Zero" ); + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_y_eval_ptr, "Den Zero" ); + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_z_eval_ptr, "Den Zero" ); + if(tau_eval_ptr) + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, tau_eval_ptr, "TAU Zero"); + if(lapl_eval_ptr) + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, lapl_eval_ptr, "LAPL Zero"); - data->device_backend_->check_error("exc_vxc lda" __FILE__ ": " + std::to_string(__LINE__)); -} + // Evaluate V variable + auto aos_stack = data->aos_stack; + GauXC::eval_vvars_mgga( ntasks, nbe_max, npts_max, den_select, need_lapl, + aos_stack.device_tasks, data->device_backend_->queue() ); +} -void AoSScheme1Base::eval_kern_exc_vxc_gga( const functional_type& func, - XCDeviceData* _data ) { +void AoSScheme1Base::eval_tmat_lda( XCDeviceData* _data, integrator_ks_scheme ks_scheme){ auto* data = dynamic_cast(_data); - if( !data ) GAUXC_BAD_LWD_DATA_CAST(); + if ( !data ) GAUXC_BAD_LWD_DATA_CAST(); if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); - if( !func.is_gga() ) GAUXC_GENERIC_EXCEPTION("XC Kernel not GGA!"); + auto& tasks = data->host_device_tasks; + const auto ntasks = tasks.size(); + size_t nbe_max = 0, npts_max = 0; + for( auto& task : tasks ) { + npts_max = std::max( npts_max, task.npts ); + } auto base_stack = data->base_stack; - double* den_eval_ptr = base_stack.den_s_eval_device; - const bool is_RKS = data->allocated_terms.ks_scheme == RKS; - const bool is_UKS = data->allocated_terms.ks_scheme == UKS; - const bool is_GKS = data->allocated_terms.ks_scheme == GKS; - const bool is_pol = is_UKS or is_GKS; - const bool is_excgrad = data->allocated_terms.exc_grad; + // Evaluate U variables + auto aos_stack = data->aos_stack; + GauXC::eval_tmat_lda( 
ntasks, npts_max, ks_scheme, + aos_stack.device_tasks, data->device_backend_->queue() ); - const size_t npts = data->total_npts_task_batch ; - + data->device_backend_->check_error("uvvar lda trial" __FILE__ ": " + std::to_string(__LINE__)); +} - if ( is_pol ) { - den_eval_ptr = base_stack.den_eval_device; - // Interleave pos/neg densities before passing it to ExchCXX - data->device_backend_-> - copy_async_2d( 1, npts, base_stack.den_s_eval_device, 1, base_stack.den_eval_device , 2, "den_s -> den_eval" ); - data->device_backend_-> - copy_async_2d( 1, npts, base_stack.den_z_eval_device, 1, base_stack.den_eval_device+1, 2, "den_z -> den_eval" ); - // Interleave gamma pp, pm, mm - data->device_backend_-> - copy_async_2d( 1, npts, base_stack.gamma_pp_eval_device, 1, base_stack.gamma_eval_device , 3, "gamma_pp -> gamma_eval"); - data->device_backend_-> - copy_async_2d( 1, npts, base_stack.gamma_pm_eval_device, 1, base_stack.gamma_eval_device+1, 3, "gamma_pm -> gamma_eval"); - data->device_backend_-> - copy_async_2d( 1, npts, base_stack.gamma_mm_eval_device, 1, base_stack.gamma_eval_device+2, 3, "gamma_mm -> gamma_eval"); - } - - GauXC::eval_kern_exc_vxc_gga( func, data->total_npts_task_batch, - den_eval_ptr, base_stack.gamma_eval_device, - base_stack.eps_eval_device, base_stack.vrho_eval_device, - base_stack.vgamma_eval_device, data->device_backend_->queue() ); - +void AoSScheme1Base::eval_tmat_gga( XCDeviceData* _data, integrator_ks_scheme ks_scheme){ + auto* data = dynamic_cast(_data); + if ( !data ) GAUXC_BAD_LWD_DATA_CAST(); - hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, - base_stack.weights_device, 1, base_stack.eps_eval_device, 1 ); + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); - if( not is_pol ) { - hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, - base_stack.weights_device, 1, base_stack.vrho_eval_device, 1 ); - hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, - base_stack.weights_device, 1, base_stack.vgamma_eval_device, 1 ); - } - else if( is_pol ) { - // De-interleave pos/neg densities - data->device_backend_-> - copy_async_2d( 1, npts, base_stack.vrho_eval_device , 2, base_stack.vrho_pos_eval_device, 1, "vrho->vrho_pos" ); - data->device_backend_-> - copy_async_2d( 1, npts, base_stack.vrho_eval_device+1, 2, base_stack.vrho_neg_eval_device, 1, "vrho->vrho_pos" ); - - // Multiply by weights point-by-point - hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, - base_stack.weights_device, 1, base_stack.vrho_pos_eval_device, 1 ); - hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, - base_stack.weights_device, 1, base_stack.vrho_neg_eval_device, 1 ); - - // De-interleave vgamma - data->device_backend_-> - copy_async_2d( 1, npts, base_stack.vgamma_eval_device , 3, base_stack.vgamma_pp_eval_device, 1, "vgamma_eval -> vgamma_pp" ); - data->device_backend_-> - copy_async_2d( 1, npts, base_stack.vgamma_eval_device+1, 3, base_stack.vgamma_pm_eval_device, 1, "vgamma_eval -> vgamma_pm" ); - data->device_backend_-> - copy_async_2d( 1, npts, base_stack.vgamma_eval_device+2, 3, base_stack.vgamma_mm_eval_device, 1, "vgamma_eval -> vgamma_mm" ); - - hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, - base_stack.weights_device, 1, base_stack.vgamma_pp_eval_device, 1 ); - hadamard_product( 
data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, - base_stack.weights_device, 1, base_stack.vgamma_pm_eval_device, 1 ); - hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, - base_stack.weights_device, 1, base_stack.vgamma_mm_eval_device, 1 ); - - + auto& tasks = data->host_device_tasks; + const auto ntasks = tasks.size(); + size_t nbe_max = 0, npts_max = 0; + for( auto& task : tasks ) { + npts_max = std::max( npts_max, task.npts ); } + auto base_stack = data->base_stack; + + // Evaluate U variable + auto aos_stack = data->aos_stack; + GauXC::eval_tmat_gga( ntasks, npts_max, ks_scheme, + aos_stack.device_tasks, data->device_backend_->queue() ); - data->device_backend_->check_error("exc_vxc gga" __FILE__ ": " + std::to_string(__LINE__)); + data->device_backend_->check_error("uvvar gga trial" __FILE__ ": " + std::to_string(__LINE__)); } - -void AoSScheme1Base::eval_kern_exc_vxc_mgga( const functional_type& func, - XCDeviceData* _data ) { +void AoSScheme1Base::eval_tmat_mgga( XCDeviceData* _data, integrator_ks_scheme scheme, bool do_lapl ){ auto* data = dynamic_cast(_data); if( !data ) GAUXC_BAD_LWD_DATA_CAST(); if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); - if( !func.is_mgga() ) GAUXC_GENERIC_EXCEPTION("XC Kernel not GGA!"); + auto& tasks = data->host_device_tasks; + const auto ntasks = tasks.size(); + size_t nbe_max = 0, npts_max = 0; + for( auto& task : tasks ) { + npts_max = std::max( npts_max, task.npts ); + } auto base_stack = data->base_stack; + + // Evaluate U variable + auto aos_stack = data->aos_stack; + GauXC::eval_tmat_mgga( ntasks, npts_max, scheme, do_lapl, + aos_stack.device_tasks, data->device_backend_->queue() ); - if(func.needs_laplacian()) { - data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, base_stack.vlapl_eval_device, "VLapl Zero" ); - } + + data->device_backend_->check_error("uvvar mgga trial" __FILE__ ": " + std::to_string(__LINE__)); +} + +void AoSScheme1Base::eval_vvars_lda_trial( XCDeviceData* _data, density_id den_select){ + auto* data = dynamic_cast(_data); + if ( !data ) GAUXC_BAD_LWD_DATA_CAST(); + + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); + + auto& tasks = data->host_device_tasks; + const auto ntasks = tasks.size(); + size_t nbe_max = 0, npts_max = 0; + for( auto& task : tasks ) { + nbe_max = std::max( nbe_max, task.bfn_screening.nbe ); + npts_max = std::max( npts_max, task.npts ); + } + + // Zero density + auto base_stack = data->base_stack; + double* den_eval_ptr = nullptr; + switch ( den_select ) { + case DEN_S: + den_eval_ptr = base_stack.tden_s_eval_device; + break; + case DEN_Z: + den_eval_ptr = base_stack.tden_z_eval_device; + break; + case DEN_Y: + den_eval_ptr = base_stack.tden_y_eval_device; + break; + case DEN_X: + den_eval_ptr = base_stack.tden_x_eval_device; + break; + default: + GAUXC_GENERIC_EXCEPTION( "eval_vvars_lda_trial called with invalid density selected!" 
); + } + + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_eval_ptr, "Den Zero" ); + + // Evaluate V variable + auto aos_stack = data->aos_stack; + GauXC::eval_vvars_lda_trial( ntasks, nbe_max, npts_max, den_select, + aos_stack.device_tasks, data->device_backend_->queue() ); + +} + +void AoSScheme1Base::eval_vvars_gga_trial( XCDeviceData* _data, density_id den_select){ + auto* data = dynamic_cast(_data); + if ( !data ) GAUXC_BAD_LWD_DATA_CAST(); + + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); + + auto& tasks = data->host_device_tasks; + const auto ntasks = tasks.size(); + size_t nbe_max = 0, npts_max = 0; + for( auto& task : tasks ) { + nbe_max = std::max( nbe_max, task.bfn_screening.nbe ); + npts_max = std::max( npts_max, task.npts ); + } + + // Zero density + auto base_stack = data->base_stack; + double* den_eval_ptr = nullptr; + double* den_x_eval_ptr = nullptr; + double* den_y_eval_ptr = nullptr; + double* den_z_eval_ptr = nullptr; + switch ( den_select ) { + case DEN_S: + den_eval_ptr = base_stack.tden_s_eval_device; + den_x_eval_ptr = base_stack.tdden_sx_eval_device; + den_y_eval_ptr = base_stack.tdden_sy_eval_device; + den_z_eval_ptr = base_stack.tdden_sz_eval_device; + break; + case DEN_Z: + den_eval_ptr = base_stack.tden_z_eval_device; + den_x_eval_ptr = base_stack.tdden_zx_eval_device; + den_y_eval_ptr = base_stack.tdden_zy_eval_device; + den_z_eval_ptr = base_stack.tdden_zz_eval_device; + break; + case DEN_Y: + den_eval_ptr = base_stack.tden_y_eval_device; + den_x_eval_ptr = base_stack.tdden_yx_eval_device; + den_y_eval_ptr = base_stack.tdden_yy_eval_device; + den_z_eval_ptr = base_stack.tdden_yz_eval_device; + break; + case DEN_X: + den_eval_ptr = base_stack.tden_x_eval_device; + den_x_eval_ptr = base_stack.tdden_xx_eval_device; + den_y_eval_ptr = base_stack.tdden_xy_eval_device; + den_z_eval_ptr = base_stack.tdden_xz_eval_device; + break; + default: + GAUXC_GENERIC_EXCEPTION( "eval_vvars_gga_trial called with invalid density selected!" 
); + } + + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_eval_ptr, "Den Zero" ); + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_x_eval_ptr, "Den Zero" ); + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_y_eval_ptr, "Den Zero" ); + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_z_eval_ptr, "Den Zero" ); + + // Evaluate V variable + auto aos_stack = data->aos_stack; + GauXC::eval_vvars_gga_trial( ntasks, nbe_max, npts_max, den_select, + aos_stack.device_tasks, data->device_backend_->queue() ); + +} + +void AoSScheme1Base::eval_vvars_mgga_trial( XCDeviceData* _data, density_id den_select, bool need_lapl){ + auto* data = dynamic_cast(_data); + if ( !data ) GAUXC_BAD_LWD_DATA_CAST(); + + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); + + auto& tasks = data->host_device_tasks; + const auto ntasks = tasks.size(); + size_t nbe_max = 0, npts_max = 0; + for( auto& task : tasks ) { + nbe_max = std::max( nbe_max, task.bfn_screening.nbe ); + npts_max = std::max( npts_max, task.npts ); + } + + // Zero density + auto base_stack = data->base_stack; + double* den_eval_ptr = nullptr; + double* den_x_eval_ptr = nullptr; + double* den_y_eval_ptr = nullptr; + double* den_z_eval_ptr = nullptr; + double* tau_eval_ptr = nullptr; + double* lapl_eval_ptr = nullptr; + switch ( den_select ) { + case DEN_S: + den_eval_ptr = base_stack.tden_s_eval_device; + den_x_eval_ptr = base_stack.tdden_sx_eval_device; + den_y_eval_ptr = base_stack.tdden_sy_eval_device; + den_z_eval_ptr = base_stack.tdden_sz_eval_device; + tau_eval_ptr = base_stack.ttau_s_eval_device; + lapl_eval_ptr = base_stack.tlapl_s_eval_device; + break; + case DEN_Z: + den_eval_ptr = base_stack.tden_z_eval_device; + den_x_eval_ptr = base_stack.tdden_zx_eval_device; + den_y_eval_ptr = base_stack.tdden_zy_eval_device; + den_z_eval_ptr = base_stack.tdden_zz_eval_device; + tau_eval_ptr = base_stack.ttau_z_eval_device; + lapl_eval_ptr = base_stack.tlapl_z_eval_device; + break; + case DEN_Y: + den_eval_ptr = base_stack.tden_y_eval_device; + den_x_eval_ptr = base_stack.tdden_yx_eval_device; + den_y_eval_ptr = base_stack.tdden_yy_eval_device; + den_z_eval_ptr = base_stack.tdden_yz_eval_device; + break; + case DEN_X: + den_eval_ptr = base_stack.tden_x_eval_device; + den_x_eval_ptr = base_stack.tdden_xx_eval_device; + den_y_eval_ptr = base_stack.tdden_xy_eval_device; + den_z_eval_ptr = base_stack.tdden_xz_eval_device; + break; + default: + GAUXC_GENERIC_EXCEPTION( "eval_vvars_gga_trial called with invalid density selected!" 
); + } + + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_eval_ptr, "Den Zero" ); + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_x_eval_ptr, "Den Zero" ); + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_y_eval_ptr, "Den Zero" ); + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, den_z_eval_ptr, "Den Zero" ); + if(tau_eval_ptr) + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, tau_eval_ptr, "TAU Zero"); + if(lapl_eval_ptr) + data->device_backend_->set_zero_async_master_queue( data->total_npts_task_batch, lapl_eval_ptr, "LAPL Zero"); + + // Evaluate V variable + auto aos_stack = data->aos_stack; + GauXC::eval_vvars_mgga_trial( ntasks, nbe_max, npts_max, den_select, need_lapl, + aos_stack.device_tasks, data->device_backend_->queue() ); + +} + + +template +void interleave_kernel_input(size_t len, const T* src_data, int src_stride, T* tgt_data, int tgt_stride, std::string msg, + DeviceBackend* backend) { + backend->copy_async_2d(1, len, src_data, src_stride, tgt_data, tgt_stride, msg); +} + +template +void interleave_lda_input(size_t npts, T& base_stack, DeviceBackend* backend) { + interleave_kernel_input(npts, base_stack.den_s_eval_device, 1, base_stack.den_interleaved_device+0, 2, + "den_+ - > den_interleaved", backend); + interleave_kernel_input(npts, base_stack.den_z_eval_device, 1, base_stack.den_interleaved_device+1, 2, + "den_- - > den_interleaved", backend); +} + +template +void interleave_gga_input(size_t npts, T& base_stack, DeviceBackend* backend) { + interleave_lda_input(npts, base_stack, backend); + interleave_kernel_input(npts, base_stack.gamma_pp_eval_device, 1, base_stack.gamma_eval_device+0, 3, + "gamma_++ - > gamma_interleaved", backend); + interleave_kernel_input(npts, base_stack.gamma_pm_eval_device, 1, base_stack.gamma_eval_device+1, 3, + "gamma_+- - > gamma_interleaved", backend); + interleave_kernel_input(npts, base_stack.gamma_mm_eval_device, 1, base_stack.gamma_eval_device+2, 3, + "gamma_-- - > gamma_interleaved", backend); +} + +template +void interleave_mgga_input(size_t npts, T& base_stack, DeviceBackend* backend, bool need_lapl) { + interleave_gga_input(npts, base_stack, backend); + interleave_kernel_input(npts, base_stack.tau_s_eval_device, 1, base_stack.tau_interleaved_device, 2, + "tau_+ - > tau_interleaved", backend); + interleave_kernel_input(npts, base_stack.tau_z_eval_device, 1, base_stack.tau_interleaved_device+1, 2, + "tau_- - > tau_interleaved", backend); + if(need_lapl) { + interleave_kernel_input(npts, base_stack.lapl_s_eval_device, 1, base_stack.lapl_interleaved_device, 2, + "lapl_+ - > lapl_interleaved", backend); + interleave_kernel_input(npts, base_stack.lapl_z_eval_device, 1, base_stack.lapl_interleaved_device+1, 2, + "lapl_- - > lapl_interleaved", backend); + } +} + + + +template +void deinterleave_lda_output(size_t npts, T& base_stack, DeviceBackend* backend) { + interleave_kernel_input(npts, base_stack.vrho_eval_device+0, 2, base_stack.vrho_pos_eval_device, 1, + "vrho -> vrho+", backend); + interleave_kernel_input(npts, base_stack.vrho_eval_device+1, 2, base_stack.vrho_neg_eval_device, 1, + "vrho -> vrho-", backend); +} + +template +void deinterleave_gga_output(size_t npts, T& base_stack, DeviceBackend* backend) { + deinterleave_lda_output(npts, base_stack, backend); + interleave_kernel_input(npts, base_stack.vgamma_eval_device+0, 3, base_stack.vgamma_pp_eval_device, 
1, + "vgamma -> vgamma++", backend); + interleave_kernel_input(npts, base_stack.vgamma_eval_device+1, 3, base_stack.vgamma_pm_eval_device, 1, + "vgamma -> vgamma+-", backend); + interleave_kernel_input(npts, base_stack.vgamma_eval_device+2, 3, base_stack.vgamma_mm_eval_device, 1, + "vgamma -> vgamma--", backend); +} + +template +void deinterleave_mgga_output(size_t npts, T& base_stack, DeviceBackend* backend, bool need_lapl) { + deinterleave_gga_output(npts, base_stack, backend); + interleave_kernel_input(npts, base_stack.vtau_eval_device+0, 2, base_stack.vtau_pos_eval_device, 1, + "vtau -> vtau+", backend); + interleave_kernel_input(npts, base_stack.vtau_eval_device+1, 2, base_stack.vtau_neg_eval_device, 1, + "vtau -> vtau-", backend); + if(need_lapl) { + interleave_kernel_input(npts, base_stack.vlapl_eval_device+0, 2, base_stack.vlapl_pos_eval_device, 1, + "vlapl -> vlapl+", backend); + interleave_kernel_input(npts, base_stack.vlapl_eval_device+1, 2, base_stack.vlapl_neg_eval_device, 1, + "vlapl -> vlapl-", backend); + } +} + +template +void deinterleave_vxc_fxc_lda(size_t npts, T& base_stack, DeviceBackend* backend) { + // Deinterleave the lda vxc output + deinterleave_lda_output(npts, base_stack, backend); + interleave_kernel_input(npts, base_stack.v2rho2_eval_device+0, 3, base_stack.v2rho2_a_a_eval_device, 1, + "v2rho2 -> v2rho2_aa", backend); + interleave_kernel_input(npts, base_stack.v2rho2_eval_device+1, 3, base_stack.v2rho2_a_b_eval_device, 1, + "v2rho2 -> v2rho2_ab", backend); + interleave_kernel_input(npts, base_stack.v2rho2_eval_device+2, 3, base_stack.v2rho2_b_b_eval_device, 1, + "v2rho2 -> v2rho2_bb", backend); +} + +template +void deinterleave_vxc_fxc_gga(size_t npts, T& base_stack, DeviceBackend* backend) { + deinterleave_vxc_fxc_lda(npts, base_stack, backend); + // Deinterleave the gga vxc output + deinterleave_gga_output(npts, base_stack, backend); + + interleave_kernel_input(npts, base_stack.v2rhogamma_eval_device+0, 6, base_stack.v2rhogamma_a_aa_eval_device, 1, + "v2rhogamma -> v2rhogamma_a_aa", backend); + interleave_kernel_input(npts, base_stack.v2rhogamma_eval_device+1, 6, base_stack.v2rhogamma_a_ab_eval_device, 1, + "v2rhogamma -> v2rhogamma_a_ab", backend); + interleave_kernel_input(npts, base_stack.v2rhogamma_eval_device+2, 6, base_stack.v2rhogamma_a_bb_eval_device, 1, + "v2rhogamma -> v2rhogamma_a_bb", backend); + interleave_kernel_input(npts, base_stack.v2rhogamma_eval_device+3, 6, base_stack.v2rhogamma_b_aa_eval_device, 1, + "v2rhogamma -> v2rhogamma_b_aa", backend); + interleave_kernel_input(npts, base_stack.v2rhogamma_eval_device+4, 6, base_stack.v2rhogamma_b_ab_eval_device, 1, + "v2rhogamma -> v2rhogamma_b_ab", backend); + interleave_kernel_input(npts, base_stack.v2rhogamma_eval_device+5, 6, base_stack.v2rhogamma_b_bb_eval_device, 1, + "v2rhogamma -> v2rhogamma_b_bb", backend); + interleave_kernel_input(npts, base_stack.v2gamma2_eval_device+0, 6, base_stack.v2gamma2_aa_aa_eval_device, 1, + "v2gamma2 -> v2gamma2_aa_aa", backend); + interleave_kernel_input(npts, base_stack.v2gamma2_eval_device+1, 6, base_stack.v2gamma2_aa_ab_eval_device, 1, + "v2gamma2 -> v2gamma2_aa_ab", backend); + interleave_kernel_input(npts, base_stack.v2gamma2_eval_device+2, 6, base_stack.v2gamma2_aa_bb_eval_device, 1, + "v2gamma2 -> v2gamma2_aa_bb", backend); + interleave_kernel_input(npts, base_stack.v2gamma2_eval_device+3, 6, base_stack.v2gamma2_ab_ab_eval_device, 1, + "v2gamma2 -> v2gamma2_ab_ab", backend); + interleave_kernel_input(npts, base_stack.v2gamma2_eval_device+4, 6, 
base_stack.v2gamma2_ab_bb_eval_device, 1, + "v2gamma2 -> v2gamma2_ab_bb", backend); + interleave_kernel_input(npts, base_stack.v2gamma2_eval_device+5, 6, base_stack.v2gamma2_bb_bb_eval_device, 1, + "v2gamma2 -> v2gamma2_bb_bb", backend); +} + +template +void deinterleave_vxc_fxc_mgga(size_t npts, T& base_stack, DeviceBackend* backend, bool need_lapl) { + deinterleave_vxc_fxc_gga(npts, base_stack, backend); + // Deinterleave the mgga vxc output + deinterleave_mgga_output(npts, base_stack, backend, need_lapl); + + interleave_kernel_input(npts, base_stack.v2rhotau_eval_device+0, 4, base_stack.v2rhotau_a_a_eval_device, 1, + "v2rhotau -> v2rhotau_a_a", backend); + interleave_kernel_input(npts, base_stack.v2rhotau_eval_device+1, 4, base_stack.v2rhotau_a_b_eval_device, 1, + "v2rhotau -> v2rhotau_a_b", backend); + interleave_kernel_input(npts, base_stack.v2rhotau_eval_device+2, 4, base_stack.v2rhotau_b_a_eval_device, 1, + "v2rhotau -> v2rhotau_b_a", backend); + interleave_kernel_input(npts, base_stack.v2rhotau_eval_device+3, 4, base_stack.v2rhotau_b_b_eval_device, 1, + "v2rhotau -> v2rhotau_b_b", backend); + interleave_kernel_input(npts, base_stack.v2gammatau_eval_device+0, 6, base_stack.v2gammatau_aa_a_eval_device, 1, + "v2gammatau -> v2gammatau_aa_a", backend); + interleave_kernel_input(npts, base_stack.v2gammatau_eval_device+1, 6, base_stack.v2gammatau_aa_b_eval_device, 1, + "v2gammatau -> v2gammatau_aa_b", backend); + interleave_kernel_input(npts, base_stack.v2gammatau_eval_device+2, 6, base_stack.v2gammatau_ab_a_eval_device, 1, + "v2gammatau -> v2gammatau_ab_a", backend); + interleave_kernel_input(npts, base_stack.v2gammatau_eval_device+3, 6, base_stack.v2gammatau_ab_b_eval_device, 1, + "v2gammatau -> v2gammatau_ab_b", backend); + interleave_kernel_input(npts, base_stack.v2gammatau_eval_device+4, 6, base_stack.v2gammatau_bb_a_eval_device, 1, + "v2gammatau -> v2gammatau_bb_a", backend); + interleave_kernel_input(npts, base_stack.v2gammatau_eval_device+5, 6, base_stack.v2gammatau_bb_b_eval_device, 1, + "v2gammatau -> v2gammatau_bb_b", backend); + interleave_kernel_input(npts, base_stack.v2tau2_eval_device+0, 3, base_stack.v2tau2_a_a_eval_device, 1, + "v2tau2 -> v2tau2_a_a", backend); + interleave_kernel_input(npts, base_stack.v2tau2_eval_device+1, 3, base_stack.v2tau2_a_b_eval_device, 1, + "v2tau2 -> v2tau2_a_b", backend); + interleave_kernel_input(npts, base_stack.v2tau2_eval_device+2, 3, base_stack.v2tau2_b_b_eval_device, 1, + "v2tau2 -> v2tau2_b_b", backend); + + if (need_lapl) { + interleave_kernel_input(npts, base_stack.v2rholapl_eval_device+0, 4, base_stack.v2rholapl_a_a_eval_device, 1, + "v2rholapl -> v2rholapl_a_a", backend); + interleave_kernel_input(npts, base_stack.v2rholapl_eval_device+1, 4, base_stack.v2rholapl_a_b_eval_device, 1, + "v2rholapl -> v2rholapl_a_b", backend); + interleave_kernel_input(npts, base_stack.v2rholapl_eval_device+2, 4, base_stack.v2rholapl_b_a_eval_device, 1, + "v2rholapl -> v2rholapl_b_a", backend); + interleave_kernel_input(npts, base_stack.v2rholapl_eval_device+3, 4, base_stack.v2rholapl_b_b_eval_device, 1, + "v2rholapl -> v2rholapl_b_b", backend); + interleave_kernel_input(npts, base_stack.v2gammalapl_eval_device+0, 6, base_stack.v2gammalapl_aa_a_eval_device, 1, + "v2gammalapl -> v2gammalapl_aa_a", backend); + interleave_kernel_input(npts, base_stack.v2gammalapl_eval_device+1, 6, base_stack.v2gammalapl_aa_b_eval_device, 1, + "v2gammalapl -> v2gammalapl_aa_b", backend); + interleave_kernel_input(npts, base_stack.v2gammalapl_eval_device+2, 6, 
base_stack.v2gammalapl_ab_a_eval_device, 1, + "v2gammalapl -> v2gammalapl_ab_a", backend); + interleave_kernel_input(npts, base_stack.v2gammalapl_eval_device+3, 6, base_stack.v2gammalapl_ab_b_eval_device, 1, + "v2gammalapl -> v2gammalapl_ab_b", backend); + interleave_kernel_input(npts, base_stack.v2gammalapl_eval_device+4, 6, base_stack.v2gammalapl_bb_a_eval_device, 1, + "v2gammalapl -> v2gammalapl_bb_a", backend); + interleave_kernel_input(npts, base_stack.v2gammalapl_eval_device+5, 6, base_stack.v2gammalapl_bb_b_eval_device, 1, + "v2gammalapl -> v2gammalapl_bb_b", backend); + interleave_kernel_input(npts, base_stack.v2lapl2_eval_device+0, 3, base_stack.v2lapl2_a_a_eval_device, 1, + "v2lapl2 -> v2lapl2_a_a", backend); + interleave_kernel_input(npts, base_stack.v2lapl2_eval_device+1, 3, base_stack.v2lapl2_a_b_eval_device, 1, + "v2lapl2 -> v2lapl2_a_b", backend); + interleave_kernel_input(npts, base_stack.v2lapl2_eval_device+2, 3, base_stack.v2lapl2_b_b_eval_device, 1, + "v2lapl2 -> v2lapl2_b_b", backend); + interleave_kernel_input(npts, base_stack.v2lapltau_eval_device+0, 4, base_stack.v2lapltau_a_a_eval_device, 1, + "v2lapltau -> v2lapltau_a_a", backend); + interleave_kernel_input(npts, base_stack.v2lapltau_eval_device+1, 4, base_stack.v2lapltau_a_b_eval_device, 1, + "v2lapltau -> v2lapltau_a_b", backend); + interleave_kernel_input(npts, base_stack.v2lapltau_eval_device+2, 4, base_stack.v2lapltau_b_a_eval_device, 1, + "v2lapltau -> v2lapltau_b_a", backend); + interleave_kernel_input(npts, base_stack.v2lapltau_eval_device+3, 4, base_stack.v2lapltau_b_b_eval_device, 1, + "v2lapltau -> v2lapltau_b_b", backend); + } +} + +template +void scale_lda_output(size_t npts, T& base_stack, DeviceBackend* backend, bool is_pol) { + hadamard_product(backend->master_blas_handle(), npts, 1, base_stack.weights_device, 1, + base_stack.eps_eval_device, 1); + if(is_pol) { + hadamard_product(backend->master_blas_handle(), npts, 1, base_stack.weights_device, 1, + base_stack.vrho_pos_eval_device, 1); + hadamard_product(backend->master_blas_handle(), npts, 1, base_stack.weights_device, 1, + base_stack.vrho_neg_eval_device, 1); + } else { + hadamard_product(backend->master_blas_handle(), npts, 1, base_stack.weights_device, 1, + base_stack.vrho_eval_device, 1); + } +} + +template +void scale_gga_output(size_t npts, T& base_stack, DeviceBackend* backend, bool is_pol) { + scale_lda_output(npts, base_stack, backend, is_pol); + if(is_pol) { + hadamard_product(backend->master_blas_handle(), npts, 1, base_stack.weights_device, 1, + base_stack.vgamma_pp_eval_device, 1); + hadamard_product(backend->master_blas_handle(), npts, 1, base_stack.weights_device, 1, + base_stack.vgamma_pm_eval_device, 1); + hadamard_product(backend->master_blas_handle(), npts, 1, base_stack.weights_device, 1, + base_stack.vgamma_mm_eval_device, 1); + } else { + hadamard_product(backend->master_blas_handle(), npts, 1, base_stack.weights_device, 1, + base_stack.vgamma_eval_device, 1); + } +} + +template +void scale_mgga_output(size_t npts, T& base_stack, DeviceBackend* backend, bool need_lapl, bool is_pol) { + scale_gga_output(npts, base_stack, backend, is_pol); + if(is_pol) { + hadamard_product(backend->master_blas_handle(), npts, 1, base_stack.weights_device, 1, + base_stack.vtau_pos_eval_device, 1); + hadamard_product(backend->master_blas_handle(), npts, 1, base_stack.weights_device, 1, + base_stack.vtau_neg_eval_device, 1); + if(need_lapl) { + hadamard_product(backend->master_blas_handle(), npts, 1, base_stack.weights_device, 1, + 
base_stack.vlapl_pos_eval_device, 1); + hadamard_product(backend->master_blas_handle(), npts, 1, base_stack.weights_device, 1, + base_stack.vlapl_neg_eval_device, 1); + } + } else { + hadamard_product(backend->master_blas_handle(), npts, 1, base_stack.weights_device, 1, + base_stack.vtau_eval_device, 1); + if(need_lapl) { + hadamard_product(backend->master_blas_handle(), npts, 1, base_stack.weights_device, 1, + base_stack.vlapl_eval_device, 1); + } + } +} + + +void AoSScheme1Base::eval_kern_exc_vxc_lda( const functional_type& func, + XCDeviceData* _data ) { + + auto* data = dynamic_cast(_data); + if( !data ) GAUXC_BAD_LWD_DATA_CAST(); + + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); + + if( !func.is_lda() ) GAUXC_GENERIC_EXCEPTION("XC Kernel not LDA!"); + + auto base_stack = data->base_stack; + + const bool is_RKS = data->allocated_terms.ks_scheme == RKS; + const bool is_UKS = data->allocated_terms.ks_scheme == UKS; + const bool is_GKS = data->allocated_terms.ks_scheme == GKS; + const bool is_pol = is_UKS or is_GKS; + const bool is_excgrad = data->allocated_terms.exc_grad; + + const size_t npts = data->total_npts_task_batch ; + + auto* den_eval_ptr = base_stack.den_s_eval_device; + + if ( is_pol ) { + den_eval_ptr = base_stack.den_interleaved_device; + interleave_lda_input(npts, base_stack, data->device_backend_); + } + + GauXC::eval_kern_exc_vxc_lda( func, npts, + den_eval_ptr, base_stack.eps_eval_device, + base_stack.vrho_eval_device, data->device_backend_->queue() ); + + if(is_pol) deinterleave_lda_output(npts, base_stack, data->device_backend_); + scale_lda_output(npts, base_stack, data->device_backend_, is_pol); + + data->device_backend_->check_error("exc_vxc lda" __FILE__ ": " + std::to_string(__LINE__)); +} + + +void AoSScheme1Base::eval_kern_exc_vxc_gga( const functional_type& func, + XCDeviceData* _data ) { + + auto* data = dynamic_cast(_data); + if( !data ) GAUXC_BAD_LWD_DATA_CAST(); + + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); + + if( !func.is_gga() ) GAUXC_GENERIC_EXCEPTION("XC Kernel not GGA!"); + + auto base_stack = data->base_stack; + double* den_eval_ptr = base_stack.den_s_eval_device; + + const bool is_RKS = data->allocated_terms.ks_scheme == RKS; + const bool is_UKS = data->allocated_terms.ks_scheme == UKS; + const bool is_GKS = data->allocated_terms.ks_scheme == GKS; + const bool is_pol = is_UKS or is_GKS; + const bool is_excgrad = data->allocated_terms.exc_grad; + + const size_t npts = data->total_npts_task_batch ; + + if(is_pol) { + den_eval_ptr = base_stack.den_interleaved_device; + interleave_gga_input(npts, base_stack, data->device_backend_); + } + + GauXC::eval_kern_exc_vxc_gga( func, data->total_npts_task_batch, + den_eval_ptr, base_stack.gamma_eval_device, + base_stack.eps_eval_device, base_stack.vrho_eval_device, + base_stack.vgamma_eval_device, data->device_backend_->queue() ); + + if(is_pol) deinterleave_gga_output(npts, base_stack, data->device_backend_); + scale_gga_output(npts, base_stack, data->device_backend_, is_pol); + + data->device_backend_->check_error("exc_vxc gga" __FILE__ ": " + std::to_string(__LINE__)); +} + + +void AoSScheme1Base::eval_kern_exc_vxc_mgga( const functional_type& func, + XCDeviceData* _data ) { + + auto* data = dynamic_cast(_data); + if( !data ) GAUXC_BAD_LWD_DATA_CAST(); + + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); + + if( !func.is_mgga() ) GAUXC_GENERIC_EXCEPTION("XC Kernel not MGGA!"); + + auto base_stack = data->base_stack; + double* 
den_eval_ptr = base_stack.den_s_eval_device; + double* tau_eval_ptr = base_stack.tau_s_eval_device; + double* lapl_eval_ptr = base_stack.lapl_s_eval_device; + + const bool is_RKS = data->allocated_terms.ks_scheme == RKS; + const bool is_UKS = data->allocated_terms.ks_scheme == UKS; + const bool is_GKS = data->allocated_terms.ks_scheme == GKS; + const bool is_pol = is_UKS or is_GKS; + const bool is_excgrad = data->allocated_terms.exc_grad; + + const size_t npts = data->total_npts_task_batch ; + + if(is_pol) { + den_eval_ptr = base_stack.den_interleaved_device; + tau_eval_ptr = base_stack.tau_interleaved_device; + lapl_eval_ptr = base_stack.lapl_interleaved_device; + interleave_mgga_input(npts, base_stack, data->device_backend_, func.needs_laplacian()); + } GauXC::eval_kern_exc_vxc_mgga( func, data->total_npts_task_batch, - base_stack.den_s_eval_device, base_stack.gamma_eval_device, - base_stack.tau_eval_device, base_stack.den_lapl_eval_device, + den_eval_ptr, base_stack.gamma_eval_device, + tau_eval_ptr, lapl_eval_ptr, base_stack.eps_eval_device, base_stack.vrho_eval_device, base_stack.vgamma_eval_device, base_stack.vtau_eval_device, base_stack.vlapl_eval_device, data->device_backend_->queue() ); + + if(is_pol) deinterleave_mgga_output(npts, base_stack, data->device_backend_, func.needs_laplacian()); + scale_mgga_output(npts, base_stack, data->device_backend_, func.needs_laplacian(), is_pol); + data->device_backend_->check_error("exc_vxc mgga" __FILE__ ": " + std::to_string(__LINE__)); +} + + +void AoSScheme1Base::eval_kern_vxc_fxc_lda( const functional_type& func, + XCDeviceData* _data ) { + + auto* data = dynamic_cast(_data); + if( !data ) GAUXC_BAD_LWD_DATA_CAST(); + + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); - hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, - base_stack.weights_device, 1, base_stack.eps_eval_device, 1 ); - hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, - base_stack.weights_device, 1, base_stack.vrho_eval_device, 1 ); - hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, - base_stack.weights_device, 1, base_stack.vgamma_eval_device, 1 ); - hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, - base_stack.weights_device, 1, base_stack.vtau_eval_device, 1 ); - if(func.needs_laplacian()) { - hadamard_product( data->device_backend_->master_blas_handle(), data->total_npts_task_batch, 1, - base_stack.weights_device, 1, base_stack.vlapl_eval_device, 1 ); + if( !func.is_lda() ) GAUXC_GENERIC_EXCEPTION("XC Kernel not LDA!"); + + auto base_stack = data->base_stack; + + const bool is_UKS = data->allocated_terms.ks_scheme == UKS; + const bool is_GKS = data->allocated_terms.ks_scheme == GKS; + const bool is_pol = is_UKS or is_GKS; + + const size_t npts = data->total_npts_task_batch ; + + auto* den_eval_ptr = base_stack.den_s_eval_device; + + if ( is_pol ) { + den_eval_ptr = base_stack.den_interleaved_device; + interleave_lda_input(npts, base_stack, data->device_backend_); } + GauXC::eval_kern_vxc_fxc_lda( func, npts, + den_eval_ptr, base_stack.vrho_eval_device, + base_stack.v2rho2_eval_device, data->device_backend_->queue() ); + + if(is_pol) deinterleave_vxc_fxc_lda(npts, base_stack, data->device_backend_); + // For 2nd derivative, we do not scale the output + // We will multiply it with the weights to the intermediate outputs A, B, C - 
data->device_backend_->check_error("exc_vxc mgga" __FILE__ ": " + std::to_string(__LINE__)); + data->device_backend_->check_error("exc_vxc_fxc lda" __FILE__ ": " + std::to_string(__LINE__)); } +void AoSScheme1Base::eval_kern_vxc_fxc_gga( const functional_type& func, + XCDeviceData* _data ) { + auto* data = dynamic_cast(_data); + if( !data ) GAUXC_BAD_LWD_DATA_CAST(); + + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); + if( !func.is_gga() ) GAUXC_GENERIC_EXCEPTION("XC Kernel not GGA!"); + + auto base_stack = data->base_stack; + double* den_eval_ptr = base_stack.den_s_eval_device; + + const bool is_UKS = data->allocated_terms.ks_scheme == UKS; + const bool is_GKS = data->allocated_terms.ks_scheme == GKS; + const bool is_pol = is_UKS or is_GKS; + const size_t npts = data->total_npts_task_batch ; + + if(is_pol) { + den_eval_ptr = base_stack.den_interleaved_device; + interleave_gga_input(npts, base_stack, data->device_backend_); + } + GauXC::eval_kern_vxc_fxc_gga( func, npts, + den_eval_ptr, base_stack.gamma_eval_device, + base_stack.vrho_eval_device, base_stack.vgamma_eval_device, + base_stack.v2rho2_eval_device, base_stack.v2rhogamma_eval_device, base_stack.v2gamma2_eval_device, + data->device_backend_->queue() ); + if(is_pol) deinterleave_vxc_fxc_gga(npts, base_stack, data->device_backend_); + + // For 2nd derivative, we do not scale the output + // We will multiply it with the weights to the intermediate outputs A, B, C -void AoSScheme1Base::eval_xmat( double fac, XCDeviceData* _data, bool do_grad, density_id den_select ){ + + data->device_backend_->check_error("exc_vxc_fxc gga" __FILE__ ": " + std::to_string(__LINE__)); +} + + +void AoSScheme1Base::eval_kern_vxc_fxc_mgga( const functional_type& func, + XCDeviceData* _data ) { + + auto* data = dynamic_cast(_data); + if( !data ) GAUXC_BAD_LWD_DATA_CAST(); + + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); + + if( !func.is_mgga() ) GAUXC_GENERIC_EXCEPTION("XC Kernel not MGGA!"); + + auto base_stack = data->base_stack; + double* den_eval_ptr = base_stack.den_s_eval_device; + double* tau_eval_ptr = base_stack.tau_s_eval_device; + double* lapl_eval_ptr = base_stack.lapl_s_eval_device; + + const bool is_UKS = data->allocated_terms.ks_scheme == UKS; + const bool is_GKS = data->allocated_terms.ks_scheme == GKS; + const bool is_pol = is_UKS or is_GKS; + + const size_t npts = data->total_npts_task_batch ; + + if(is_pol) { + den_eval_ptr = base_stack.den_interleaved_device; + tau_eval_ptr = base_stack.tau_interleaved_device; + lapl_eval_ptr = base_stack.lapl_interleaved_device; + interleave_mgga_input(npts, base_stack, data->device_backend_, func.needs_laplacian()); + } + + GauXC::eval_kern_vxc_fxc_mgga( func, npts, + den_eval_ptr, base_stack.gamma_eval_device, + lapl_eval_ptr, tau_eval_ptr, + base_stack.vrho_eval_device, base_stack.vgamma_eval_device, + base_stack.vlapl_eval_device, base_stack.vtau_eval_device, + base_stack.v2rho2_eval_device, base_stack.v2rhogamma_eval_device, + base_stack.v2rholapl_eval_device, base_stack.v2rhotau_eval_device, + base_stack.v2gamma2_eval_device, base_stack.v2gammalapl_eval_device, + base_stack.v2gammatau_eval_device, base_stack.v2lapl2_eval_device, + base_stack.v2lapltau_eval_device, base_stack.v2tau2_eval_device, + data->device_backend_->queue() ); + + if(is_pol) deinterleave_vxc_fxc_mgga(npts, base_stack, data->device_backend_, func.needs_laplacian()); + + // For 2nd derivative, we do not scale the output + // We will multiply it with the weights to the intermediate 
outputs A, B, C + + data->device_backend_->check_error("exc_vxc_fxc mgga" __FILE__ ": " + std::to_string(__LINE__)); +} + +template <bool is_trial> +void AoSScheme1Base::eval_xmat_impl( double fac, XCDeviceData* _data, bool do_grad, density_id den_select ){ auto* data = dynamic_cast(_data); if( !data ) GAUXC_BAD_LWD_DATA_CAST(); @@ -849,22 +1616,12 @@ void AoSScheme1Base::eval_xmat( double fac, XCDeviceData* _data, bool do_grad, d const auto submat_block_size = data->get_submat_chunk_size( nbf, 0 ); auto static_stack = data->static_stack; auto aos_stack = data->aos_stack; - double* dmat_ptr = nullptr; - switch ( den_select ) { - case DEN_S: - dmat_ptr = static_stack.dmat_s_device; - break; - case DEN_Z: - dmat_ptr = static_stack.dmat_z_device; - break; - case DEN_X: - dmat_ptr = static_stack.dmat_x_device; - break; - case DEN_Y: - dmat_ptr = static_stack.dmat_y_device; - break; - default: - GAUXC_GENERIC_EXCEPTION("eval_xmat: den_select not set"); + double * dmat_ptr; + if constexpr (is_trial) { + dmat_ptr = static_stack.tden_selector(den_select); + // the screened trial density matrix is now stored in aos_stack.device_tasks[itask].nbe_scr + } else { + dmat_ptr = static_stack.den_selector(den_select); } // Pack density matrix @@ -900,20 +1657,59 @@ void AoSScheme1Base::eval_xmat( double fac, XCDeviceData* _data, bool do_grad, d } - data->device_backend_->check_error("xmat" __FILE__ ": " + std::to_string(__LINE__)); + data->device_backend_->check_error("xmat impl" __FILE__ ": " + std::to_string(__LINE__)); // Record completion of BLAS ops on master stream data->device_backend_->sync_master_with_blas_pool(); } +void AoSScheme1Base::eval_xmat( double fac, XCDeviceData* _data, bool do_grad, density_id den_select ){ + eval_xmat_impl<false>(fac, _data, do_grad, den_select); +} +void AoSScheme1Base::eval_xmat_trial( double fac, XCDeviceData* _data, bool do_grad, density_id den_select ){ + eval_xmat_impl<true>(fac, _data, do_grad, den_select); +} + +void AoSScheme1Base::save_xmat( XCDeviceData* _data, bool do_grad, density_id den_select ){ + + auto* data = dynamic_cast(_data); + if( !data ) GAUXC_BAD_LWD_DATA_CAST(); + + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); + auto backend = data->device_backend_; + auto aos_stack = data->aos_stack; + const auto sz = data->total_nbe_bfn_npts_task_batch; + + switch(den_select) { + case DEN_S: + backend->copy_async(sz, aos_stack.zmat_vxc_device, aos_stack.xmatS_device, "xmatS copy"); + if(do_grad) { + backend->copy_async(sz, aos_stack.xmat_dx_device, aos_stack.xmatS_dx_device, "xmatS_dx copy"); + backend->copy_async(sz, aos_stack.xmat_dy_device, aos_stack.xmatS_dy_device, "xmatS_dy copy"); + backend->copy_async(sz, aos_stack.xmat_dz_device, aos_stack.xmatS_dz_device, "xmatS_dz copy"); + } + break; + case DEN_Z: + backend->copy_async(sz, aos_stack.zmat_vxc_device, aos_stack.xmatZ_device, "xmatZ copy"); + if(do_grad) { + backend->copy_async(sz, aos_stack.xmat_dx_device, aos_stack.xmatZ_dx_device, "xmatZ_dx copy"); + backend->copy_async(sz, aos_stack.xmat_dy_device, aos_stack.xmatZ_dy_device, "xmatZ_dy copy"); + backend->copy_async(sz, aos_stack.xmat_dz_device, aos_stack.xmatZ_dz_device, "xmatZ_dz copy"); + } + break; + default: + GAUXC_GENERIC_EXCEPTION("Save XMat + GKS NYI"); + } +} -void AoSScheme1Base::inc_vxc( XCDeviceData* _data, density_id den_selector, bool do_m ){ +template <bool is_fxc> +void AoSScheme1Base::inc_potential_impl( XCDeviceData* _data, density_id den_selector, bool do_m ){ auto* data = dynamic_cast(_data); if( !data ) GAUXC_BAD_LWD_DATA_CAST(); @@ -952,28
+1748,30 @@ void AoSScheme1Base::inc_vxc( XCDeviceData* _data, density_id den_selector, bool const auto submat_block_size = data->get_submat_chunk_size( nbf, 0 ); auto static_stack = data->static_stack; auto aos_stack = data->aos_stack; - double* vxc_ptr = nullptr; - switch( den_selector ) { - case DEN_S: - vxc_ptr = static_stack.vxc_s_device; - break; - case DEN_Z: - vxc_ptr = static_stack.vxc_z_device; - break; - case DEN_Y: - vxc_ptr = static_stack.vxc_y_device; - break; - case DEN_X: - vxc_ptr = static_stack.vxc_x_device; - break; - default: - GAUXC_GENERIC_EXCEPTION( "inc_vxc called with invalid density selected" ); + + double* potential_ptr; + if constexpr (is_fxc) { + potential_ptr = static_stack.fxc_selector(den_selector); + // cutlass_stack.vmat_array_device points to aos_stack.device_tasks[itask].nbe_scr + } else { + potential_ptr = static_stack.vxc_selector(den_selector); } + + auto vxc_ptr = static_stack.vxc_selector(den_selector); sym_task_inc_potential( ntasks, aos_stack.device_tasks, - vxc_ptr, nbf, submat_block_size, + potential_ptr, nbf, submat_block_size, data->device_backend_->queue() ); - data->device_backend_->check_error("inc_vxc" __FILE__ ": " + std::to_string(__LINE__)); + data->device_backend_->check_error("inc_potential_ptr" __FILE__ ": " + std::to_string(__LINE__)); +} + + +void AoSScheme1Base::inc_vxc( XCDeviceData* _data, density_id den_selector, bool do_m ){ + inc_potential_impl<false>(_data, den_selector, do_m); +} + +void AoSScheme1Base::inc_fxc( XCDeviceData* _data, density_id den_selector, bool do_m ){ + inc_potential_impl<true>(_data, den_selector, do_m); } @@ -1019,10 +1817,43 @@ void AoSScheme1Base::symmetrize_vxc( XCDeviceData* _data, density_id den_selecto data->device_backend_->check_error("symmetrize vxc" __FILE__ ": " + std::to_string(__LINE__)); } +void AoSScheme1Base::symmetrize_fxc( XCDeviceData* _data, density_id den_selector) { + auto* data = dynamic_cast(_data); + if( !data ) GAUXC_BAD_LWD_DATA_CAST(); + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); -void AoSScheme1Base::inc_exc_grad_lda( XCDeviceData* _data ) { + const auto nbf = data->global_dims.nbf; + auto static_stack = data->static_stack; + switch ( den_selector ) { + case DEN_S: + symmetrize_matrix( nbf, static_stack.fxc_s_device, nbf, + data->device_backend_->queue() ); + break; + case DEN_Z: + symmetrize_matrix( nbf, static_stack.fxc_z_device, nbf, + data->device_backend_->queue() ); + break; + case DEN_Y: + symmetrize_matrix( nbf, static_stack.fxc_y_device, nbf, + data->device_backend_->queue() ); + break; + case DEN_X: + symmetrize_matrix( nbf, static_stack.fxc_x_device, nbf, + data->device_backend_->queue() ); + break; + default: + GAUXC_GENERIC_EXCEPTION( "symmetrize_fxc: invalid density selected" ); + } + + data->device_backend_->check_error("symmetrize fxc" __FILE__ ": " + std::to_string(__LINE__)); +} + + + + +void AoSScheme1Base::inc_exc_grad_lda( XCDeviceData* _data, integrator_ks_scheme ks_scheme, bool with_weight_derivatives ) { #ifdef GAUXC_HAS_HIP GAUXC_GENERIC_EXCEPTION("LDA Grad NYI for HIP Backends"); #else @@ -1032,15 +1863,16 @@ void AoSScheme1Base::inc_exc_grad_lda( XCDeviceData* _data ) { if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); const auto nshell = data->global_dims.nshells; - increment_exc_grad_lda( nshell, + increment_exc_grad_lda( ks_scheme, nshell, data->shell_to_task_stack.shell_to_task_device, data->aos_stack.device_tasks, data->static_stack.exc_grad_device, + with_weight_derivatives, data->device_backend_->queue() );
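For reference, the polarized (UKS/GKS) paths in the eval_kern_exc_vxc_* and eval_kern_vxc_fxc_* wrappers above route the densities through interleave_*_input / deinterleave_*_output helpers around the XC kernel calls. The sketch below is purely illustrative and separate from the patch content: it shows a plausible host-side version of such an interleaved layout, packing alpha/beta values point-by-point as (rho_a[i], rho_b[i]); the actual GauXC device helpers and their buffer layout are not reproduced here.

```cpp
#include <cstddef>

// Illustrative host-side sketch (not the GauXC device kernels): pack separate
// alpha/beta density arrays into the interleaved (rho_a[i], rho_b[i]) layout a
// polarized XC kernel typically expects, and unpack the interleaved vrho output.
void interleave_density(std::size_t npts, const double* rho_a, const double* rho_b,
                        double* rho_interleaved) {
  for (std::size_t i = 0; i < npts; ++i) {
    rho_interleaved[2 * i + 0] = rho_a[i];
    rho_interleaved[2 * i + 1] = rho_b[i];
  }
}

void deinterleave_vrho(std::size_t npts, const double* vrho_interleaved,
                       double* vrho_a, double* vrho_b) {
  for (std::size_t i = 0; i < npts; ++i) {
    vrho_a[i] = vrho_interleaved[2 * i + 0];
    vrho_b[i] = vrho_interleaved[2 * i + 1];
  }
}
```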
#endif } -void AoSScheme1Base::inc_exc_grad_gga( XCDeviceData* _data ) { +void AoSScheme1Base::inc_exc_grad_gga( XCDeviceData* _data, integrator_ks_scheme ks_scheme, bool with_weight_derivatives ) { #ifdef GAUXC_HAS_HIP GAUXC_GENERIC_EXCEPTION("GGA Grad NYI for HIP Backends"); #else @@ -1050,10 +1882,30 @@ void AoSScheme1Base::inc_exc_grad_gga( XCDeviceData* _data ) { if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); const auto nshell = data->global_dims.nshells; - increment_exc_grad_gga( nshell, + increment_exc_grad_gga( ks_scheme, nshell, data->shell_to_task_stack.shell_to_task_device, data->aos_stack.device_tasks, data->static_stack.exc_grad_device, + with_weight_derivatives, + data->device_backend_->queue() ); +#endif +} + +void AoSScheme1Base::inc_exc_grad_mgga( XCDeviceData* _data, integrator_ks_scheme ks_scheme, bool need_lapl, bool with_weight_derivatives ) { +#ifdef GAUXC_HAS_HIP + GAUXC_GENERIC_EXCEPTION("MGGA Grad NYI for HIP Backends"); +#else + auto* data = dynamic_cast(_data); + if( !data ) GAUXC_BAD_LWD_DATA_CAST(); + + if( not data->device_backend_ ) GAUXC_UNINITIALIZED_DEVICE_BACKEND(); + + const auto nshell = data->global_dims.nshells; + increment_exc_grad_mgga( ks_scheme, nshell, need_lapl, + data->shell_to_task_stack.shell_to_task_device, + data->aos_stack.device_tasks, + data->static_stack.exc_grad_device, + with_weight_derivatives, data->device_backend_->queue() ); #endif } @@ -1119,15 +1971,23 @@ void AoSScheme1Base::eval_exx_gmat( XCDeviceData* _data, // XXX: Need to add screening capabilities, packing etc //const auto nbf = data->global_dims.nbf; - // XXX: Need to add support for non-cartesian functions - for( auto i = 0ul; i < nshells; ++i ) { - if( basis_map.shell_pure(i) ) - GAUXC_GENERIC_EXCEPTION("GPU EXX + Spherical NYI"); - } if( basis_map.max_l() > 2 ) { GAUXC_GENERIC_EXCEPTION("GPU EXX + L>2 NYI"); } + + // Determine purity of shell types + std::vector sph_am(basis_map.max_l()+1); + for( auto i = 0ul; i < nshells; ++i ) { + sph_am[basis_map.shell_l(i)] = sph_am[basis_map.shell_l(i)] | basis_map.shell_pure(i); + } + + // Sanity Check + for( auto i = 0ul; i < nshells; ++i ) { + if(basis_map.shell_pure(i) != sph_am[basis_map.shell_l(i)]) + GAUXC_GENERIC_EXCEPTION("GPU EXX requires all shells of the same angular momentum to have the same purity"); + } + // Zero out G for( auto& task : tasks ) { @@ -1167,7 +2027,7 @@ void AoSScheme1Base::eval_exx_gmat( XCDeviceData* _data, data->device_backend_->check_error("integral_0_task_batched" __FILE__ ": " + std::to_string(__LINE__)); if(basis_map.max_l() > 0) { XGPU::integral_1_task_batched( - tasks.size(), data->subtask.size(), + sph_am[1], tasks.size(), data->subtask.size(), data->l_batch_diag_task_to_shell_pair_device[1].max_prim_pairs, 0, data->aos_stack.device_tasks, data->l_batch_diag_task_to_shell_pair_device[1].task_to_shell_pair_device, @@ -1183,7 +2043,7 @@ void AoSScheme1Base::eval_exx_gmat( XCDeviceData* _data, } if(basis_map.max_l() > 1) { XGPU::integral_2_task_batched( - tasks.size(), data->subtask.size(), + sph_am[2], tasks.size(), data->subtask.size(), data->l_batch_diag_task_to_shell_pair_device[2].max_prim_pairs, 0, data->aos_stack.device_tasks, data->l_batch_diag_task_to_shell_pair_device[2].task_to_shell_pair_device, @@ -1217,7 +2077,7 @@ void AoSScheme1Base::eval_exx_gmat( XCDeviceData* _data, if(basis_map.max_l() > 0) { XGPU::integral_1_1_task_batched( - tasks.size(), data->subtask.size(), + sph_am[1], tasks.size(), data->subtask.size(), 
data->l_batch_task_to_shell_pair_device[SP_LBATCH_IDX(1,1)].max_prim_pairs, 0, data->aos_stack.device_tasks, data->l_batch_task_to_shell_pair_device[SP_LBATCH_IDX(1,1)].task_to_shell_pair_device, @@ -1234,7 +2094,7 @@ void AoSScheme1Base::eval_exx_gmat( XCDeviceData* _data, if(basis_map.max_l() > 1) { XGPU::integral_2_2_task_batched( - tasks.size(), data->subtask.size(), + sph_am[2], tasks.size(), data->subtask.size(), data->l_batch_task_to_shell_pair_device[SP_LBATCH_IDX(2,2)].max_prim_pairs, 0, data->aos_stack.device_tasks, data->l_batch_task_to_shell_pair_device[SP_LBATCH_IDX(2,2)].task_to_shell_pair_device, @@ -1250,7 +2110,7 @@ void AoSScheme1Base::eval_exx_gmat( XCDeviceData* _data, } if(basis_map.max_l() > 0) { - XGPU::integral_1_0_task_batched( true, + XGPU::integral_1_0_task_batched( true, sph_am[1], tasks.size(), data->subtask.size(), data->l_batch_task_to_shell_pair_device[SP_LBATCH_IDX(0,1)].max_prim_pairs, 0, data->aos_stack.device_tasks, @@ -1267,7 +2127,7 @@ void AoSScheme1Base::eval_exx_gmat( XCDeviceData* _data, } if(basis_map.max_l() > 0) { - XGPU::integral_1_0_task_batched( false, + XGPU::integral_1_0_task_batched( false, sph_am[1], tasks.size(), data->subtask.size(), data->l_batch_task_to_shell_pair_device[SP_LBATCH_IDX(1,0)].max_prim_pairs, 0, data->aos_stack.device_tasks, @@ -1284,7 +2144,7 @@ void AoSScheme1Base::eval_exx_gmat( XCDeviceData* _data, } if(basis_map.max_l() > 1) { - XGPU::integral_2_0_task_batched( true, + XGPU::integral_2_0_task_batched( true, sph_am[2], tasks.size(), data->subtask.size(), data->l_batch_task_to_shell_pair_device[SP_LBATCH_IDX(0,2)].max_prim_pairs, 0, data->aos_stack.device_tasks, @@ -1301,7 +2161,7 @@ void AoSScheme1Base::eval_exx_gmat( XCDeviceData* _data, } if(basis_map.max_l() > 1) { - XGPU::integral_2_0_task_batched( false, + XGPU::integral_2_0_task_batched( false, sph_am[2], tasks.size(), data->subtask.size(), data->l_batch_task_to_shell_pair_device[SP_LBATCH_IDX(2,0)].max_prim_pairs, 0, data->aos_stack.device_tasks, @@ -1318,7 +2178,7 @@ void AoSScheme1Base::eval_exx_gmat( XCDeviceData* _data, } if(basis_map.max_l() > 1) { - XGPU::integral_2_1_task_batched( true, + XGPU::integral_2_1_task_batched( true, sph_am[2], sph_am[1], tasks.size(), data->subtask.size(), data->l_batch_task_to_shell_pair_device[SP_LBATCH_IDX(1,2)].max_prim_pairs, 0, data->aos_stack.device_tasks, @@ -1335,7 +2195,7 @@ void AoSScheme1Base::eval_exx_gmat( XCDeviceData* _data, } if(basis_map.max_l() > 1) { - XGPU::integral_2_1_task_batched( false, + XGPU::integral_2_1_task_batched( false, sph_am[2], sph_am[1], tasks.size(), data->subtask.size(), data->l_batch_task_to_shell_pair_device[SP_LBATCH_IDX(2,1)].max_prim_pairs, 0, data->aos_stack.device_tasks, diff --git a/src/xc_integrator/local_work_driver/device/scheme1_base.hpp b/src/xc_integrator/local_work_driver/device/scheme1_base.hpp index 37964914..6a04d436 100644 --- a/src/xc_integrator/local_work_driver/device/scheme1_base.hpp +++ b/src/xc_integrator/local_work_driver/device/scheme1_base.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
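The eval_exx_gmat changes above replace the blanket rejection of spherical shells with a per-angular-momentum purity flag (sph_am) that is forwarded to the batched integral kernels, plus a sanity check that every shell of a given angular momentum shares the same purity. A minimal self-contained sketch of that consistency check, using a hypothetical ShellInfo stand-in instead of BasisSetMap, is:

```cpp
#include <stdexcept>
#include <vector>

struct ShellInfo { int l; bool pure; };  // hypothetical stand-in for BasisSetMap queries

// Record whether any shell of a given l is spherical, then require that every
// shell of that l agrees, mirroring the sph_am bookkeeping in eval_exx_gmat.
void check_uniform_purity(const std::vector<ShellInfo>& shells, int max_l) {
  std::vector<bool> sph_am(max_l + 1, false);
  for (const auto& sh : shells) sph_am[sh.l] = sph_am[sh.l] || sh.pure;
  for (const auto& sh : shells)
    if (sh.pure != static_cast<bool>(sph_am[sh.l]))
      throw std::runtime_error(
        "GPU EXX requires all shells of the same angular momentum to have the same purity");
}
```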
* * See LICENSE.txt for details */ @@ -18,25 +22,46 @@ struct AoSScheme1Base : public detail::LocalDeviceWorkDriverPIMPL { void eval_collocation_gradient( XCDeviceData* ) override final; void eval_collocation_hessian( XCDeviceData* ) override final; void eval_collocation_laplacian( XCDeviceData* ) override final; + void eval_collocation_lapgrad( XCDeviceData* ) override final; void eval_uvars_lda( XCDeviceData*, integrator_ks_scheme ) override final; void eval_uvars_gga( XCDeviceData*, integrator_ks_scheme ) override final; - void eval_uvars_mgga( XCDeviceData*, bool ) override final; - void eval_vvar( XCDeviceData*, density_id, bool ) override final; + void eval_uvars_mgga( XCDeviceData*, integrator_ks_scheme, bool ) override final; + void eval_vvars_lda ( XCDeviceData*, density_id ) override final; + void eval_vvars_gga ( XCDeviceData*, density_id ) override final; + void eval_vvars_mgga( XCDeviceData*, density_id, bool ) override final; + + void eval_tmat_lda( XCDeviceData*, integrator_ks_scheme ) override final; + void eval_tmat_gga( XCDeviceData*, integrator_ks_scheme ) override final; + void eval_tmat_mgga( XCDeviceData*, integrator_ks_scheme, bool ) override final; + void eval_vvars_lda_trial ( XCDeviceData*, density_id ) override final; + void eval_vvars_gga_trial ( XCDeviceData*, density_id ) override final; + void eval_vvars_mgga_trial( XCDeviceData*, density_id, bool ) override final; void eval_zmat_lda_vxc( XCDeviceData*, integrator_ks_scheme, density_id ) override final; void eval_zmat_gga_vxc( XCDeviceData*, integrator_ks_scheme, density_id ) override final; - void eval_zmat_mgga_vxc( XCDeviceData*, bool ) override final; - void eval_mmat_mgga_vxc( XCDeviceData*, bool ) override final; + void eval_zmat_mgga_vxc( XCDeviceData*, integrator_ks_scheme, bool, density_id ) override final; + void eval_mmat_mgga_vxc( XCDeviceData*, integrator_ks_scheme, bool, density_id ) override final; + + void eval_zmat_lda_fxc( XCDeviceData*, density_id ) override final; + void eval_zmat_gga_fxc( XCDeviceData*, density_id ) override final; + void eval_zmat_mgga_fxc( XCDeviceData*, bool, density_id ) override final; + void eval_mmat_mgga_fxc( XCDeviceData*, bool, density_id ) override final; void eval_kern_exc_vxc_lda( const functional_type&, XCDeviceData* ) override final; void eval_kern_exc_vxc_gga( const functional_type&, XCDeviceData* ) override final; void eval_kern_exc_vxc_mgga( const functional_type&, XCDeviceData* ) override final; + void eval_kern_vxc_fxc_lda( const functional_type&, XCDeviceData* ) override final; + void eval_kern_vxc_fxc_gga( const functional_type&, XCDeviceData* ) override final; + void eval_kern_vxc_fxc_mgga( const functional_type&, XCDeviceData* ) override final; + void inc_exc( XCDeviceData* ) override final; void inc_nel( XCDeviceData* ) override final; - void inc_exc_grad_lda( XCDeviceData* ) override final; - void inc_exc_grad_gga( XCDeviceData* ) override final; + void inc_exc_grad_lda( XCDeviceData*, integrator_ks_scheme, bool ) override final; + void inc_exc_grad_gga( XCDeviceData*, integrator_ks_scheme, bool ) override final; + void inc_exc_grad_mgga( XCDeviceData*, integrator_ks_scheme , bool, bool ) override final; void symmetrize_vxc( XCDeviceData* , density_id) override final; + void symmetrize_fxc( XCDeviceData* , density_id) override final; void symmetrize_exx_k( XCDeviceData* ) override final; //void eval_exx_gmat( XCDeviceData* ) override final; void eval_exx_gmat( XCDeviceData*, const BasisSetMap& ) override final; @@ -46,11 +71,19 @@ struct 
AoSScheme1Base : public detail::LocalDeviceWorkDriverPIMPL { XCDeviceData*, host_task_iterator, host_task_iterator, const ShellPairCollection& ) override final; + void save_xmat( XCDeviceData*, bool do_grad, density_id den ) override final; + // Overridable APIs + template + void eval_xmat_impl(double fac, XCDeviceData*, bool do_grad, density_id ); + template + void inc_potential_impl(XCDeviceData*, density_id, bool do_m); virtual void eval_xmat( double fac, XCDeviceData*, bool , density_id ) override; + virtual void eval_xmat_trial( double fac, XCDeviceData*, bool , density_id ) override; virtual void eval_exx_fmat( XCDeviceData* ) override; virtual void inc_vxc( XCDeviceData*, density_id, bool ) override; + virtual void inc_fxc( XCDeviceData*, density_id, bool ) override; virtual void inc_exx_k( XCDeviceData* ) override; diff --git a/src/xc_integrator/local_work_driver/device/scheme1_data_base.cxx b/src/xc_integrator/local_work_driver/device/scheme1_data_base.cxx index 414218e0..7818a5a8 100644 --- a/src/xc_integrator/local_work_driver/device/scheme1_data_base.cxx +++ b/src/xc_integrator/local_work_driver/device/scheme1_data_base.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -53,7 +57,8 @@ size_t Scheme1DataBase::get_static_mem_requirement() { // Task Map nsp * sizeof(int32_t) + // nprim_pairs nsp * sizeof(shell_pair*) + // shell_pair pointer - nsp * 3 * sizeof(double); // X_AB, Y_AB, Z_AB + nsp * 3 * sizeof(double) + // X_AB, Y_AB, Z_AB + 1024 * 1024; // additional memory for alignment padding return size; } @@ -849,7 +854,7 @@ void Scheme1DataBase::add_extra_to_indirection( } } - if( terms.exx or terms.exc_vxc or terms.exc_grad or terms.den or terms.exx_ek_screening ) { + if( terms.exx or terms.exc_vxc or terms.exc_grad or terms.den or terms.exx_ek_screening or terms.fxc_contraction ) { const size_t total_nshells_bfn = total_nshells_bfn_task_batch * sizeof(size_t); buffer_adaptor shell_list_bfn_mem( collocation_stack.shell_list_device, total_nshells_bfn ); diff --git a/src/xc_integrator/local_work_driver/device/scheme1_data_base.hpp b/src/xc_integrator/local_work_driver/device/scheme1_data_base.hpp index cc8aea6e..870cc6e0 100644 --- a/src/xc_integrator/local_work_driver/device/scheme1_data_base.hpp +++ b/src/xc_integrator/local_work_driver/device/scheme1_data_base.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
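The eval_xmat_impl / inc_potential_impl templates declared above merge the ground-state and trial-density (and VXC versus FXC) code paths into a single implementation that picks the relevant device pointer at compile time. A self-contained sketch of that if constexpr dispatch pattern, with simplified stand-in types rather than GauXC's static_stack, is:

```cpp
// Simplified stand-in for the device stack; the real members (dmat_*_device,
// den_selector, tden_selector, ...) live in GauXC's static_stack.
struct FakeStack {
  double* dmat  = nullptr;  // ground-state density matrix
  double* tdmat = nullptr;  // trial density matrix
};

// Compile-time selection between the two paths, as in eval_xmat_impl<is_trial>.
template <bool is_trial>
double* select_density(FakeStack& stack) {
  if constexpr (is_trial) return stack.tdmat;  // trial path (response / FXC)
  else                    return stack.dmat;   // ground-state path (VXC)
}

// Thin non-template wrappers mirror the eval_xmat / eval_xmat_trial pair.
double* select_density_ground(FakeStack& s) { return select_density<false>(s); }
double* select_density_trial (FakeStack& s) { return select_density<true>(s); }
```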
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/scheme1_magma_base.cxx b/src/xc_integrator/local_work_driver/device/scheme1_magma_base.cxx index 8c3c15cc..095564f4 100644 --- a/src/xc_integrator/local_work_driver/device/scheme1_magma_base.cxx +++ b/src/xc_integrator/local_work_driver/device/scheme1_magma_base.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/scheme1_magma_base.hpp b/src/xc_integrator/local_work_driver/device/scheme1_magma_base.hpp index 99dde319..21242a40 100644 --- a/src/xc_integrator/local_work_driver/device/scheme1_magma_base.hpp +++ b/src/xc_integrator/local_work_driver/device/scheme1_magma_base.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/device/scheme1_magma_data_base.cxx b/src/xc_integrator/local_work_driver/device/scheme1_magma_data_base.cxx index 95b818be..af48b439 100644 --- a/src/xc_integrator/local_work_driver/device/scheme1_magma_data_base.cxx +++ b/src/xc_integrator/local_work_driver/device/scheme1_magma_data_base.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/factory.cxx b/src/xc_integrator/local_work_driver/factory.cxx index 744b58b4..fd6b86ad 100644 --- a/src/xc_integrator/local_work_driver/factory.cxx +++ b/src/xc_integrator/local_work_driver/factory.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/CMakeLists.txt b/src/xc_integrator/local_work_driver/host/CMakeLists.txt index a049c0c1..aa68ae28 100644 --- a/src/xc_integrator/local_work_driver/host/CMakeLists.txt +++ b/src/xc_integrator/local_work_driver/host/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). 
+# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/xc_integrator/local_work_driver/host/blas.cxx b/src/xc_integrator/local_work_driver/host/blas.cxx index c7d13dfe..d0b74596 100644 --- a/src/xc_integrator/local_work_driver/host/blas.cxx +++ b/src/xc_integrator/local_work_driver/host/blas.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/blas.hpp b/src/xc_integrator/local_work_driver/host/blas.hpp index 8aa8e7d3..54a279b5 100644 --- a/src/xc_integrator/local_work_driver/host/blas.hpp +++ b/src/xc_integrator/local_work_driver/host/blas.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/local_host_work_driver.cxx b/src/xc_integrator/local_work_driver/host/local_host_work_driver.cxx index 04cf7b0a..0fa970ba 100644 --- a/src/xc_integrator/local_work_driver/host/local_host_work_driver.cxx +++ b/src/xc_integrator/local_work_driver/host/local_host_work_driver.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -37,6 +41,16 @@ void LocalHostWorkDriver::partition_weights( XCWeightAlg weight_alg, } +void LocalHostWorkDriver::eval_weight_1st_deriv_contracted( + XCWeightAlg weight_alg, const Molecule& mol, const MolMeta& meta, + const XCTask& task, const double* w_times_f, double* exc_grad_w ) { + + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_weight_1st_deriv_contracted(weight_alg, mol, meta, task, + w_times_f, exc_grad_w); + +} + // Collocation void LocalHostWorkDriver::eval_collocation( size_t npts, size_t nshells, size_t nbe, @@ -272,7 +286,6 @@ void LocalHostWorkDriver::eval_zmat_lda_vxc_uks( size_t npts, size_t nbe, pimpl_->eval_zmat_lda_vxc_uks(npts, nbe, vrho, basis_eval, Zs, ldzs, Zz, ldzz); - } void LocalHostWorkDriver::eval_zmat_lda_vxc_gks( size_t npts, size_t nbe, @@ -400,5 +413,158 @@ void LocalHostWorkDriver::inc_vxc( size_t npts, size_t nbf, size_t nbe, } +// eval_tmat LDA RKS +void LocalHostWorkDriver::eval_tmat_lda_vxc_rks( size_t npts, const double* v2rho2, const double* trho, double* A) { + + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_tmat_lda_vxc_rks(npts, v2rho2, trho, A); + +} + +// eval_tmat GGA RKS +void LocalHostWorkDriver::eval_tmat_gga_vxc_rks( size_t npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2gamma2, + const double* tden_eval, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B ) { + + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_tmat_gga_vxc_rks(npts, vgamma, v2rho2, v2rhogamma, v2gamma2, + tden_eval, tdden_x_eval, tdden_y_eval, tdden_z_eval, dden_x_eval, dden_y_eval, + dden_z_eval, A, B); + +} + +// eval_tmat MGGA RKS +void LocalHostWorkDriver::eval_tmat_mgga_vxc_rks( size_t npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2rholapl, const double* v2rhotau, + const double* v2gamma2, const double* v2gammalapl, const double* v2gammatau, + const double* v2lapl2, const double* v2lapltau, const double* v2tau2, + const double* tden_eval, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, const double* ttau, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B, double* C) { + + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_tmat_mgga_vxc_rks(npts, vgamma, v2rho2, v2rhogamma, v2rholapl, v2rhotau, + v2gamma2, v2gammalapl, v2gammatau, v2lapl2, v2lapltau, v2tau2, + tden_eval, tdden_x_eval, tdden_y_eval, tdden_z_eval, ttau, dden_x_eval, + dden_y_eval, dden_z_eval, A, B, C); + +} + +void LocalHostWorkDriver::eval_tmat_lda_vxc_uks( size_t npts, const double* v2rho2, const double* trho, double* A) { + + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_tmat_lda_vxc_uks(npts, v2rho2, trho, A); + +} +void LocalHostWorkDriver::eval_tmat_gga_vxc_uks( size_t npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2gamma2, + const double* trho, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B ) { + + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_tmat_gga_vxc_uks(npts, vgamma, v2rho2, v2rhogamma, v2gamma2, + trho, tdden_x_eval, tdden_y_eval, tdden_z_eval, dden_x_eval, dden_y_eval, + dden_z_eval, A, B); + +} +void LocalHostWorkDriver::eval_tmat_mgga_vxc_uks( size_t 
npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2rholapl, const double* v2rhotau, + const double* v2gamma2, const double* v2gammalapl, const double* v2gamma_tau, + const double* v2lapl2, const double* v2tau_lapl, const double* v2tau2, + const double* trho, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, const double* ttau, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B, double* C) { + + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_tmat_mgga_vxc_uks(npts, vgamma, v2rho2, v2rhogamma, v2rholapl, v2rhotau, + v2gamma2, v2gammalapl, v2gamma_tau, v2lapl2, v2tau_lapl, v2tau2, + trho, tdden_x_eval, tdden_y_eval, tdden_z_eval, ttau, dden_x_eval, + dden_y_eval, dden_z_eval, A, B, C); + +} + +void LocalHostWorkDriver::eval_zmat_lda_vxc_uks_ts( size_t npts, size_t nbe, + const double* vrho, const double* basis_eval, double* Za, size_t ldza, + double* Zb, size_t ldzb ) { + + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_zmat_lda_vxc_uks_ts(npts, nbe, vrho, basis_eval, Za, ldza, + Zb, ldzb); + +} + +void LocalHostWorkDriver::eval_Bvec_gga_vxc_rks_ts( size_t npts, const double* vgamma, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* B ) { + + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_Bvec_gga_vxc_rks_ts(npts, vgamma, dden_x_eval, dden_y_eval, + dden_z_eval, B); +} + +void LocalHostWorkDriver::eval_Bvec_gga_vxc_uks_ts( size_t npts, const double* vgamma, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* B ) { + + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_Bvec_gga_vxc_uks_ts(npts, vgamma, dden_x_eval, dden_y_eval, + dden_z_eval, B); +} +void LocalHostWorkDriver::eval_zmat_gga_vxc_rks_ts( size_t npts, size_t nbf, const double* A, const double* B, const double* basis_eval, + const double* dbasis_x_eval, const double* dbasis_y_eval, const double* dbasis_z_eval, + double* Z, size_t ldz ){ + + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_zmat_gga_vxc_rks_ts(npts, nbf, A, B, basis_eval, dbasis_x_eval, + dbasis_y_eval, dbasis_z_eval, Z, ldz); +} + +void LocalHostWorkDriver::eval_zmat_gga_vxc_uks_ts( size_t npts, size_t nbf, const double* A, const double* B, const double* basis_eval, + const double* dbasis_x_eval, const double* dbasis_y_eval, const double* dbasis_z_eval, + double* Za, size_t ldza, double* Zb, size_t ldzb ){ + + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_zmat_gga_vxc_uks_ts(npts, nbf, A, B, basis_eval, dbasis_x_eval, + dbasis_y_eval, dbasis_z_eval, Za, ldza, Zb, ldzb); +} + +void LocalHostWorkDriver::eval_zmat_mgga_vxc_uks_ts( size_t npts, size_t nbe, + const double* vrho, const double* vgamma, const double* vlapl, + const double* basis_eval, + const double* dbasis_x_eval, const double* dbasis_y_eval, const double* dbasis_z_eval, + const double* lbasis_eval, const double* dden_x_eval, + const double* dden_y_eval, const double* dden_z_eval, double* Za, size_t ldza, + double* Zb, size_t ldzb) { + + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_zmat_mgga_vxc_uks_ts(npts, nbe, vrho, vgamma, vlapl, basis_eval, dbasis_x_eval, + dbasis_y_eval, dbasis_z_eval, lbasis_eval, dden_x_eval, dden_y_eval, dden_z_eval, + Za, ldza, Zb, ldzb); +} + +void LocalHostWorkDriver::eval_zmat_gga_vxc_uks_ts( size_t npts, size_t nbe, + const double* vrho, const double* vgamma, const double* basis_eval, + const double* dbasis_x_eval, const double* dbasis_y_eval, + const double* 
dbasis_z_eval, const double* dden_x_eval, + const double* dden_y_eval, const double* dden_z_eval, double* Za, size_t ldza, + double* Zb, size_t ldzb ) { + + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_zmat_gga_vxc_uks_ts(npts, nbe, vrho, vgamma, basis_eval, dbasis_x_eval, + dbasis_y_eval, dbasis_z_eval, dden_x_eval, dden_y_eval, dden_z_eval, + Za, ldza, Zb, ldzb); + +} +void LocalHostWorkDriver::eval_mmat_mgga_vxc_uks_ts( size_t npts, size_t nbe, + const double* vtau, const double* vlapl, + const double* dbasis_x_eval, const double* dbasis_y_eval, + const double* dbasis_z_eval, double* mmat_xs, double* mmat_ys, double* mmat_zs, size_t ldms, + double* mmat_xz, double* mmat_yz, double* mmat_zz, size_t ldmz ) { + + throw_if_invalid_pimpl(pimpl_); + pimpl_->eval_mmat_mgga_vxc_uks_ts(npts, nbe, vtau, vlapl, dbasis_x_eval, + dbasis_y_eval, dbasis_z_eval, mmat_xs, mmat_ys, mmat_zs, ldms, mmat_xz, mmat_yz, + mmat_zz, ldmz ); + +} + + } diff --git a/src/xc_integrator/local_work_driver/host/local_host_work_driver.hpp b/src/xc_integrator/local_work_driver/host/local_host_work_driver.hpp index 7b6b73af..41cf430e 100644 --- a/src/xc_integrator/local_work_driver/host/local_host_work_driver.hpp +++ b/src/xc_integrator/local_work_driver/host/local_host_work_driver.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -71,6 +75,20 @@ class LocalHostWorkDriver : public LocalWorkDriver { void partition_weights( XCWeightAlg weight_alg, const Molecule& mol, const MolMeta& meta, task_iterator task_begin, task_iterator task_end ); + /** Evaluate the weight first derivative contracted with a function + * + * @param[in] weight_alg Molecular partitioning scheme + * @param[in] mol Molecule being partitioned + * @param[in] molmeta Metadata associated with mol + * @param[in] task Task Data + * @param[in] w_times_f Weight times function evaluation + * + * @param[in/out] exc_grad_w Weight first derivative times function evaluation (added to this array) + * Assumed to have length 3*natoms. 
Example: exc_grad + */ + void eval_weight_1st_deriv_contracted( XCWeightAlg weight_alg, const Molecule& mol, + const MolMeta& meta, const XCTask& task, const double* w_times_f, double* exc_grad_w ); + /** Evaluation the collocation matrix * @@ -333,7 +351,7 @@ class LocalHostWorkDriver : public LocalWorkDriver { double* den_eval, double* dden_x_eval, double* dden_y_eval, double* dden_z_eval, double* gamma, double* tau, double* lapl); - /** Evaluate the VXC Z Matrix for RKS LDA + /** Evaluate the VXC Z Matrix for RKS LDA * * Z(mu,i) = 0.5 * vrho(i) * B(mu, i) * @@ -469,6 +487,113 @@ class LocalHostWorkDriver : public LocalWorkDriver { const submat_map_t& submat_map, const double* Z, size_t ldz, double* VXC, size_t ldvxc, double* scr ); + /** Evaluate the intermediate vector variables tmat for Fxc contraction of LDA + * + * See Jiashu's notes for details + * + * @param[in] npts The number of points to evaluate the U/V variables + * @param[in] v2rho2 the second derivative of the XC functional wrt rho + * @param[in] trho The trial density calculated from the trial density matrix + * @param[out] A intermediate output to form zmat (npts, 1) for RKS, (npts, 2) for UKS + * + */ + void eval_tmat_lda_vxc_rks( size_t npts, const double* v2rho2, const double* trho, double* A); + void eval_tmat_lda_vxc_uks( size_t npts, const double* v2rho2, const double* trho, double* A); + + /** + * Evaluate the intermediate vector variables tmat for Fxc contraction of GGA + * + * See Jiashu's notes for details + * + * @param[in] npts The number of points to evaluate the U/V variables + * @param[in] vgamma the derivative of the XC functional wrt gamma + * @param[in] v2rho2 the second derivative of the XC functional wrt rho twice + * @param[in] v2rhogamma the second derivative of the XC functional wrt rho and gamma + * @param[in] v2gamma2 the second derivative of the XC functional wrt gamma twice + * @param[in] tden_eval The trial density calculated from the trial density matrix + * @param[in] tdden_x_eval the gradient of the trial density calculated from the trial density matrix, similar for y and z + * @param[in] dden_x_eval the gradient of the density (npts) calculated from the density matrix, similar for y and z + * @param[out] A intermediate output to form zmat (npts, 1) for RKS, (npts, 2) for UKS + * @param[out] B intermediate output to form zmat (npts, 3) for RKS, (npts, 6) for UKS + */ + void eval_tmat_gga_vxc_rks( size_t npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2gamma2, + const double* tden_eval, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B ); + void eval_tmat_gga_vxc_uks( size_t npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2gamma2, + const double* trho, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B ); + + /** + * Evaluate the intermediate vector variables tmat for Fxc contraction of MGGA + * + * See Jiashu's notes for details + * + * @param[in] npts The number of points to evaluate the U/V variables + * @param[in] vgamma the derivative of the XC functional wrt gamma + * @param[in] v2rho2 the second derivative of the XC functional wrt rho twice + * @param[in] v2rhogamma the second derivative of the XC functional wrt rho and gamma + 
* @param[in] v2rholapl the second derivative of the XC functional wrt rho and laplacian + * @param[in] v2rhotau the second derivative of the XC functional wrt rho and tau + * @param[in] v2gamma2 the second derivative of the XC functional wrt gamma twice + * @param[in] v2gammalapl the second derivative of the XC functional wrt gamma and laplacian + * @param[in] v2gammatau the second derivative of the XC functional wrt gamma and tau + * @param[in] v2lapl2 the second derivative of the XC functional wrt laplacian twice + * @param[in] v2lapltau the second derivative of the XC functional wrt laplacian and tau + * @param[in] v2tau2 the second derivative of the XC functional wrt tau twice + * @param[in] tden_eval The trial density calculated from the trial density matrix + * @param[in] tdden_x_eval the gradient of the trial density calculated from the trial density matrix, similar for y and z + * @param[in] dden_x_eval the gradient of the density (npts) calculated from the density matrix, similar for y and z + * @param[in] ttau the kinetic energy density calculated from the trial density matrix + * @param[out] A intermediate output to form zmat (npts, 1) for RKS, (npts, 2) for UKS + * @param[out] B intermediate output to form zmat (npts, 3) for RKS, (npts, 6) for UKS + * @param[out] C intermediate output to form mmat (npts, 1) for RKS, (npts, 2) for UKS + */ + void eval_tmat_mgga_vxc_rks( size_t npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2rholapl, const double* v2rhotau, + const double* v2gamma2, const double* v2gammalapl, const double* v2gammatau, + const double* v2lapl2, const double* v2lapltau, const double* v2tau2, + const double* tden_eval, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, const double* ttau, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B, double* C); + void eval_tmat_mgga_vxc_uks( size_t npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2rholapl, const double* v2rhotau, + const double* v2gamma2, const double* v2gammalapl, const double* v2gamma_tau, + const double* v2lapl2, const double* v2tau_lapl, const double* v2tau2, + const double* trho, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, const double* ttau, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B, double* C); + + + void eval_zmat_lda_vxc_uks_ts( size_t npts, size_t nbe, const double* vrho, + const double* basis_eval, double* Za, size_t ldza, double* Zb, + size_t ldzb ); + void eval_Bvec_gga_vxc_uks_ts( size_t npts, const double* vgamma, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* B ); + void eval_zmat_gga_vxc_uks_ts( size_t npts, size_t nbf, const double* A, const double* B, const double* basis_eval, + const double* dbasis_x_eval, const double* dbasis_y_eval, const double* dbasis_z_eval, + double* Za, size_t ldza, double* Zb, size_t ldzb ); + void eval_Bvec_gga_vxc_rks_ts( size_t npts, const double* vgamma, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* B ); + void eval_zmat_gga_vxc_rks_ts( size_t npts, size_t nbf, const double* A, const double* B, const double* basis_eval, + const double* dbasis_x_eval, const double* dbasis_y_eval, const double* dbasis_z_eval, + double* Z, size_t ldz ); + + void eval_zmat_gga_vxc_uks_ts( size_t npts, size_t nbe, const 
double* vrho, + const double* vgamma, const double* basis_eval, const double* dbasis_x_eval, + const double* dbasis_y_eval, const double* dbasis_z_eval, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, + double* Za, size_t ldza, double* Zb, size_t ldzb ); + void eval_zmat_mgga_vxc_uks_ts( size_t npts, size_t nbe, const double* vrho, + const double* vgamma, const double* vlapl, const double* basis_eval, + const double* dbasis_x_eval, const double* dbasis_y_eval, const double* dbasis_z_eval, + const double* lbasis_eval, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, + double* Za, size_t ldza, double* Zb, size_t ldzb ); + void eval_mmat_mgga_vxc_uks_ts( size_t npts, size_t nbe, const double* vtau, + const double* vlapl, const double* dbasis_x_eval, const double* dbasis_y_eval, + const double* dbasis_z_eval, double* mmat_xs, double* mmat_ys, double* mmat_zs, + size_t ldms, double* mmat_xz, double* mmat_yz, double* mmat_zz, size_t ldmz); + private: pimpl_type pimpl_; ///< Implementation diff --git a/src/xc_integrator/local_work_driver/host/local_host_work_driver_pimpl.cxx b/src/xc_integrator/local_work_driver/host/local_host_work_driver_pimpl.cxx index c7ffc94f..aac879ea 100644 --- a/src/xc_integrator/local_work_driver/host/local_host_work_driver_pimpl.cxx +++ b/src/xc_integrator/local_work_driver/host/local_host_work_driver_pimpl.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/local_host_work_driver_pimpl.hpp b/src/xc_integrator/local_work_driver/host/local_host_work_driver_pimpl.hpp index cdcac019..c5e4182d 100644 --- a/src/xc_integrator/local_work_driver/host/local_host_work_driver_pimpl.hpp +++ b/src/xc_integrator/local_work_driver/host/local_host_work_driver_pimpl.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
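Per the eval_tmat_lda_vxc_rks documentation above, the LDA piece of the FXC contraction builds a pointwise intermediate A from the kernel second derivative and the trial density, with the quadrature weights folded into the intermediates rather than into the raw kernel output (as the comments in scheme1_base.cxx note). The following is a hedged, host-side reading of that intent only, using a hypothetical free function rather than the GauXC API:

```cpp
#include <cstddef>

// Illustrative only: one plausible form of the RKS LDA FXC intermediate,
// A(i) = w(i) * v2rho2(i) * trho(i), with the grid weight applied here because
// the second-derivative kernel output itself is left unscaled.
void form_lda_fxc_intermediate(std::size_t npts, const double* weights,
                               const double* v2rho2, const double* trho,
                               double* A) {
  for (std::size_t i = 0; i < npts; ++i)
    A[i] = weights[i] * v2rho2[i] * trho[i];
}
```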
* * See LICENSE.txt for details */ @@ -29,6 +33,9 @@ struct LocalHostWorkDriverPIMPL { virtual void partition_weights( XCWeightAlg weight_alg, const Molecule& mol, const MolMeta& meta, task_iterator task_begin, task_iterator task_end ) = 0; + + virtual void eval_weight_1st_deriv_contracted( XCWeightAlg weight_alg, const Molecule& mol, + const MolMeta& meta, const XCTask& task, const double* w_times_f, double* exc_grad_w ) = 0; virtual void eval_collocation( size_t npts, size_t nshells, size_t nbe, const double* pts, const BasisSet& basis, const int32_t* shell_list, @@ -170,6 +177,62 @@ struct LocalHostWorkDriverPIMPL { const double* basis_eval, const submat_map_t& submat_map, const double* Z, size_t ldz, double* VXC, size_t ldvxc, double* scr ) = 0; + virtual void eval_tmat_lda_vxc_rks( size_t npts, const double* v2rho2, const double* tden_eval, double* A) = 0; + virtual void eval_tmat_lda_vxc_uks( size_t npts, const double* v2rho2, const double* trho, double* A) = 0; + + virtual void eval_tmat_gga_vxc_rks( size_t npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2gamma2, + const double* tden_eval, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B ) = 0; + virtual void eval_tmat_gga_vxc_uks( size_t npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2gamma2, + const double* trho, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B ) = 0; + + virtual void eval_tmat_mgga_vxc_rks( size_t npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2rholapl, const double* v2rhotau, + const double* v2gamma2, const double* v2gammalapl, const double* v2gammatau, + const double* v2lapl2, const double* v2lapltau, const double* v2tau2, + const double* tden_eval, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, const double* ttau, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B, double* C) = 0; + virtual void eval_tmat_mgga_vxc_uks( size_t npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2rholapl, const double* v2rhotau, + const double* v2gamma2, const double* v2gammalapl, const double* v2gamma_tau, + const double* v2lapl2, const double* v2tau_lapl, const double* v2tau2, + const double* trho, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, const double* ttau, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B, double* C) = 0; + + virtual void eval_zmat_lda_vxc_uks_ts( size_t npts, size_t nbe, const double* vrho, + const double* basis_eval, double* Za, size_t ldza, double* Zb, size_t ldzb ) = 0; + + virtual void eval_Bvec_gga_vxc_uks_ts( size_t npts, const double* vgamma, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* B ) = 0; + virtual void eval_zmat_gga_vxc_uks_ts( size_t npts, size_t nbf, const double* A, const double* B, const double* basis_eval, + const double* dbasis_x_eval, const double* dbasis_y_eval, const double* dbasis_z_eval, + double* Za, size_t ldza, double* Zb, size_t ldzb ) = 0; + virtual void eval_zmat_gga_vxc_uks_ts( size_t npts, size_t 
nbe, const double* vrho, + const double* vgamma, const double* basis_eval, const double* dbasis_x_eval, + const double* dbasis_y_eval, const double* dbasis_z_eval, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, + double* Za, size_t ldza, double* Zb, size_t ldzb ) = 0; + + virtual void eval_Bvec_gga_vxc_rks_ts( size_t npts, const double* vgamma, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* B ) = 0; + virtual void eval_zmat_gga_vxc_rks_ts( size_t npts, size_t nbf, const double* A, const double* B, const double* basis_eval, + const double* dbasis_x_eval, const double* dbasis_y_eval, const double* dbasis_z_eval, + double* Z, size_t ldz ) = 0; + + virtual void eval_zmat_mgga_vxc_uks_ts( size_t npts, size_t nbe, const double* vrho, + const double* vgamma, const double* vlapl, const double* basis_eval, + const double* dbasis_x_eval, const double* dbasis_y_eval, const double* dbasis_z_eval, + const double* lbasis_eval, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, + double* Za, size_t ldza, double* Zb, size_t ldzb ) = 0; + virtual void eval_mmat_mgga_vxc_uks_ts( size_t npts, size_t nbe, const double* vtau, + const double* vlapl, const double* dbasis_x_eval, const double* dbasis_y_eval, +const double* dbasis_z_eval, double* mmat_xs, double* mmat_ys, double* mmat_zs, + size_t ldms, double* mmat_xz, double* mmat_yz, double* mmat_zz, size_t ldmz ) = 0; + }; diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/CMakeLists.txt b/src/xc_integrator/local_work_driver/host/obara_saika/CMakeLists.txt index b103b4d4..ffa52ffe 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/CMakeLists.txt +++ b/src/xc_integrator/local_work_driver/host/obara_saika/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/include/cpu/chebyshev_boys_computation.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/include/cpu/chebyshev_boys_computation.hpp index 718505b8..bcafebe2 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/include/cpu/chebyshev_boys_computation.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/include/cpu/chebyshev_boys_computation.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
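The eval_zmat_lda_vxc_uks_ts declarations above produce separate alpha and beta Z matrices from a single vrho array. As an illustration only, assuming vrho holds interleaved (alpha, beta) values per grid point and following the documented RKS relation Z(mu,i) = 0.5 * vrho(i) * B(mu,i), a host-side sketch could look like this; the real routine's buffer layout and leading dimensions may differ:

```cpp
#include <cstddef>

// Illustrative sketch (not the GauXC kernel): Z_sigma(mu,i) = 0.5 * vrho_sigma(i) * B(mu,i),
// assuming vrho is interleaved (alpha, beta) per point and the collocation matrix
// basis_eval is stored column-major with leading dimension nbe.
void zmat_lda_uks_sketch(std::size_t npts, std::size_t nbe, const double* vrho,
                         const double* basis_eval, double* Za, std::size_t ldza,
                         double* Zb, std::size_t ldzb) {
  for (std::size_t i = 0; i < npts; ++i) {
    const double fa = 0.5 * vrho[2 * i + 0];
    const double fb = 0.5 * vrho[2 * i + 1];
    for (std::size_t mu = 0; mu < nbe; ++mu) {
      Za[mu + i * ldza] = fa * basis_eval[mu + i * nbe];
      Zb[mu + i * ldzb] = fb * basis_eval[mu + i * nbe];
    }
  }
}
```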
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/include/cpu/integral_data_types.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/include/cpu/integral_data_types.hpp index dca69c4a..bbb5c455 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/include/cpu/integral_data_types.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/include/cpu/integral_data_types.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/include/cpu/obara_saika_integrals.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/include/cpu/obara_saika_integrals.hpp index 561a331f..d056b810 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/include/cpu/obara_saika_integrals.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/include/cpu/obara_saika_integrals.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/chebyshev_boys_computation.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/chebyshev_boys_computation.cxx index 7a79fc85..02b4f767 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/chebyshev_boys_computation.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/chebyshev_boys_computation.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/config_obara_saika.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/config_obara_saika.hpp index 42be0db0..8b7cee2a 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/config_obara_saika.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/config_obara_saika.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0.cxx index 31d80a0d..c64d2d54 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0.hpp index 1cdd66b6..5db799bc 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0_0.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0_0.cxx index ab3e29a6..6971c1a7 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0_0.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0_0.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0_0.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0_0.hpp index 1b1c41a2..95f3db8e 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0_0.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_0_0.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1.cxx index 13038d0e..3638d86a 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1.hpp index 8419d9cd..26508811 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_0.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_0.cxx index 4d7ab1e7..d0e65541 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_0.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_0.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_0.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_0.hpp index b8906bbd..5e6df7ce 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_0.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_0.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_1.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_1.cxx index ead10e27..ee58d18f 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_1.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_1.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_1.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_1.hpp index 5bc89eb4..3f06119b 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_1.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_1_1.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2.cxx index 5cc25f38..035be5be 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2.hpp index 2e3c9a6a..187e1666 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_0.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_0.cxx index c99a4e9a..0343e667 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_0.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_0.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_0.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_0.hpp index 19af53f6..a641b325 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_0.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_0.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_1.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_1.cxx index 0013a3a0..6904c15d 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_1.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_1.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_1.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_1.hpp index a2f23924..6d7beb15 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_1.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_1.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_2.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_2.cxx index 0fb99e41..dbd9f500 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_2.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_2.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_2.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_2.hpp index 8a1ba8bb..faf5b123 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_2.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_2_2.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3.cxx index 8d946235..c3faf7f4 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3.hpp index d2e51280..3e8cb075 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_0.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_0.cxx index cbfcb0fb..44c3542e 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_0.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_0.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_0.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_0.hpp index 5872e178..7211ec7a 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_0.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_0.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_1.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_1.cxx index 904fc5d0..197e948a 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_1.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_1.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_1.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_1.hpp index e1d74748..106a4f14 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_1.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_1.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_2.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_2.cxx index 0092aef5..7c4a2ec6 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_2.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_2.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_2.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_2.hpp index 3c86c167..a69ba836 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_2.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_2.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_3.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_3.cxx index 1e49b804..251de89d 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_3.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_3.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_3.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_3.hpp index 459f2359..0fc00c9e 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_3.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_3_3.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4.cxx index 714ef6d1..67a9cace 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4.hpp index af1c47ec..abb9a2b5 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_0.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_0.cxx index 6c6a4ebc..1b2f57f1 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_0.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_0.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_0.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_0.hpp index 4c12ac6b..590062ba 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_0.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_0.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_1.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_1.cxx index 6efdc4f0..6fefd787 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_1.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_1.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_1.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_1.hpp index 9a320cd0..4f4e71d8 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_1.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_1.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_2.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_2.cxx index 25b02b63..0a88c5dd 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_2.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_2.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_2.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_2.hpp index fd642fe5..2cc57370 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_2.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_2.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_3.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_3.cxx index 8aa7efbf..e318e860 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_3.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_3.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_3.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_3.hpp index 2bc46333..e750cc98 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_3.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_3.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_4.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_4.cxx index 01da2812..5aca482a 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_4.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_4.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_4.hpp b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_4.hpp index 79bc467b..2583fc79 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_4.hpp +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/integral_4_4.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/src/obara_saika_integrals.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/src/obara_saika_integrals.cxx index 305357e3..5fa3c657 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/src/obara_saika_integrals.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/src/obara_saika_integrals.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test.cxx index 46154e34..19921718 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test1.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test1.cxx index ea1671aa..bbb30ddb 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test1.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test1.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test2.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test2.cxx index 0ac029a2..ebb02db6 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test2.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test2.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test3.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test3.cxx index 826311ce..9d588ba4 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test3.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test3.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys.cxx index 3b20d5c1..42941ef5 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v0.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v0.cxx index e6592a94..455f715b 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v0.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v0.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v1.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v1.cxx index 0b206e6a..39c51766 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v1.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v1.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v2.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v2.cxx index f728c1d5..b992fcc0 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v2.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v2.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v3.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v3.cxx index 4af600b0..325b8b3f 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v3.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v3.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v4.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v4.cxx index 3ee4380b..5cf97532 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v4.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/test/archive/test_boys_v4.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/obara_saika/test/test_experimental.cxx b/src/xc_integrator/local_work_driver/host/obara_saika/test/test_experimental.cxx index 6ac1f555..e2378e6c 100644 --- a/src/xc_integrator/local_work_driver/host/obara_saika/test/test_experimental.cxx +++ b/src/xc_integrator/local_work_driver/host/obara_saika/test/test_experimental.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/reference/collocation.hpp b/src/xc_integrator/local_work_driver/host/reference/collocation.hpp index 0db9ed9c..bab6a076 100644 --- a/src/xc_integrator/local_work_driver/host/reference/collocation.hpp +++ b/src/xc_integrator/local_work_driver/host/reference/collocation.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/reference/gau2grid_collocation.cxx b/src/xc_integrator/local_work_driver/host/reference/gau2grid_collocation.cxx index 4f1583d0..98f53d35 100644 --- a/src/xc_integrator/local_work_driver/host/reference/gau2grid_collocation.cxx +++ b/src/xc_integrator/local_work_driver/host/reference/gau2grid_collocation.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/reference/weights.cxx b/src/xc_integrator/local_work_driver/host/reference/weights.cxx index f9f0eb75..145bfd1c 100644 --- a/src/xc_integrator/local_work_driver/host/reference/weights.cxx +++ b/src/xc_integrator/local_work_driver/host/reference/weights.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -108,6 +112,8 @@ void reference_becke_weights_host( } + + void reference_ssf_weights_host( const Molecule& mol, const MolMeta& meta, @@ -172,6 +178,7 @@ void reference_ssf_weights_host( // Evaluate unnormalized partition functions std::fill(partitionScratch.begin(),partitionScratch.end(),1.); +#if 1 for( size_t iA = 0; iA < natoms; iA++ ) for( size_t jA = 0; jA < iA; jA++ ) if( partitionScratch[iA] > integrator::ssf_weight_tol or @@ -196,6 +203,24 @@ void reference_ssf_weights_host( } } +#else + for(size_t iA = 0; iA < natoms; ++iA) + for(size_t jA = 0; jA < natoms; ++jA) + if(iA != jA and partitionScratch[iA] > integrator::ssf_weight_tol) { + const double mu = (atomDist[iA] - atomDist[jA]) / RAB[jA + iA*natoms]; + if( fabs(mu) < integrator::magic_ssf_factor<> ) { + double g = 0.5 * (1. 
- gFrisch(mu)); + partitionScratch[iA] *= g; + } else if(mu >= integrator::magic_ssf_factor<>) { + partitionScratch[iA] = 0.0; + } + } + + if(partitionScratch[task.iParent] < std::numeric_limits::epsilon()) { + weight = 0; + continue; + } +#endif // Normalization double sum = 0.; @@ -360,4 +385,604 @@ void reference_lko_weights_host( } + +/** + * 1st derivative which expects weight_deri to be preallocated as (ngrid*natoms*3) + */ +void reference_becke_weights_1st_derivative_host( + const Molecule& mol, + const MolMeta& meta, + const XCTask& task, + double* weight_deri +){ + + // Becke partition functions + auto hBecke = [](double x) {return 1.5 * x - 0.5 * x * x * x;}; // Eq. 19 + auto gBecke = [&](double x) {return hBecke(hBecke(hBecke(x)));}; // Eq. 20 f_3 + auto tBecke = [&](double x) { + // for numerical stability (see Jiashu's notes for details) + if (x > 1.0 - 1e-4) + return 0.0; + const double p1 = hBecke(x); + const double p2 = hBecke(p1); + return - 27.0 * (1. + p2) * (1. + p1) * (1. + x) / (1. - x) / (2. + p2) / (2. + p1) / (2. + x); + }; + + const size_t natoms = mol.natoms(); + const auto& RAB = meta.rab(); + std::vector partitionScratch( natoms ); + std::vector atomDist( natoms ); + + for( size_t i = 0; i < task.points.size(); ++i ) { + + auto * weight_deri_ith = weight_deri + 3*natoms*i; + const size_t iParent = task.iParent; + + //zerofy the derivative + std::fill(weight_deri_ith, weight_deri_ith + 3*natoms, 0.); + const auto& point = task.points[i]; + const auto& weight = task.weights[i]; + + // Compute distances of each center to point + for(size_t iA = 0; iA < natoms; iA++) { + + const double da_x = point[0] - mol[iA].x; + const double da_y = point[1] - mol[iA].y; + const double da_z = point[2] - mol[iA].z; + + atomDist[iA] = std::sqrt(da_x*da_x + da_y*da_y + da_z*da_z); + + } + + // Evaluate unnormalized partition functions + std::fill(partitionScratch.begin(),partitionScratch.end(),1.); + for( size_t iA = 0; iA < natoms; iA++ ) + for( size_t jA = 0; jA < iA; jA++ ){ + + double mu = (atomDist[iA] - atomDist[jA]) / RAB[jA + iA*natoms]; + const double g = gBecke(mu); + + partitionScratch[iA] *= 0.5 * (1. - g); + partitionScratch[jA] *= 0.5 * (1. 
+ g); + } + + double sum = 0.; + for( size_t iA = 0; iA < natoms; iA++ ) sum += partitionScratch[iA]; + + // calculate derivative now + auto * weight_deri_iParent = weight_deri_ith + 3*iParent; + for( size_t iB = 0; iB < natoms; iB++ ) { + if (iB == iParent) continue; + auto * weight_deri_iB = weight_deri_ith + 3*iB; + + const double uB_x = mol[iB].x - point[0]; + const double uB_y = mol[iB].y - point[1]; + const double uB_z = mol[iB].z - point[2]; + + const double uBA_x =mol[iB].x - mol[iParent].x; + const double uBA_y =mol[iB].y - mol[iParent].y; + const double uBA_z =mol[iB].z - mol[iParent].z; + const double rAB = RAB[iB + iParent*natoms]; + + double mu_AB = (atomDist[iParent] - atomDist[iB]) / rAB; + + // first term is - coef1 * nabla_B mu_BA + double coef1 = tBecke(mu_AB); + weight_deri_iB[0] -= coef1 / rAB * (uB_x / atomDist[iB] + mu_AB * uBA_x /rAB); + weight_deri_iB[1] -= coef1 / rAB * (uB_y / atomDist[iB] + mu_AB * uBA_y /rAB); + weight_deri_iB[2] -= coef1 / rAB * (uB_z / atomDist[iB] + mu_AB * uBA_z /rAB); + + double term_x = 0.0, term_y = 0.0, term_z = 0.0; + // second term is 1/Z * \sum_{C != B} (P(B)t_BC - P(C)t_CB) nabla_B mu_BC + for( size_t iC = 0; iC < natoms; iC++ ){ + if (iB == iC) continue; + + // coef = (P(B)t_BC - P(C)t_CB) + double mu_BC = (atomDist[iB] - atomDist[iC]) / RAB[iC + iB*natoms]; + double t_BC = tBecke(mu_BC); + double t_CB = tBecke(-mu_BC); + double coef = partitionScratch[iB] *t_BC - partitionScratch[iC] * t_CB; + + const double rBC = RAB[iC + iB*natoms]; + + term_x += coef * ((mol[iB].x - point[0]) / atomDist[iB] / rBC - mu_BC * (mol[iB].x - mol[iC].x) / rBC / rBC); + term_y += coef * ((mol[iB].y - point[1]) / atomDist[iB] / rBC - mu_BC * (mol[iB].y - mol[iC].y) / rBC / rBC); + term_z += coef * ((mol[iB].z - point[2]) / atomDist[iB] / rBC - mu_BC * (mol[iB].z - mol[iC].z) / rBC / rBC); + } + + weight_deri_iB[0] -= term_x / sum; + weight_deri_iB[1] -= term_y / sum; + weight_deri_iB[2] -= term_z / sum; + + // Use translational invariance to calculate the derivative for the parent atom + weight_deri_iParent[0] -= weight_deri_iB[0]; + weight_deri_iParent[1] -= weight_deri_iB[1]; + weight_deri_iParent[2] -= weight_deri_iB[2]; + + } + + // Finally, scale the derivatives by the weight + for( size_t iB = 0; iB < natoms; iB++ ) + for (size_t coord = 0; coord < 3; ++coord) + weight_deri_ith[3*iB + coord] *= weight; + + } +} + +void reference_ssf_weights_1st_derivative_host( + const Molecule& mol, + const MolMeta& meta, + const XCTask& task, + double* weight_deri +){ + + const auto safe_magic_ssf_bound = integrator::magic_ssf_factor<> - 1e-4; + + auto gFrisch = [&](double x) { + const double s_x = x / integrator::magic_ssf_factor<>; + const double s_x2 = s_x * s_x; + const double s_x3 = s_x * s_x2; + const double s_x5 = s_x3 * s_x2; + const double s_x7 = s_x5 * s_x2; + + return (35.*(s_x - s_x3) + 21.*s_x5 - 5.*s_x7) / 16.; + }; + auto tFrisch = [&](double x) { + const double s_x = x / integrator::magic_ssf_factor<>; + const double s_x2 = s_x * s_x; + const double s_x3 = s_x * s_x2; + const double numerator = 35. * (s_x3 + 3. * s_x2 + 3. 
* s_x + 1.); + const double denominator = (x - integrator::magic_ssf_factor<>) * (5.*s_x3 + 20.*s_x2 + 29.*s_x + 16.); + return numerator / denominator ; + }; + + const size_t natoms = mol.natoms(); + const auto& RAB = meta.rab(); + std::vector partitionScratch( natoms ); + std::vector atomDist( natoms ); + + for( size_t i = 0; i < task.points.size(); ++i ) { + + auto * weight_deri_ith = weight_deri + 3*natoms*i; + + //zerofy the derivative + std::fill(weight_deri_ith, weight_deri_ith + 3*natoms, 0.); + const auto& weight = task.weights[i]; + + if (std::abs(weight) < 1.e-12) continue; // weight derivative = 0 when p_A = 0 + const size_t iParent = task.iParent; + + const auto& point = task.points[i]; + + const auto dist_cutoff = 0.5 * (1-integrator::magic_ssf_factor<>) * task.dist_nearest; + + // Compute dist to parent atom + { + const double da_x = point[0] - mol[iParent].x; + const double da_y = point[1] - mol[iParent].y; + const double da_z = point[2] - mol[iParent].z; + + atomDist[iParent] = std::sqrt(da_x*da_x + da_y*da_y + da_z*da_z); + } + + if( atomDist[iParent] < dist_cutoff ) continue; // weight derivative = 0 when p_A = 1 + + // Compute distances of each center to point + for(size_t iA = 0; iA < natoms; iA++) { + + if( iA == (size_t)iParent ) continue; + + const double da_x = point[0] - mol[iA].x; + const double da_y = point[1] - mol[iA].y; + const double da_z = point[2] - mol[iA].z; + + atomDist[iA] = std::sqrt(da_x*da_x + da_y*da_y + da_z*da_z); + + } + + // Evaluate unnormalized partition functions + std::fill(partitionScratch.begin(),partitionScratch.end(),1.); + + for( size_t iA = 0; iA < natoms; iA++ ) + for( size_t jA = 0; jA < iA; jA++ ) + if( partitionScratch[iA] > integrator::ssf_weight_tol or + partitionScratch[jA] > integrator::ssf_weight_tol ) { + + const double mu = (atomDist[iA] - atomDist[jA]) / RAB[jA + iA*natoms]; + + if( mu <= -integrator::magic_ssf_factor<> ) { + + partitionScratch[jA] = 0.; + + } else if (mu >= integrator::magic_ssf_factor<>) { + + partitionScratch[iA] = 0.; + + } else { + + double g = 0.5 * ( 1. - gFrisch(mu) ); + partitionScratch[iA] *= g; + partitionScratch[jA] *= 1. 
- g; + + } + + } + + // Normalization + double sum = 0.; + for( size_t iA = 0; iA < natoms; iA++ ) sum += partitionScratch[iA]; + + // calculate derivative now + auto * weight_deri_iParent = weight_deri_ith + 3*iParent; + for( size_t iB = 0; iB < natoms; iB++ ) { + if (iB == iParent) continue; + auto * weight_deri_iB = weight_deri_ith + 3*iB; + + const double rAB = RAB[iB + iParent*natoms]; + double mu_AB = (atomDist[iParent] - atomDist[iB]) / rAB; + if(mu_AB > - integrator::magic_ssf_factor<> && mu_AB < safe_magic_ssf_bound){ + const double uB_x = mol[iB].x - point[0]; + const double uB_y = mol[iB].y - point[1]; + const double uB_z = mol[iB].z - point[2]; + + const double uBA_x =mol[iB].x - mol[iParent].x; + const double uBA_y =mol[iB].y - mol[iParent].y; + const double uBA_z =mol[iB].z - mol[iParent].z; + + // first term is - coef1 * nabla_B mu_BA + double coef1 = tFrisch(mu_AB) * (sum - partitionScratch[iParent])/sum; + weight_deri_iB[0] -= coef1 / rAB * (uB_x / atomDist[iB] + mu_AB * uBA_x /rAB); + weight_deri_iB[1] -= coef1 / rAB * (uB_y / atomDist[iB] + mu_AB * uBA_y /rAB); + weight_deri_iB[2] -= coef1 / rAB * (uB_z / atomDist[iB] + mu_AB * uBA_z /rAB); + } + + if (std::abs(partitionScratch[iB]) < 1.e-12) continue; // no contribution to the derivative if partition function is zero + + double term_x = 0.0, term_y = 0.0, term_z = 0.0; + for( size_t iC = 0; iC < natoms; iC++ ){ + if (iB == iC) continue; + const double rBC = RAB[iC + iB*natoms]; + double mu_BC = (atomDist[iB] - atomDist[iC]) / rBC; + if(mu_BC > - safe_magic_ssf_bound && mu_BC < safe_magic_ssf_bound){ + double t_BC = tFrisch(mu_BC); + double coef = partitionScratch[iB] * t_BC / rBC/ sum; + + term_x += coef * ((mol[iB].x - point[0]) / atomDist[iB] - mu_BC * (mol[iB].x - mol[iC].x) / rBC); + term_y += coef * ((mol[iB].y - point[1]) / atomDist[iB] - mu_BC * (mol[iB].y - mol[iC].y) / rBC); + term_z += coef * ((mol[iB].z - point[2]) / atomDist[iB] - mu_BC * (mol[iB].z - mol[iC].z) / rBC); + + if(iC != iParent) { + auto * weight_deri_iC = weight_deri_ith + 3*iC; + weight_deri_iC[0] += coef * ( (mol[iC].x - point[0]) / atomDist[iC] + mu_BC * (mol[iC].x - mol[iB].x) / rBC ); + weight_deri_iC[1] += coef * ( (mol[iC].y - point[1]) / atomDist[iC] + mu_BC * (mol[iC].y - mol[iB].y) / rBC ); + weight_deri_iC[2] += coef * ( (mol[iC].z - point[2]) / atomDist[iC] + mu_BC * (mol[iC].z - mol[iB].z) / rBC ); + } + + } + } + weight_deri_iB[0] -= term_x; + weight_deri_iB[1] -= term_y; + weight_deri_iB[2] -= term_z; + } + + // Use translational invariance to calculate the derivative for the parent atom + for( size_t iB = 0; iB < natoms; iB++ ) { + if (iB == iParent) continue; + auto * weight_deri_iB = weight_deri_ith + 3*iB; + weight_deri_iParent[0] -= weight_deri_iB[0]; + weight_deri_iParent[1] -= weight_deri_iB[1]; + weight_deri_iParent[2] -= weight_deri_iB[2]; + } + + // Finally, scale the derivatives by the weight + for( size_t iB = 0; iB < natoms; iB++ ) + for (size_t coord = 0; coord < 3; ++coord) + weight_deri_ith[3*iB + coord] *= weight; + + } +} + + + +/** + * 1st derivative with contraction + */ +void reference_becke_weights_1std_contraction_host( + const Molecule& mol, + const MolMeta& meta, + const XCTask& task, + const double* w_times_f, + double* exc_grad_w +){ + + // Becke partition functions + auto hBecke = [](double x) {return 1.5 * x - 0.5 * x * x * x;}; // Eq. 19 + auto gBecke = [&](double x) {return hBecke(hBecke(hBecke(x)));}; // Eq. 
20 f_3 + auto tBecke = [&](double x) { + // for numerical stability (see Jiashu's notes for details) + if (x > 1.0 - 1e-4) + return 0.0; + const double p1 = hBecke(x); + const double p2 = hBecke(p1); + return - 27.0 * (1. + p2) * (1. + p1) * (1. + x) / (1. - x) / (2. + p2) / (2. + p1) / (2. + x); + }; + + const size_t natoms = mol.natoms(); + const auto& RAB = meta.rab(); + std::vector partitionScratch( natoms ); + std::vector atomDist( natoms ); + + for( size_t i = 0; i < task.points.size(); ++i ) { + + const size_t iParent = task.iParent; + const auto& point = task.points[i]; + const auto w_times_f_i = w_times_f[i]; + + // Compute distances of each center to point + for(size_t iA = 0; iA < natoms; iA++) { + + const double da_x = point[0] - mol[iA].x; + const double da_y = point[1] - mol[iA].y; + const double da_z = point[2] - mol[iA].z; + + atomDist[iA] = std::sqrt(da_x*da_x + da_y*da_y + da_z*da_z); + + } + + // Evaluate unnormalized partition functions + std::fill(partitionScratch.begin(),partitionScratch.end(),1.); + for( size_t iA = 0; iA < natoms; iA++ ) + for( size_t jA = 0; jA < iA; jA++ ){ + + double mu = (atomDist[iA] - atomDist[jA]) / RAB[jA + iA*natoms]; + const double g = gBecke(mu); + + partitionScratch[iA] *= 0.5 * (1. - g); + partitionScratch[jA] *= 0.5 * (1. + g); + } + + double sum = 0.; + for( size_t iA = 0; iA < natoms; iA++ ) sum += partitionScratch[iA]; + + // calculate derivative now + for( size_t iB = 0; iB < natoms; iB++ ) { + if (iB == iParent) continue; + double exc_grad_w_iBx = 0.0, exc_grad_w_iBy = 0.0, exc_grad_w_iBz = 0.0; + + const double uB_x = mol[iB].x - point[0]; + const double uB_y = mol[iB].y - point[1]; + const double uB_z = mol[iB].z - point[2]; + + const double uBA_x =mol[iB].x - mol[iParent].x; + const double uBA_y =mol[iB].y - mol[iParent].y; + const double uBA_z =mol[iB].z - mol[iParent].z; + const double rAB = RAB[iB + iParent*natoms]; + + double mu_AB = (atomDist[iParent] - atomDist[iB]) / rAB; + + // first term is - coef1 * nabla_B mu_BA + double coef1 = tBecke(mu_AB) * w_times_f_i; + exc_grad_w_iBx = - coef1 / rAB * (uB_x / atomDist[iB] + mu_AB * uBA_x /rAB); + exc_grad_w_iBy = - coef1 / rAB * (uB_y / atomDist[iB] + mu_AB * uBA_y /rAB); + exc_grad_w_iBz = - coef1 / rAB * (uB_z / atomDist[iB] + mu_AB * uBA_z /rAB); + + // second term is 1/Z * \sum_{C != B} (P(B)t_BC - P(C)t_CB) nabla_B mu_BC + for( size_t iC = 0; iC < natoms; iC++ ){ + if (iB == iC) continue; + + // coef = (P(B)t_BC - P(C)t_CB) + double mu_BC = (atomDist[iB] - atomDist[iC]) / RAB[iC + iB*natoms]; + double t_BC = tBecke(mu_BC); + double t_CB = tBecke(-mu_BC); + double coef = (partitionScratch[iB] *t_BC - partitionScratch[iC] * t_CB)/ sum * w_times_f_i; + + const double rBC = RAB[iC + iB*natoms]; + + exc_grad_w_iBx -= coef * ((mol[iB].x - point[0]) / atomDist[iB] / rBC - mu_BC * (mol[iB].x - mol[iC].x) / rBC / rBC); + exc_grad_w_iBy -= coef * ((mol[iB].y - point[1]) / atomDist[iB] / rBC - mu_BC * (mol[iB].y - mol[iC].y) / rBC / rBC); + exc_grad_w_iBz -= coef * ((mol[iB].z - point[2]) / atomDist[iB] / rBC - mu_BC * (mol[iB].z - mol[iC].z) / rBC / rBC); + } + + #pragma omp atomic + exc_grad_w[3*iB + 0] += exc_grad_w_iBx; + #pragma omp atomic + exc_grad_w[3*iB + 1] += exc_grad_w_iBy; + #pragma omp atomic + exc_grad_w[3*iB + 2] += exc_grad_w_iBz; + // Use translational invariance to calculate the derivative for the parent atom + #pragma omp atomic + exc_grad_w[3*iParent + 0] -= exc_grad_w_iBx; + #pragma omp atomic + exc_grad_w[3*iParent + 1] -= exc_grad_w_iBy; + #pragma omp atomic 
+ exc_grad_w[3*iParent + 2] -= exc_grad_w_iBz; + + } + } + +} + + +void reference_ssf_weights_1std_contraction_host( + const Molecule& mol, + const MolMeta& meta, + const XCTask& task, + const double* w_times_f, + double* exc_grad_w +){ + + const double safe_magic_ssf_bound = integrator::magic_ssf_factor<> - 1.e-4; + const double w_times_f_thresh = 1.e-12; + const double weight_tol = integrator::ssf_weight_tol; + + auto gFrisch = [&](double x) { + const double s_x = x / integrator::magic_ssf_factor<>; + const double s_x2 = s_x * s_x; + const double s_x3 = s_x * s_x2; + const double s_x5 = s_x3 * s_x2; + const double s_x7 = s_x5 * s_x2; + + return (35.*(s_x - s_x3) + 21.*s_x5 - 5.*s_x7) / 16.; + }; + + auto tFrisch = [&](double x) { + const double s_x = x / integrator::magic_ssf_factor<>; + const double s_x2 = s_x * s_x; + const double s_x3 = s_x * s_x2; + const double numerator = (35.) * (s_x3 + (3.) * s_x2 + (3.) * s_x + (1.)); + const double denominator = (x - integrator::magic_ssf_factor<>) * ((5.)*s_x3 + (20.)*s_x2 + (29.)*s_x + (16.)); + return numerator / denominator ; + }; + + const size_t natoms = mol.natoms(); + const auto& RAB = meta.rab(); + std::vector partitionScratch( natoms ); + std::vector atomDist( natoms ); + + for( size_t i = 0; i < task.points.size(); ++i ) { + const auto& w_times_f_i = w_times_f[i]; + if (fabs(w_times_f_i) < w_times_f_thresh) continue; // weight derivative = 0 when p_A = 0 + const size_t iParent = task.iParent; + const auto& point = task.points[i]; + + const auto dist_cutoff = 0.5 * (1-integrator::magic_ssf_factor<>) * task.dist_nearest; + + // Compute dist to parent atom + { + const double da_x = point[0] - mol[iParent].x; + const double da_y = point[1] - mol[iParent].y; + const double da_z = point[2] - mol[iParent].z; + + atomDist[iParent] = std::sqrt(da_x*da_x + da_y*da_y + da_z*da_z); + } + + if( atomDist[iParent] < dist_cutoff ) continue; // weight derivative = 0 when p_A = 1 + + // Compute distances of each center to point + for(size_t iA = 0; iA < natoms; iA++) { + + if( iA == iParent ) continue; + + const double da_x = point[0] - mol[iA].x; + const double da_y = point[1] - mol[iA].y; + const double da_z = point[2] - mol[iA].z; + + atomDist[iA] = std::sqrt(da_x*da_x + da_y*da_y + da_z*da_z); + + } + + // Evaluate unnormalized partition functions + std::fill(partitionScratch.begin(),partitionScratch.end(),1.); + + for( size_t iA = 0; iA < natoms; iA++ ) + for( size_t jA = 0; jA < iA; jA++ ) + if( partitionScratch[iA] > weight_tol or + partitionScratch[jA] > weight_tol ) { + + const double mu = (atomDist[iA] - atomDist[jA]) / RAB[jA + iA*natoms]; + + if( mu <= -integrator::magic_ssf_factor<> ) { + + partitionScratch[jA] = 0.; + + } else if (mu >= integrator::magic_ssf_factor<>) { + + partitionScratch[iA] = 0.; + + } else { + + double g = 0.5 * ( 1. - gFrisch(mu) ); + partitionScratch[iA] *= g; + partitionScratch[jA] *= 1. 
- g; + + } + + } + + double sum = 0.; + for( size_t iA = 0; iA < natoms; iA++ ) sum += partitionScratch[iA]; + + // calculate derivative now + for( size_t iB = 0; iB < natoms; iB++ ) { + if (iB == iParent) continue; + double exc_grad_w_iBx = 0.0, exc_grad_w_iBy = 0.0, exc_grad_w_iBz = 0.0; + + const double rAB = RAB[iB + iParent*natoms]; + double rAB_inv = 1.0 / rAB; + double mu_AB = (atomDist[iParent] - atomDist[iB]) * rAB_inv ; + if( fabs(mu_AB) < safe_magic_ssf_bound) { + const double uB_x = mol[iB].x - point[0]; + const double uB_y = mol[iB].y - point[1]; + const double uB_z = mol[iB].z - point[2]; + + const double uBA_x =mol[iB].x - mol[iParent].x; + const double uBA_y =mol[iB].y - mol[iParent].y; + const double uBA_z =mol[iB].z - mol[iParent].z; + + // first term is - coef1 * nabla_B mu_BA + double coef1 = tFrisch(mu_AB) / rAB * (partitionScratch[iParent]-sum)/sum * w_times_f_i / atomDist[iB]; + exc_grad_w_iBx = coef1 * (uB_x + mu_AB * uBA_x * rAB_inv * atomDist[iB]); + exc_grad_w_iBy = coef1 * (uB_y + mu_AB * uBA_y * rAB_inv * atomDist[iB]); + exc_grad_w_iBz = coef1 * (uB_z + mu_AB * uBA_z * rAB_inv * atomDist[iB]); + } + + if (partitionScratch[iB] > weight_tol){ + for( size_t iC = 0; iC < natoms; iC++ ){ + if (iB == iC) continue; + const double rBC = RAB[iC + iB*natoms]; + double mu_BC = (atomDist[iB] - atomDist[iC]) / rBC; + if(fabs(mu_BC) < safe_magic_ssf_bound){ + double t_BC = tFrisch(mu_BC); + double coef = partitionScratch[iB] * t_BC / rBC/ sum * w_times_f_i; + + exc_grad_w_iBx -= coef * ((mol[iB].x - point[0]) / atomDist[iB] - mu_BC * (mol[iB].x - mol[iC].x) / rBC); + exc_grad_w_iBy -= coef * ((mol[iB].y - point[1]) / atomDist[iB] - mu_BC * (mol[iB].y - mol[iC].y) / rBC); + exc_grad_w_iBz -= coef * ((mol[iB].z - point[2]) / atomDist[iB] - mu_BC * (mol[iB].z - mol[iC].z) / rBC); + + if(iC != iParent) { + + double C_x = coef * ((mol[iC].x - point[0]) / atomDist[iC] + mu_BC * (mol[iC].x - mol[iB].x) / rBC); + double C_y = coef * ((mol[iC].y - point[1]) / atomDist[iC] + mu_BC * (mol[iC].y - mol[iB].y) / rBC); + double C_z = coef * ((mol[iC].z - point[2]) / atomDist[iC] + mu_BC * (mol[iC].z - mol[iB].z) / rBC); + // Update exc_grad_w_iC + #pragma omp atomic + exc_grad_w[3*iC + 0] += C_x; + #pragma omp atomic + exc_grad_w[3*iC + 1] += C_y; + #pragma omp atomic + exc_grad_w[3*iC + 2] += C_z; + // Update exc_grad_w for the parent atom + #pragma omp atomic + exc_grad_w[3*iParent + 0] -= C_x; + #pragma omp atomic + exc_grad_w[3*iParent + 1] -= C_y; + #pragma omp atomic + exc_grad_w[3*iParent + 2] -= C_z; + } + + } + } + } + + #pragma omp atomic + exc_grad_w[3*iB + 0] += exc_grad_w_iBx; + #pragma omp atomic + exc_grad_w[3*iB + 1] += exc_grad_w_iBy; + #pragma omp atomic + exc_grad_w[3*iB + 2] += exc_grad_w_iBz; + // Use translational invariance to calculate the derivative for the parent atom + #pragma omp atomic + exc_grad_w[3*iParent + 0] -= exc_grad_w_iBx; + #pragma omp atomic + exc_grad_w[3*iParent + 1] -= exc_grad_w_iBy; + #pragma omp atomic + exc_grad_w[3*iParent + 2] -= exc_grad_w_iBz; + + } + } + +} + + + } diff --git a/src/xc_integrator/local_work_driver/host/reference/weights.hpp b/src/xc_integrator/local_work_driver/host/reference/weights.hpp index f2f8c46d..7b79a156 100644 --- a/src/xc_integrator/local_work_driver/host/reference/weights.hpp +++ b/src/xc_integrator/local_work_driver/host/reference/weights.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of 
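For orientation, the quantity both contraction routines accumulate can be summarized as follows; the notation (cell function P_A, switching function s and its logarithmic derivative t) is an editorial restatement inferred from the loops above, not normative documentation.

\[
  w_A(\mathbf r) = \frac{P_A(\mathbf r)}{\sum_C P_C(\mathbf r)}, \qquad
  P_A(\mathbf r) = \prod_{B\neq A} s(\mu_{AB}), \qquad
  \mu_{AB} = \frac{r_A - r_B}{R_{AB}},
\]
\[
  \texttt{exc\_grad\_w}[3B+k] \;\mathrel{+}= \sum_i (w f)_i \,
  \frac{\partial \ln w_{A(i)}(\mathbf r_i)}{\partial (R_B)_k},
  \qquad t(\mu) = \frac{s'(\mu)}{s(\mu)}, \quad s(\mu) = \tfrac12\,[\,1 - g(\mu)\,],
\]

with g the Becke or Frisch (SSF) switching polynomial, A(i) the parent atom of point i, and the parent-atom components recovered through translational invariance, \(\partial_{R_A} = -\sum_{B\neq A} \partial_{R_B}\), exactly as the atomic updates above do. The SSF routine rearranges the same algebra slightly differently, but the accumulated quantity is the same.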
- * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -32,4 +36,36 @@ void reference_lko_weights_host( task_iterator task_end ); +void reference_becke_weights_1st_derivative_host( + const Molecule& mol, + const MolMeta& meta, + const XCTask& task, + double* weight_deri +); + +void reference_ssf_weights_1st_derivative_host( + const Molecule& mol, + const MolMeta& meta, + const XCTask& task, + double* weight_deri +); + +// Becke weights 1st derivative contracted with integrator +void reference_becke_weights_1std_contraction_host( + const Molecule& mol, + const MolMeta& meta, + const XCTask& task, + const double* w_times_f, + double* exc_grad_w +); + +// SSF weights 1st derivative contracted with integrator +void reference_ssf_weights_1std_contraction_host( + const Molecule& mol, + const MolMeta& meta, + const XCTask& task, + const double* w_times_f, + double* exc_grad_w +); + } diff --git a/src/xc_integrator/local_work_driver/host/reference_local_host_work_driver.cxx b/src/xc_integrator/local_work_driver/host/reference_local_host_work_driver.cxx index c83d3800..192cfcd3 100644 --- a/src/xc_integrator/local_work_driver/host/reference_local_host_work_driver.cxx +++ b/src/xc_integrator/local_work_driver/host/reference_local_host_work_driver.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
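As a usage sketch only: a host-side driver might invoke the per-task contraction declared above as below. The task loop, the construction of w_times_f (quadrature weight times the contracted XC integrand at each grid point), the include path, and the namespace qualification are assumptions for illustration; only the function signature comes from this patch.

```cpp
#include <vector>
#include "reference/weights.hpp"   // include path assumed

// Hypothetical driver: accumulate the weight-derivative contribution to the
// XC gradient over a set of tasks. exc_grad_w has length 3*natoms and is
// assumed to be zero-initialized by the caller.
void contract_weight_gradient( const GauXC::Molecule& mol,
                               const GauXC::MolMeta&  meta,
                               const std::vector<GauXC::XCTask>& tasks,
                               const std::vector<std::vector<double>>& w_times_f,
                               double* exc_grad_w ) {
  #pragma omp parallel for
  for( size_t it = 0; it < tasks.size(); ++it ) {
    // Becke partitioning shown; the SSF variant has an identical signature.
    // The routine guards its own updates with #pragma omp atomic.
    GauXC::reference_becke_weights_1std_contraction_host(
      mol, meta, tasks[it], w_times_f[it].data(), exc_grad_w );
  }
}
```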
* * See LICENSE.txt for details */ @@ -52,6 +56,21 @@ namespace GauXC { } } + void ReferenceLocalHostWorkDriver::eval_weight_1st_deriv_contracted( + XCWeightAlg weight_alg, const Molecule& mol, const MolMeta& meta, + const XCTask& task, const double* w_times_f, double* exc_grad_w ) { + switch( weight_alg ) { + case XCWeightAlg::Becke: + reference_becke_weights_1std_contraction_host( mol, meta, task, w_times_f, exc_grad_w ); + break; + case XCWeightAlg::SSF: + reference_ssf_weights_1std_contraction_host( mol, meta, task, w_times_f, exc_grad_w ); + break; + default: + GAUXC_GENERIC_EXCEPTION("Weight Alg Not Supported"); + } + } + // Collocation void ReferenceLocalHostWorkDriver::eval_collocation( size_t npts, size_t nshells, @@ -1028,6 +1047,541 @@ void ReferenceLocalHostWorkDriver::eval_zmat_gga_vxc_gks( size_t npts, size_t nb } +void ReferenceLocalHostWorkDriver::eval_tmat_lda_vxc_rks( size_t npts, const double* v2rho2, const double* trho, double* A){ + for( int32_t i = 0; i < (int32_t)npts; ++i ) + A[i] = v2rho2[i] * trho[i]; +} + +void ReferenceLocalHostWorkDriver::eval_tmat_lda_vxc_uks( size_t npts, const double* v2rho2, const double* trho, double* A){ + for( int32_t i = 0; i < (int32_t)npts; ++i ) { + A[2*i] = v2rho2[3*i] * trho[2*i] + v2rho2[3*i+1] * trho[2*i+1]; + A[2*i+1] = v2rho2[3*i+1] * trho[2*i] + v2rho2[3*i+2] * trho[2*i+1]; + } +} + +void ReferenceLocalHostWorkDriver::eval_tmat_gga_vxc_rks( size_t npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2gamma2, + const double* trho, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B ){ + + for( int32_t i = 0; i < (int32_t)npts; ++i ) { + + + //calculate trial gamma + const auto tgamma = tdden_x_eval[i] * dden_x_eval[i] + tdden_y_eval[i] * dden_y_eval[i] + tdden_z_eval[i] * dden_z_eval[i]; + + A[i] = v2rho2[i] * trho[i] + 2 * v2rhogamma[i] * tgamma; + + auto B_coef = v2rhogamma[i] * trho[i] + 2 * v2gamma2[i] * tgamma; + + B[i * 3] = 2 * B_coef * dden_x_eval[i] + 2 * vgamma[i] * tdden_x_eval[i]; + B[i * 3 + 1] = 2 * B_coef * dden_y_eval[i] + 2 * vgamma[i] * tdden_y_eval[i]; + B[i * 3 + 2] = 2 * B_coef * dden_z_eval[i] + 2 * vgamma[i] * tdden_z_eval[i]; + + } +} + + +void ReferenceLocalHostWorkDriver::eval_tmat_gga_vxc_uks( size_t npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2gamma2, + const double* trho, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B ){ + + for( int32_t i = 0; i < (int32_t)npts; ++i ) { + + // convert dden_x_eval, dden_y_eval, dden_z_eval to two-spinor representation + const auto dden_x_eval_a = 0.5 * (dden_x_eval[2*i] + dden_x_eval[2*i+1]); + const auto dden_x_eval_b = 0.5 * (dden_x_eval[2*i] - dden_x_eval[2*i+1]); + const auto dden_y_eval_a = 0.5 * (dden_y_eval[2*i] + dden_y_eval[2*i+1]); + const auto dden_y_eval_b = 0.5 * (dden_y_eval[2*i] - dden_y_eval[2*i+1]); + const auto dden_z_eval_a = 0.5 * (dden_z_eval[2*i] + dden_z_eval[2*i+1]); + const auto dden_z_eval_b = 0.5 * (dden_z_eval[2*i] - dden_z_eval[2*i+1]); + // convert tdden_x_eval, tdden_y_eval, tdden_z_eval to two-spinor representation + const auto tdden_x_eval_a = 0.5 * (tdden_x_eval[2*i] + tdden_x_eval[2*i+1]); + const auto tdden_x_eval_b = 0.5 * (tdden_x_eval[2*i] - tdden_x_eval[2*i+1]); + 
const auto tdden_y_eval_a = 0.5 * (tdden_y_eval[2*i] + tdden_y_eval[2*i+1]); + const auto tdden_y_eval_b = 0.5 * (tdden_y_eval[2*i] - tdden_y_eval[2*i+1]); + const auto tdden_z_eval_a = 0.5 * (tdden_z_eval[2*i] + tdden_z_eval[2*i+1]); + const auto tdden_z_eval_b = 0.5 * (tdden_z_eval[2*i] - tdden_z_eval[2*i+1]); + + //calculate trial gamma + const auto tgamma_aa = tdden_x_eval_a * dden_x_eval_a + tdden_y_eval_a * dden_y_eval_a + tdden_z_eval_a * dden_z_eval_a; + const auto tgamma_ab = tdden_x_eval_a * dden_x_eval_b + tdden_y_eval_a * dden_y_eval_b + tdden_z_eval_a * dden_z_eval_b + + tdden_x_eval_b * dden_x_eval_a + tdden_y_eval_b * dden_y_eval_a + tdden_z_eval_b * dden_z_eval_a; + const auto tgamma_bb = tdden_x_eval_b * dden_x_eval_b + tdden_y_eval_b * dden_y_eval_b + tdden_z_eval_b * dden_z_eval_b; + const auto trho_a = trho[2*i]; + const auto trho_b = trho[2*i+1]; + + const auto v2rho2_a_a = v2rho2[3*i]; + const auto v2rho2_a_b = v2rho2[3*i+1]; + const auto v2rho2_b_b = v2rho2[3*i+2]; + const auto v2rhogamma_a_aa = v2rhogamma[6*i]; + const auto v2rhogamma_a_ab = v2rhogamma[6*i+1]; + const auto v2rhogamma_a_bb = v2rhogamma[6*i+2]; + const auto v2rhogamma_b_aa = v2rhogamma[6*i+3]; + const auto v2rhogamma_b_ab = v2rhogamma[6*i+4]; + const auto v2rhogamma_b_bb = v2rhogamma[6*i+5]; + const auto v2gamma2_aa_aa = v2gamma2[6*i]; + const auto v2gamma2_aa_ab = v2gamma2[6*i+1]; + const auto v2gamma2_aa_bb = v2gamma2[6*i+2]; + const auto v2gamma2_ab_ab = v2gamma2[6*i+3]; + const auto v2gamma2_ab_bb = v2gamma2[6*i+4]; + const auto v2gamma2_bb_bb = v2gamma2[6*i+5]; + const auto vgamma_aa = vgamma[3*i]; + const auto vgamma_ab = vgamma[3*i+1]; + const auto vgamma_bb = vgamma[3*i+2]; + + A[2 * i] = v2rho2_a_a * trho_a + 2 * v2rhogamma_a_aa * tgamma_aa + v2rhogamma_a_ab * tgamma_ab + + v2rho2_a_b * trho_b + 2 * v2rhogamma_a_bb * tgamma_bb; + A[2 * i + 1] = v2rho2_b_b * trho_b + 2 * v2rhogamma_b_bb * tgamma_bb + v2rhogamma_b_ab * tgamma_ab + + v2rho2_a_b * trho_a + 2 * v2rhogamma_b_aa * tgamma_aa; + + auto B_coef1 = v2rhogamma_a_aa * trho_a + 2 * v2gamma2_aa_aa * tgamma_aa + v2gamma2_aa_ab * tgamma_ab + + v2rhogamma_b_aa * trho_b + 2 * v2gamma2_aa_bb * tgamma_bb; + auto B_coef2 = v2rhogamma_a_ab * trho_a + 2 * v2gamma2_aa_ab * tgamma_aa + v2gamma2_ab_ab * tgamma_ab + + v2rhogamma_b_ab * trho_b + 2 * v2gamma2_ab_bb * tgamma_bb; + + B[i * 6] = 2 * B_coef1 * dden_x_eval_a + B_coef2 * dden_x_eval_b + 2 * vgamma_aa * tdden_x_eval_a + vgamma_ab * tdden_x_eval_b; + B[i * 6 + 1] = 2 * B_coef1 * dden_y_eval_a + B_coef2 * dden_y_eval_b + 2 * vgamma_aa * tdden_y_eval_a + vgamma_ab * tdden_y_eval_b; + B[i * 6 + 2] = 2 * B_coef1 * dden_z_eval_a + B_coef2 * dden_z_eval_b + 2 * vgamma_aa * tdden_z_eval_a + vgamma_ab * tdden_z_eval_b; + + B_coef1 = v2rhogamma_b_bb * trho_b + 2 * v2gamma2_bb_bb * tgamma_bb + v2gamma2_ab_bb * tgamma_ab + + v2rhogamma_a_bb * trho_a + 2 * v2gamma2_aa_bb * tgamma_aa; + B_coef2 = v2rhogamma_b_ab * trho_b + 2 * v2gamma2_ab_bb * tgamma_bb + v2gamma2_ab_ab * tgamma_ab + + v2rhogamma_a_ab * trho_a + 2 * v2gamma2_aa_ab * tgamma_aa; + + B[i * 6 + 3] = 2 * B_coef1 * dden_x_eval_b + B_coef2 * dden_x_eval_a + 2 * vgamma_bb * tdden_x_eval_b + vgamma_ab * tdden_x_eval_a; + B[i * 6 + 4] = 2 * B_coef1 * dden_y_eval_b + B_coef2 * dden_y_eval_a + 2 * vgamma_bb * tdden_y_eval_b + vgamma_ab * tdden_y_eval_a; + B[i * 6 + 5] = 2 * B_coef1 * dden_z_eval_b + B_coef2 * dden_z_eval_a + 2 * vgamma_bb * tdden_z_eval_b + vgamma_ab * tdden_z_eval_a; + } +} + + +void ReferenceLocalHostWorkDriver::eval_tmat_mgga_vxc_rks( 
size_t npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2rholapl, const double* v2rhotau, + const double* v2gamma2, const double* v2gammalapl, const double* v2gammatau, + const double* v2lapl2, const double* v2lapltau, const double* v2tau2, + const double* trho, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, const double* ttau, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B, double* C){ + + for( int32_t i = 0; i < (int32_t)npts; ++i ) { + + //calculate trial gamma + const auto tgamma = tdden_x_eval[i] * dden_x_eval[i] + tdden_y_eval[i] * dden_y_eval[i] + tdden_z_eval[i] * dden_z_eval[i]; + + A[i] = v2rho2[i] * trho[i] + 2 * v2rhogamma[i] * tgamma + v2rhotau[i] * ttau[i]; + C[i] = v2rhotau[i] * trho[i] + 2 * v2gammatau[i] * tgamma + v2tau2[i] * ttau[i]; + + auto B_coef = v2rhogamma[i] * trho[i] + 2 * v2gamma2[i] * tgamma + v2gammatau[i] * ttau[i]; + + B[i * 3] = 2 * B_coef * dden_x_eval[i] + 2 * vgamma[i] * tdden_x_eval[i]; + B[i * 3 + 1] = 2 * B_coef * dden_y_eval[i] + 2 * vgamma[i] * tdden_y_eval[i]; + B[i * 3 + 2] = 2 * B_coef * dden_z_eval[i] + 2 * vgamma[i] * tdden_z_eval[i]; + + } + +} + + +void ReferenceLocalHostWorkDriver::eval_tmat_mgga_vxc_uks( size_t npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2rholapl, const double* v2rhotau, + const double* v2gamma2, const double* v2gammalapl, const double* v2gammatau, + const double* v2lapl2, const double* v2lapltau, const double* v2tau2, + const double* trho, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, const double* ttau, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B, double* C){ + + // Laplacian is not supported now + if( v2rholapl != nullptr || v2gammalapl != nullptr || v2lapltau != nullptr || v2lapl2 != nullptr ) + GAUXC_GENERIC_EXCEPTION(std::string("Laplacian not supported")); + + for( int32_t i = 0; i < (int32_t)npts; ++i ) { + + // convert dden_x_eval, dden_y_eval, dden_z_eval to two-spinor representation + const auto dden_x_eval_a = 0.5 * (dden_x_eval[2*i] + dden_x_eval[2*i+1]); + const auto dden_x_eval_b = 0.5 * (dden_x_eval[2*i] - dden_x_eval[2*i+1]); + const auto dden_y_eval_a = 0.5 * (dden_y_eval[2*i] + dden_y_eval[2*i+1]); + const auto dden_y_eval_b = 0.5 * (dden_y_eval[2*i] - dden_y_eval[2*i+1]); + const auto dden_z_eval_a = 0.5 * (dden_z_eval[2*i] + dden_z_eval[2*i+1]); + const auto dden_z_eval_b = 0.5 * (dden_z_eval[2*i] - dden_z_eval[2*i+1]); + // convert tdden_x_eval, tdden_y_eval, tdden_z_eval to two-spinor representation + const auto tdden_x_eval_a = 0.5 * (tdden_x_eval[2*i] + tdden_x_eval[2*i+1]); + const auto tdden_x_eval_b = 0.5 * (tdden_x_eval[2*i] - tdden_x_eval[2*i+1]); + const auto tdden_y_eval_a = 0.5 * (tdden_y_eval[2*i] + tdden_y_eval[2*i+1]); + const auto tdden_y_eval_b = 0.5 * (tdden_y_eval[2*i] - tdden_y_eval[2*i+1]); + const auto tdden_z_eval_a = 0.5 * (tdden_z_eval[2*i] + tdden_z_eval[2*i+1]); + const auto tdden_z_eval_b = 0.5 * (tdden_z_eval[2*i] - tdden_z_eval[2*i+1]); + + //calculate trial gamma + const auto tgamma_aa = tdden_x_eval_a * dden_x_eval_a + tdden_y_eval_a * dden_y_eval_a + tdden_z_eval_a * dden_z_eval_a; + const auto tgamma_ab = tdden_x_eval_a * dden_x_eval_b + tdden_y_eval_a * dden_y_eval_b + tdden_z_eval_a * dden_z_eval_b + + tdden_x_eval_b * dden_x_eval_a + tdden_y_eval_b * dden_y_eval_a + tdden_z_eval_b * 
dden_z_eval_a; + const auto tgamma_bb = tdden_x_eval_b * dden_x_eval_b + tdden_y_eval_b * dden_y_eval_b + tdden_z_eval_b * dden_z_eval_b; + const auto trho_a = trho[2*i]; + const auto trho_b = trho[2*i+1]; + const auto ttau_a = ttau[2*i]; + const auto ttau_b = ttau[2*i+1]; + + const auto v2rho2_a_a = v2rho2[3*i]; + const auto v2rho2_a_b = v2rho2[3*i+1]; + const auto v2rho2_b_b = v2rho2[3*i+2]; + const auto v2rhogamma_a_aa = v2rhogamma[6*i]; + const auto v2rhogamma_a_ab = v2rhogamma[6*i+1]; + const auto v2rhogamma_a_bb = v2rhogamma[6*i+2]; + const auto v2rhogamma_b_aa = v2rhogamma[6*i+3]; + const auto v2rhogamma_b_ab = v2rhogamma[6*i+4]; + const auto v2rhogamma_b_bb = v2rhogamma[6*i+5]; + const auto v2gamma2_aa_aa = v2gamma2[6*i]; + const auto v2gamma2_aa_ab = v2gamma2[6*i+1]; + const auto v2gamma2_aa_bb = v2gamma2[6*i+2]; + const auto v2gamma2_ab_ab = v2gamma2[6*i+3]; + const auto v2gamma2_ab_bb = v2gamma2[6*i+4]; + const auto v2gamma2_bb_bb = v2gamma2[6*i+5]; + const auto vgamma_aa = vgamma[3*i]; + const auto vgamma_ab = vgamma[3*i+1]; + const auto vgamma_bb = vgamma[3*i+2]; + const auto v2rhotau_a_a = v2rhotau[4*i]; + const auto v2rhotau_a_b = v2rhotau[4*i+1]; + const auto v2rhotau_b_a = v2rhotau[4*i+2]; + const auto v2rhotau_b_b = v2rhotau[4*i+3]; + const auto v2tau2_a_a = v2tau2[3*i]; + const auto v2tau2_a_b = v2tau2[3*i+1]; + const auto v2tau2_b_b = v2tau2[3*i+2]; + const auto v2gammatau_aa_a = v2gammatau[6*i]; + const auto v2gammatau_aa_b = v2gammatau[6*i+1]; + const auto v2gammatau_ab_a = v2gammatau[6*i+2]; + const auto v2gammatau_ab_b = v2gammatau[6*i+3]; + const auto v2gammatau_bb_a = v2gammatau[6*i+4]; + const auto v2gammatau_bb_b = v2gammatau[6*i+5]; + + + A[2 * i] = v2rho2_a_a * trho_a + 2 * v2rhogamma_a_aa * tgamma_aa + v2rhogamma_a_ab * tgamma_ab + v2rhotau_a_a * ttau_a + + v2rho2_a_b * trho_b + 2 * v2rhogamma_a_bb * tgamma_bb + v2rhotau_a_b * ttau_b; + A[2 * i + 1] = v2rho2_b_b * trho_b + 2 * v2rhogamma_b_bb * tgamma_bb + v2rhogamma_b_ab * tgamma_ab + v2rhotau_b_b * ttau_b + + v2rho2_a_b * trho_a + 2 * v2rhogamma_b_aa * tgamma_aa + v2rhotau_b_a * ttau_a; + + C[2 * i] = v2rhotau_a_a * trho_a + 2 * v2gammatau_aa_a * tgamma_aa + v2gammatau_ab_a * tgamma_ab + v2tau2_a_a * ttau_a + + v2rhotau_b_a * trho_b + 2 * v2gammatau_bb_a * tgamma_bb + v2tau2_a_b * ttau_b; + C[2 * i + 1] = v2rhotau_b_b * trho_b + 2 * v2gammatau_bb_b * tgamma_bb + v2gammatau_ab_b * tgamma_ab + v2tau2_b_b * ttau_b + + v2rhotau_a_b * trho_a + 2 * v2gammatau_aa_b * tgamma_aa + v2tau2_a_b * ttau_a; + + auto B_coef1 = v2rhogamma_a_aa * trho_a + 2 * v2gamma2_aa_aa * tgamma_aa + v2gamma2_aa_ab * tgamma_ab + v2gammatau_aa_a * ttau_a + + v2rhogamma_b_aa * trho_b + 2 * v2gamma2_aa_bb * tgamma_bb + v2gammatau_aa_b * ttau_b; + auto B_coef2 = v2rhogamma_a_ab * trho_a + 2 * v2gamma2_aa_ab * tgamma_aa + v2gamma2_ab_ab * tgamma_ab + v2gammatau_ab_a * ttau_a + + v2rhogamma_b_ab * trho_b + 2 * v2gamma2_ab_bb * tgamma_bb + v2gammatau_ab_b * ttau_b; + + B[i * 6] = 2 * B_coef1 * dden_x_eval_a + B_coef2 * dden_x_eval_b + 2 * vgamma_aa * tdden_x_eval_a + vgamma_ab * tdden_x_eval_b; + B[i * 6 + 1] = 2 * B_coef1 * dden_y_eval_a + B_coef2 * dden_y_eval_b + 2 * vgamma_aa * tdden_y_eval_a + vgamma_ab * tdden_y_eval_b; + B[i * 6 + 2] = 2 * B_coef1 * dden_z_eval_a + B_coef2 * dden_z_eval_b + 2 * vgamma_aa * tdden_z_eval_a + vgamma_ab * tdden_z_eval_b; + + B_coef1 = v2rhogamma_b_bb * trho_b + 2 * v2gamma2_bb_bb * tgamma_bb + v2gamma2_ab_bb * tgamma_ab + v2gammatau_bb_b * ttau_b + + v2rhogamma_a_bb * trho_a + 2 * v2gamma2_aa_bb * tgamma_aa 
+ v2gammatau_bb_a * ttau_a; + B_coef2 = v2rhogamma_b_ab * trho_b + 2 * v2gamma2_ab_bb * tgamma_bb + v2gamma2_ab_ab * tgamma_ab + v2gammatau_ab_b * ttau_b + + v2rhogamma_a_ab * trho_a + 2 * v2gamma2_aa_ab * tgamma_aa + v2gammatau_ab_a * ttau_a; + + B[i * 6 + 3] = 2 * B_coef1 * dden_x_eval_b + B_coef2 * dden_x_eval_a + 2 * vgamma_bb * tdden_x_eval_b + vgamma_ab * tdden_x_eval_a; + B[i * 6 + 4] = 2 * B_coef1 * dden_y_eval_b + B_coef2 * dden_y_eval_a + 2 * vgamma_bb * tdden_y_eval_b + vgamma_ab * tdden_y_eval_a; + B[i * 6 + 5] = 2 * B_coef1 * dden_z_eval_b + B_coef2 * dden_z_eval_a + 2 * vgamma_bb * tdden_z_eval_b + vgamma_ab * tdden_z_eval_a; + + } +} + + +// Eval Z Matrix LDA VXC for two-spinors +void ReferenceLocalHostWorkDriver::eval_zmat_lda_vxc_uks_ts( size_t npts, size_t nbf, + const double* vrho, const double* basis_eval, double* Za, size_t ldza, + double* Zb, size_t ldzb ) { + blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Za, ldza); + blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Zb, ldzb); + for( int32_t i = 0; i < (int32_t)npts; ++i ) { + //eq. 56 https://doi.org/10.1140/epjb/e2018-90170-1 + GauXC::blas::scal( nbf, 0.5 * vrho[2*i], Za + i*ldza, 1 ); + GauXC::blas::scal( nbf, 0.5 * vrho[2*i+1], Zb + i*ldzb, 1 ); + } +} + +void ReferenceLocalHostWorkDriver::eval_Bvec_gga_vxc_rks_ts( size_t npts, const double* vgamma, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* B ){ + + for( int32_t i = 0; i < (int32_t)npts; ++i ) { + B[i*3] = 2 * vgamma[i] * dden_x_eval[i]; + B[i*3+1] = 2 * vgamma[i] * dden_y_eval[i]; + B[i*3+2] = 2 * vgamma[i]* dden_z_eval[i]; + } +} + +void ReferenceLocalHostWorkDriver::eval_zmat_gga_vxc_rks_ts( size_t npts, size_t nbf, + const double* A, const double* B, const double* basis_eval, + const double* dbasis_x_eval, const double* dbasis_y_eval, + const double* dbasis_z_eval, double* Z, + size_t ldz) { + + if( ldz != nbf ) GAUXC_GENERIC_EXCEPTION(std::string("Invalid Dims")); + blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Z, ldz); + + for( int32_t i = 0; i < (int32_t)npts; ++i ) { + + const int32_t ioff = i * nbf; + + auto* z_col = Z + ioff; + auto* bf_x_col = dbasis_x_eval + ioff; + auto* bf_y_col = dbasis_y_eval + ioff; + auto* bf_z_col = dbasis_z_eval + ioff; + + GauXC::blas::scal( nbf, 0.5*A[i], z_col, 1 ); + + blas::axpy( nbf, B[i*3], bf_x_col, 1, z_col, 1 ); + blas::axpy( nbf, B[i*3+1], bf_y_col, 1, z_col, 1 ); + blas::axpy( nbf, B[i*3+2], bf_z_col, 1, z_col, 1 ); + + } +} + + +void ReferenceLocalHostWorkDriver::eval_Bvec_gga_vxc_uks_ts( size_t npts, const double* vgamma, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* B ){ + + + for( int32_t i = 0; i < (int32_t)npts; ++i ) { + const auto gga_fact_aa = vgamma[3*i]; + const auto gga_fact_ab = vgamma[3*i+1]; + const auto gga_fact_bb = vgamma[3*i+2]; + + // dden_x_eval, dden_y_eval, dden_z_eval are all still in Pauli representation + // so we need to convert them to the two spinor representation + const auto dden_x_eval_a = 0.5 * (dden_x_eval[2*i] + dden_x_eval[2*i+1]); + const auto dden_x_eval_b = 0.5 * (dden_x_eval[2*i] - dden_x_eval[2*i+1]); + const auto dden_y_eval_a = 0.5 * (dden_y_eval[2*i] + dden_y_eval[2*i+1]); + const auto dden_y_eval_b = 0.5 * (dden_y_eval[2*i] - dden_y_eval[2*i+1]); + const auto dden_z_eval_a = 0.5 * (dden_z_eval[2*i] + dden_z_eval[2*i+1]); + const auto dden_z_eval_b = 0.5 * (dden_z_eval[2*i] - dden_z_eval[2*i+1]); + + B[i*6] = 2 * gga_fact_aa * dden_x_eval_a + gga_fact_ab * dden_x_eval_b; + B[i*6+1] = 
2 * gga_fact_aa * dden_y_eval_a + gga_fact_ab * dden_y_eval_b; + B[i*6+2] = 2 * gga_fact_aa * dden_z_eval_a + gga_fact_ab * dden_z_eval_b; + + B[i*6+3] = 2 * gga_fact_bb * dden_x_eval_b + gga_fact_ab * dden_x_eval_a; + B[i*6+4] = 2 * gga_fact_bb * dden_y_eval_b + gga_fact_ab * dden_y_eval_a; + B[i*6+5] = 2 * gga_fact_bb * dden_z_eval_b + gga_fact_ab * dden_z_eval_a; + } +} +void ReferenceLocalHostWorkDriver::eval_zmat_gga_vxc_uks_ts( size_t npts, size_t nbf, + const double* A, const double* B, const double* basis_eval, + const double* dbasis_x_eval, const double* dbasis_y_eval, + const double* dbasis_z_eval, double* Za, + size_t ldza, double* Zb, size_t ldzb ) { + + + if( ldza != nbf ) GAUXC_GENERIC_EXCEPTION(std::string("Invalid Dims")); + if( ldzb != nbf ) GAUXC_GENERIC_EXCEPTION(std::string("Invalid Dims")); + blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Za, ldza); + blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Zb, ldzb); + + for( int32_t i = 0; i < (int32_t)npts; ++i ) { + + const int32_t ioff = i * nbf; + + auto* za_col = Za + ioff; + auto* zb_col = Zb + ioff; + auto* bf_x_col = dbasis_x_eval + ioff; + auto* bf_y_col = dbasis_y_eval + ioff; + auto* bf_z_col = dbasis_z_eval + ioff; + + GauXC::blas::scal( nbf, 0.5*A[2*i], za_col, 1 ); //additional 0.5 is from eq 56 in petrone 2018 eur phys journal b "an efficient implementation of .. " + GauXC::blas::scal( nbf, 0.5*A[2*i+1], zb_col, 1 ); + + blas::axpy( nbf, B[i*6], bf_x_col, 1, za_col, 1 ); + blas::axpy( nbf, B[i*6+1], bf_y_col, 1, za_col, 1 ); + blas::axpy( nbf, B[i*6+2], bf_z_col, 1, za_col, 1 ); + + blas::axpy( nbf, B[i*6+3], bf_x_col, 1, zb_col, 1 ); + blas::axpy( nbf, B[i*6+4], bf_y_col, 1, zb_col, 1 ); + blas::axpy( nbf, B[i*6+5], bf_z_col, 1, zb_col, 1 ); + + } +} + + +void ReferenceLocalHostWorkDriver::eval_zmat_gga_vxc_uks_ts( size_t npts, size_t nbf, + const double* vrho, const double* vgamma, const double* basis_eval, + const double* dbasis_x_eval, const double* dbasis_y_eval, + const double* dbasis_z_eval, const double* dden_x_eval, + const double* dden_y_eval, const double* dden_z_eval, double* Za, + size_t ldza, double* Zb, size_t ldzb ) { + + + if( ldza != nbf ) GAUXC_GENERIC_EXCEPTION(std::string("Invalid Dims")); + if( ldzb != nbf ) GAUXC_GENERIC_EXCEPTION(std::string("Invalid Dims")); + blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Za, ldza); + blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Zb, ldzb); + + for( int32_t i = 0; i < (int32_t)npts; ++i ) { + + const int32_t ioff = i * nbf; + + auto* za_col = Za + ioff; + auto* zb_col = Zb + ioff; + auto* bf_x_col = dbasis_x_eval + ioff; + auto* bf_y_col = dbasis_y_eval + ioff; + auto* bf_z_col = dbasis_z_eval + ioff; + + GauXC::blas::scal( nbf, 0.5*vrho[2*i], za_col, 1 ); //additional 0.5 is from eq 56 in petrone 2018 eur phys journal b "an efficient implementation of .. 
" + GauXC::blas::scal( nbf, 0.5*vrho[2*i+1], zb_col, 1 ); + + const auto gga_fact_aa = vgamma[3*i]; + const auto gga_fact_ab = vgamma[3*i+1]; + const auto gga_fact_bb = vgamma[3*i+2]; + + // dden_x_eval, dden_y_eval, dden_z_eval are all still in Pauli representation + // so we need to convert them to the two spinor representation + const auto dden_x_eval_a = 0.5 * (dden_x_eval[2*i] + dden_x_eval[2*i+1]); + const auto dden_x_eval_b = 0.5 * (dden_x_eval[2*i] - dden_x_eval[2*i+1]); + const auto dden_y_eval_a = 0.5 * (dden_y_eval[2*i] + dden_y_eval[2*i+1]); + const auto dden_y_eval_b = 0.5 * (dden_y_eval[2*i] - dden_y_eval[2*i+1]); + const auto dden_z_eval_a = 0.5 * (dden_z_eval[2*i] + dden_z_eval[2*i+1]); + const auto dden_z_eval_b = 0.5 * (dden_z_eval[2*i] - dden_z_eval[2*i+1]); + + const auto x_fact_a = 2 * gga_fact_aa * dden_x_eval_a + gga_fact_ab * dden_x_eval_b; + const auto y_fact_a = 2 * gga_fact_aa * dden_y_eval_a + gga_fact_ab * dden_y_eval_b; + const auto z_fact_a = 2 * gga_fact_aa * dden_z_eval_a + gga_fact_ab * dden_z_eval_b; + + const auto x_fact_b = 2 * gga_fact_bb * dden_x_eval_b + gga_fact_ab * dden_x_eval_a; + const auto y_fact_b = 2 * gga_fact_bb * dden_y_eval_b + gga_fact_ab * dden_y_eval_a; + const auto z_fact_b = 2 * gga_fact_bb * dden_z_eval_b + gga_fact_ab * dden_z_eval_a; + + blas::axpy( nbf, x_fact_a, bf_x_col, 1, za_col, 1 ); + blas::axpy( nbf, y_fact_a, bf_y_col, 1, za_col, 1 ); + blas::axpy( nbf, z_fact_a, bf_z_col, 1, za_col, 1 ); + + blas::axpy( nbf, x_fact_b, bf_x_col, 1, zb_col, 1 ); + blas::axpy( nbf, y_fact_b, bf_y_col, 1, zb_col, 1 ); + blas::axpy( nbf, z_fact_b, bf_z_col, 1, zb_col, 1 ); + + } +} + +void ReferenceLocalHostWorkDriver::eval_zmat_mgga_vxc_uks_ts( size_t npts, size_t nbf, + const double* vrho, const double* vgamma, const double* vlapl, + const double* basis_eval, + const double* dbasis_x_eval, const double* dbasis_y_eval, + const double* dbasis_z_eval, const double* lbasis_eval, + const double* dden_x_eval, + const double* dden_y_eval, const double* dden_z_eval, double* Za, + size_t ldza, double* Zb, size_t ldzb ) { + + if( ldza != nbf ) GAUXC_GENERIC_EXCEPTION(std::string("Invalid Dims")); + if( ldzb != nbf ) GAUXC_GENERIC_EXCEPTION(std::string("Invalid Dims")); + blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Za, ldza); + blas::lacpy( 'A', nbf, npts, basis_eval, nbf, Zb, ldzb); + + for( int32_t i = 0; i < (int32_t)npts; ++i ) { + + const int32_t ioff = i * nbf; + + auto* za_col = Za + ioff; + auto* zb_col = Zb + ioff; + auto* bf_x_col = dbasis_x_eval + ioff; + auto* bf_y_col = dbasis_y_eval + ioff; + auto* bf_z_col = dbasis_z_eval + ioff; + auto* lbf_col = lbasis_eval + ioff; + + GauXC::blas::scal( nbf, 0.5*vrho[2*i], za_col, 1 ); //additional 0.5 is from eq 56 in petrone 2018 eur phys journal b "an efficent implementation of .. 
" + GauXC::blas::scal( nbf, 0.5*vrho[2*i+1], zb_col, 1 ); + + // dden_x_eval, dden_y_eval, dden_z_eval are all still in Pauli representation + // so we need to convert them to the two spinor representation + const auto dden_x_eval_a = 0.5 * (dden_x_eval[2*i] + dden_x_eval[2*i+1]); + const auto dden_x_eval_b = 0.5 * (dden_x_eval[2*i] - dden_x_eval[2*i+1]); + const auto dden_y_eval_a = 0.5 * (dden_y_eval[2*i] + dden_y_eval[2*i+1]); + const auto dden_y_eval_b = 0.5 * (dden_y_eval[2*i] - dden_y_eval[2*i+1]); + const auto dden_z_eval_a = 0.5 * (dden_z_eval[2*i] + dden_z_eval[2*i+1]); + const auto dden_z_eval_b = 0.5 * (dden_z_eval[2*i] - dden_z_eval[2*i+1]); + + const auto gga_fact_aa = vgamma[3*i]; + const auto gga_fact_ab = vgamma[3*i+1]; + const auto gga_fact_bb = vgamma[3*i+2]; + + const auto x_fact_a = 2 * gga_fact_aa * dden_x_eval_a + gga_fact_ab * dden_x_eval_b; + const auto y_fact_a = 2 * gga_fact_aa * dden_y_eval_a + gga_fact_ab * dden_y_eval_b; + const auto z_fact_a = 2 * gga_fact_aa * dden_z_eval_a + gga_fact_ab * dden_z_eval_b; + + const auto x_fact_b = 2 * gga_fact_bb * dden_x_eval_b + gga_fact_ab * dden_x_eval_a; + const auto y_fact_b = 2 * gga_fact_bb * dden_y_eval_b + gga_fact_ab * dden_y_eval_a; + const auto z_fact_b = 2 * gga_fact_bb * dden_z_eval_b + gga_fact_ab * dden_z_eval_a; + + blas::axpy( nbf, x_fact_a, bf_x_col, 1, za_col, 1 ); + blas::axpy( nbf, y_fact_a, bf_y_col, 1, za_col, 1 ); + blas::axpy( nbf, z_fact_a, bf_z_col, 1, za_col, 1 ); + + blas::axpy( nbf, x_fact_b, bf_x_col, 1, zb_col, 1 ); + blas::axpy( nbf, y_fact_b, bf_y_col, 1, zb_col, 1 ); + blas::axpy( nbf, z_fact_b, bf_z_col, 1, zb_col, 1 ); + + if (vlapl != nullptr) { + blas::axpy( nbf, vlapl[2*i], lbf_col, 1, za_col, 1); + blas::axpy( nbf, vlapl[2*i + 1], lbf_col, 1, zb_col, 1); + } + + } +} +void ReferenceLocalHostWorkDriver::eval_mmat_mgga_vxc_uks_ts(size_t npts, size_t nbf, + const double* vtau, const double* vlapl, + const double* dbasis_x_eval, const double* dbasis_y_eval, + const double* dbasis_z_eval, + double* mmat_xa, double* mmat_ya, double* mmat_za, size_t ldma, + double* mmat_xb, double* mmat_yb, double* mmat_zb, size_t ldmb) { + + if( ldma != nbf ) GAUXC_GENERIC_EXCEPTION(std::string("Invalid Dims")); + if( ldmb != nbf ) GAUXC_GENERIC_EXCEPTION(std::string("Invalid Dims")); + + blas::lacpy( 'A', nbf, npts, dbasis_x_eval, nbf, mmat_xa, ldma); + blas::lacpy( 'A', nbf, npts, dbasis_y_eval, nbf, mmat_ya, ldma); + blas::lacpy( 'A', nbf, npts, dbasis_z_eval, nbf, mmat_za, ldma); + blas::lacpy( 'A', nbf, npts, dbasis_x_eval, nbf, mmat_xb, ldmb); + blas::lacpy( 'A', nbf, npts, dbasis_y_eval, nbf, mmat_yb, ldmb); + blas::lacpy( 'A', nbf, npts, dbasis_z_eval, nbf, mmat_zb, ldmb); + + for( int32_t i = 0; i < (int32_t)npts; ++i ) { + + const int32_t ioff = i * nbf; + auto* xa_col = mmat_xa + ioff; + auto* ya_col = mmat_ya + ioff; + auto* za_col = mmat_za + ioff; + auto* xb_col = mmat_xb + ioff; + auto* yb_col = mmat_yb + ioff; + auto* zb_col = mmat_zb + ioff; + auto* bf_x_col = dbasis_x_eval + ioff; + auto* bf_y_col = dbasis_y_eval + ioff; + auto* bf_z_col = dbasis_z_eval + ioff; + + const auto tfacta = 0.25 * vtau[2*i]; + const auto tfactb = 0.25 * vtau[2*i+1]; + + blas::scal( nbf, tfacta, xa_col, 1); + blas::scal( nbf, tfacta, ya_col, 1); + blas::scal( nbf, tfacta, za_col, 1); + blas::scal( nbf, tfactb, xb_col, 1); + blas::scal( nbf, tfactb, yb_col, 1); + blas::scal( nbf, tfactb, zb_col, 1); + + if ( vlapl != nullptr ) { + const auto lfacta = vlapl[2*i]; + const auto lfactb = vlapl[2*i+1]; + blas::axpy( 
nbf, lfacta, bf_x_col, 1, xa_col, 1); + blas::axpy( nbf, lfacta, bf_y_col, 1, ya_col, 1); + blas::axpy( nbf, lfacta, bf_z_col, 1, za_col, 1); + blas::axpy( nbf, lfactb, bf_x_col, 1, xb_col, 1); + blas::axpy( nbf, lfactb, bf_y_col, 1, yb_col, 1); + blas::axpy( nbf, lfactb, bf_z_col, 1, zb_col, 1); + } + + } +} + + + + + + // Increment VXC by Z void ReferenceLocalHostWorkDriver::inc_vxc( size_t npts, size_t nbf, size_t nbe, const double* basis_eval, const submat_map_t& submat_map, const double* Z, diff --git a/src/xc_integrator/local_work_driver/host/reference_local_host_work_driver.hpp b/src/xc_integrator/local_work_driver/host/reference_local_host_work_driver.hpp index 1f0d730b..3560b85b 100644 --- a/src/xc_integrator/local_work_driver/host/reference_local_host_work_driver.hpp +++ b/src/xc_integrator/local_work_driver/host/reference_local_host_work_driver.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -30,6 +34,9 @@ struct ReferenceLocalHostWorkDriver : public detail::LocalHostWorkDriverPIMPL { void partition_weights( XCWeightAlg weight_alg, const Molecule& mol, const MolMeta& meta, task_iterator task_begin, task_iterator task_end ) override; + void eval_weight_1st_deriv_contracted( XCWeightAlg weight_alg, const Molecule& mol, + const MolMeta& meta, const XCTask& task, const double* w_times_f, double* exc_grad_w ) override; + void eval_collocation( size_t npts, size_t nshells, size_t nbe, const double* pts, const BasisSet& basis, const int32_t* shell_list, double* basis_eval ) override; @@ -174,6 +181,61 @@ struct ReferenceLocalHostWorkDriver : public detail::LocalHostWorkDriverPIMPL { const double* basis_eval, const submat_map_t& submat_map, const double* Z, size_t ldz, double* VXC, size_t ldvxc, double* scr ) override; + + void eval_tmat_lda_vxc_rks( size_t npts, const double* v2rho2, const double* tden_eval, double* A) override; + void eval_tmat_lda_vxc_uks( size_t npts, const double* v2rho2, const double* trho, double* tmat) override; + + void eval_tmat_gga_vxc_rks( size_t npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2gamma2, + const double* tden_eval, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B ) override; + void eval_tmat_gga_vxc_uks( size_t npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2gamma2, + const double* trho, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B ) override; + + void eval_tmat_mgga_vxc_rks( size_t npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2rholapl, const double* v2rhotau, + const double* v2gamma2, const double* v2gammalapl, const double* v2gammatau, + const double* v2lapl2, const double* v2lapltau, const double* v2tau2, + const double* tden_eval, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, const double* ttau, + const double* 
dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B, double* C) override; + void eval_tmat_mgga_vxc_uks( size_t npts, const double* vgamma, + const double* v2rho2, const double* v2rhogamma, const double* v2rholapl, const double* v2rhotau, + const double* v2gamma2, const double* v2gammalapl, const double* v2gamma_tau, + const double* v2lapl2, const double* v2tau_lapl, const double* v2tau2, + const double* trho, const double* tdden_x_eval, const double* tdden_y_eval, const double* tdden_z_eval, const double* ttau, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* A, double* B, double* C) override; + + void eval_zmat_lda_vxc_uks_ts( size_t npts, size_t nbe, const double* vrho, + const double* basis_eval, double* Za, size_t ldza, double* Zb, size_t ldzb ) override; + void eval_Bvec_gga_vxc_uks_ts( size_t npts, const double* vgamma, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* B ) override; + void eval_zmat_gga_vxc_uks_ts( size_t npts, size_t nbf, const double* A, const double* B, const double* basis_eval, + const double* dbasis_x_eval, const double* dbasis_y_eval, const double* dbasis_z_eval, + double* Za, size_t ldza, double* Zb, size_t ldzb ) override; + void eval_Bvec_gga_vxc_rks_ts( size_t npts, const double* vgamma, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, double* B ) override; + void eval_zmat_gga_vxc_rks_ts( size_t npts, size_t nbf, const double* A, const double* B, const double* basis_eval, + const double* dbasis_x_eval, const double* dbasis_y_eval, const double* dbasis_z_eval, + double* Z, size_t ldz ) override; + + void eval_zmat_gga_vxc_uks_ts( size_t npts, size_t nbe, const double* vrho, + const double* vgamma, const double* basis_eval, const double* dbasis_x_eval, + const double* dbasis_y_eval, const double* dbasis_z_eval, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, + double* Za, size_t ldza, double* Zb, size_t ldzb ) override; + void eval_zmat_mgga_vxc_uks_ts( size_t npts, size_t nbe, const double* vrho, + const double* vgamma, const double* vlapl, + const double* basis_eval, const double* dbasis_x_eval, const double* dbasis_y_eval, + const double* dbasis_z_eval, const double* lbasis_eval, + const double* dden_x_eval, const double* dden_y_eval, const double* dden_z_eval, + double* Za, size_t ldza, double* Zb, size_t ldzb ) override; + void eval_mmat_mgga_vxc_uks_ts( size_t npts, size_t nbe, const double* vtau, + const double* vlapl, const double* dbasis_x_eval, const double* dbasis_y_eval, + const double* dbasis_z_eval, double* mmat_xs, double* mmat_ys, double* mmat_zs, + size_t ldms, double* mmat_xz, double* mmat_yz, double* mmat_zz, size_t ldmz ) override; }; + } diff --git a/src/xc_integrator/local_work_driver/host/rys/CMakeLists.txt b/src/xc_integrator/local_work_driver/host/rys/CMakeLists.txt index c2a15705..00cd6536 100644 --- a/src/xc_integrator/local_work_driver/host/rys/CMakeLists.txt +++ b/src/xc_integrator/local_work_driver/host/rys/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. 
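Read back from eval_tmat_gga_vxc_rks earlier in this patch, the A and B intermediates implement the following kernel contraction with a trial density ρ^t (here f_{ρρ} = v2rho2, f_{ργ} = v2rhogamma, f_{γγ} = v2gamma2, v_γ = vgamma); this is an editorial restatement for orientation:

\[
  \gamma^t_i = \nabla\rho_i \cdot \nabla\rho^t_i, \qquad
  A_i = f_{\rho\rho}\,\rho^t_i + 2 f_{\rho\gamma}\,\gamma^t_i,
\]
\[
  \mathbf B_i = 2\left( f_{\rho\gamma}\,\rho^t_i + 2 f_{\gamma\gamma}\,\gamma^t_i \right)\nabla\rho_i
              + 2\,v_\gamma\,\nabla\rho^t_i ,
\]

with the UKS and meta-GGA variants carrying the analogous spin-resolved and τ-dependent terms.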
# # See LICENSE.txt for details # diff --git a/src/xc_integrator/local_work_driver/host/rys/cheby_boys.cxx b/src/xc_integrator/local_work_driver/host/rys/cheby_boys.cxx index a0b3a95d..0877f2e4 100644 --- a/src/xc_integrator/local_work_driver/host/rys/cheby_boys.cxx +++ b/src/xc_integrator/local_work_driver/host/rys/cheby_boys.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/rys/scripts/rys_kernel_template.hpp b/src/xc_integrator/local_work_driver/host/rys/scripts/rys_kernel_template.hpp index 021f4e81..f9a76814 100644 --- a/src/xc_integrator/local_work_driver/host/rys/scripts/rys_kernel_template.hpp +++ b/src/xc_integrator/local_work_driver/host/rys/scripts/rys_kernel_template.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/local_work_driver/host/util.hpp b/src/xc_integrator/local_work_driver/host/util.hpp index f3de07d0..269234c2 100644 --- a/src/xc_integrator/local_work_driver/host/util.hpp +++ b/src/xc_integrator/local_work_driver/host/util.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/replicated/CMakeLists.txt b/src/xc_integrator/replicated/CMakeLists.txt index b8d12995..4b242ec5 100644 --- a/src/xc_integrator/replicated/CMakeLists.txt +++ b/src/xc_integrator/replicated/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/xc_integrator/replicated/device/CMakeLists.txt b/src/xc_integrator/replicated/device/CMakeLists.txt index 0d789eff..9271fc73 100644 --- a/src/xc_integrator/replicated/device/CMakeLists.txt +++ b/src/xc_integrator/replicated/device/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. 
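The _ts kernels above repeatedly convert quantities stored in the Pauli layout (scalar, z) into spin-up/spin-down components via the same two lines of arithmetic. A tiny helper equivalent to those inline expressions, shown only to make the convention explicit (the helper itself is not part of the patch):

```cpp
// Pauli -> two-spinor components, as used throughout the *_uks_ts kernels:
//   s = rho_a + rho_b (scalar),  z = rho_a - rho_b (z component)
inline void pauli_to_spinor( double s, double z, double& a, double& b ) {
  a = 0.5 * ( s + z );  // spin-up (alpha) component
  b = 0.5 * ( s - z );  // spin-down (beta) component
}
```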
# # See LICENSE.txt for details # diff --git a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator.cxx b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator.cxx index c39632c0..ff64d58a 100644 --- a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator.cxx +++ b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -10,6 +14,8 @@ #include "incore_replicated_xc_device_integrator_exc_vxc.hpp" #include "incore_replicated_xc_device_integrator_exc_grad.hpp" #include "incore_replicated_xc_device_integrator_exx.hpp" +#include "incore_replicated_xc_device_integrator_fxc_contraction.hpp" +#include "incore_replicated_xc_device_integrator_dd.hpp" namespace GauXC { namespace detail { diff --git a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator.hpp b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator.hpp index 30403175..30ff47ce 100644 --- a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator.hpp +++ b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
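The device gradient entry points below now take an IntegratorSettingsXC argument. A minimal sketch of toggling the new weight-derivative contribution, assuming IntegratorSettingsEXC_GRAD lives in the GauXC namespace and that the caller holds the concrete integrator; only the member name include_weight_derivatives and the eval_exc_grad_ signatures come from this patch.

```cpp
// Illustrative only; how the integrator instance is obtained is outside this patch.
template <class Integrator>
void gradient_with_weight_derivatives( Integrator& integrator, int64_t nbf,
                                       const double* Ps, const double* Pz,
                                       double* EXC_GRAD ) {
  GauXC::IntegratorSettingsEXC_GRAD settings;
  settings.include_weight_derivatives = true;  // also accumulate f_i * d(w_i)/dR

  if( Pz ) integrator.eval_exc_grad_( nbf, nbf, Ps, nbf, Pz, nbf, EXC_GRAD, settings ); // UKS
  else     integrator.eval_exc_grad_( nbf, nbf, Ps, nbf, EXC_GRAD, settings );          // RKS
}
```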
* * See LICENSE.txt for details */ @@ -70,13 +74,36 @@ class IncoreReplicatedXCDeviceIntegrator : value_type* EXC, const IntegratorSettingsXC& settings ) override; - void eval_exc_grad_( int64_t m, int64_t n, const value_type* P, - int64_t ldp, value_type* EXC_GRAD ) override; + void eval_exc_grad_( int64_t m, int64_t n, const value_type* P, int64_t ldp, + value_type* EXC_GRAD, const IntegratorSettingsXC& settings ) override; + void eval_exc_grad_( int64_t m, int64_t n, const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, value_type* EXC_GRAD, const IntegratorSettingsXC& settings ) override; void eval_exx_( int64_t m, int64_t n, const value_type* P, int64_t ldp, value_type* K, int64_t ldk, const IntegratorSettingsEXX& settings ) override; + void eval_fxc_contraction_( int64_t m, int64_t n, + const value_type* P, int64_t ldp, + const value_type* tP, int64_t ldtp, + value_type* FXC, int64_t ldfxc, + const IntegratorSettingsXC& ks_settings ) override; + + void eval_fxc_contraction_( int64_t m, int64_t n, + const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, + const value_type* tPs, int64_t ldtps, + const value_type* tPz, int64_t ldtpz, + value_type* FXCs, int64_t ldfxcs, + value_type* FXCz, int64_t ldfxcz, + const IntegratorSettingsXC& ks_settings ) override; + + void eval_dd_psi_( int64_t m, int64_t n, const value_type* P, + int64_t ldp, unsigned max_Ylm, value_type* ddPsi, + int64_t ldPsi ) override; + + void eval_dd_psi_potential_( int64_t m, int64_t n, const value_type* X, + unsigned max_Ylm, value_type* Vddx ) override; void integrate_den_local_work_( const basis_type& basis, const value_type* P, int64_t ldp, value_type *N_EL, @@ -102,14 +129,32 @@ class IncoreReplicatedXCDeviceIntegrator : host_task_iterator task_begin, host_task_iterator task_end, XCDeviceData& device_data ); - void eval_exc_grad_local_work_( const basis_type& basis, const value_type* P, int64_t ldp, + void fxc_contraction_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, + const value_type* tPs, int64_t ldtps, + const value_type* tPz, int64_t ldtpz, + host_task_iterator task_begin, host_task_iterator task_end, + XCDeviceData& device_data); + + void fxc_contraction_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, + const value_type* tPs, int64_t ldtps, + const value_type* tPz, int64_t ldtpz, + value_type *N_EL, + value_type* FXCs, int64_t ldfxcs, + value_type* FXCz, int64_t ldfxcz, + host_task_iterator task_begin, host_task_iterator task_end, + XCDeviceData& device_data ); + + void eval_exc_grad_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, host_task_iterator task_begin, host_task_iterator task_end, - XCDeviceData& device_data ); + XCDeviceData& device_data, const IntegratorSettingsXC& settings ); - void eval_exc_grad_local_work_( const basis_type& basis, const value_type* P, - int64_t ldp, value_type* EXC_GRAD, + void eval_exc_grad_local_work_( const basis_type& basis, const value_type* P, int64_t ldp, + const value_type* Pz, int64_t ldpz, value_type* EXC_GRAD, host_task_iterator task_begin, host_task_iterator task_end, - XCDeviceData& device_data ); + XCDeviceData& device_data, const IntegratorSettingsXC& settings ); diff --git a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_dd.hpp 
b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_dd.hpp new file mode 100644 index 00000000..4898fa0c --- /dev/null +++ b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_dd.hpp @@ -0,0 +1,35 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#pragma once +#include "incore_replicated_xc_device_integrator.hpp" +#include +#include + +namespace GauXC::detail { + + template + void IncoreReplicatedXCDeviceIntegrator:: + eval_dd_psi_( int64_t m, int64_t n, const value_type* P, + int64_t ldp, unsigned max_Ylm, value_type* ddPsi, int64_t ldPsi ) { + GAUXC_GENERIC_EXCEPTION("Device DD-PSI NYI"); + util::unused(m,n,P,ldp,max_Ylm,ddPsi,ldPsi); + } + + template + void IncoreReplicatedXCDeviceIntegrator:: + eval_dd_psi_potential_( int64_t m, int64_t n, const value_type* X, + unsigned max_Ylm, value_type* Vddx ) { + GAUXC_GENERIC_EXCEPTION("Device DD-PHIX NYI"); + util::unused(m,n,X,max_Ylm,Vddx); + } + +} diff --git a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc.hpp b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc.hpp index a8c32da3..9a2a7cf4 100644 --- a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc.hpp +++ b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_grad.hpp b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_grad.hpp index a49eee7f..6c030bc2 100644 --- a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_grad.hpp +++ b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_grad.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
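The device EXC-gradient path that follows splits the nuclear derivative into an integrand term and, when include_weight_derivatives is set, a quadrature-weight term; schematically (editorial restatement, with w_i the partition weight of grid point i and f_i the corresponding XC integrand):

\[
  \frac{\partial E_{xc}}{\partial \mathbf R_B}
  \;=\; \sum_i w_i\,\frac{\partial f_i}{\partial \mathbf R_B}
  \;+\; \sum_i f_i\,\frac{\partial w_i}{\partial \mathbf R_B},
\]

where the second sum is what lwd->eval_weight_1st_deriv_contracted( &device_data, weight_alg ) evaluates on the device, mirroring the host reference routines added earlier in this patch.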
* * See LICENSE.txt for details */ @@ -17,7 +21,7 @@ namespace detail { template void IncoreReplicatedXCDeviceIntegrator:: eval_exc_grad_( int64_t m, int64_t n, const value_type* P, - int64_t ldp, value_type* EXC_GRAD ) { + int64_t ldp, value_type* EXC_GRAD, const IntegratorSettingsXC& settings) { const auto& basis = this->load_balancer_->basis(); @@ -49,8 +53,65 @@ void IncoreReplicatedXCDeviceIntegrator:: // Compute local contributions to EXC Gradient and retrieve // data from device this->timer_.time_op("XCIntegrator.LocalWork", [&](){ - eval_exc_grad_local_work_( basis, P, ldp, EXC_GRAD, tasks.begin(), - tasks.end(), *device_data_ptr ); + eval_exc_grad_local_work_( basis, P, ldp, nullptr, 0, EXC_GRAD, tasks.begin(), + tasks.end(), *device_data_ptr, settings ); + }); + + GAUXC_MPI_CODE( + this->timer_.time_op("XCIntegrator.ImbalanceWait",[&](){ + MPI_Barrier(this->load_balancer_->runtime().comm()); + }); + ) + + this->timer_.time_op("XCIntegrator.Allreduce", [&](){ + this->reduction_driver_->allreduce_inplace( EXC_GRAD, 3*natoms, + ReductionOp::Sum ); + }); + + } + +} + + +template +void IncoreReplicatedXCDeviceIntegrator:: + eval_exc_grad_( int64_t m, int64_t n, const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, value_type* EXC_GRAD, const IntegratorSettingsXC& settings ) { + + const auto& basis = this->load_balancer_->basis(); + + // Check that P is sane + const int64_t nbf = basis.nbf(); + if( m != n ) + GAUXC_GENERIC_EXCEPTION("P Must Be Square"); + if( m != nbf ) + GAUXC_GENERIC_EXCEPTION("P Must Have Same Dimension as Basis"); + if( ldps < nbf ) + GAUXC_GENERIC_EXCEPTION("Invalid LDPS"); + if( ldpz < nbf ) + GAUXC_GENERIC_EXCEPTION("Invalid LDPZ"); + + // Get Tasks + auto& tasks = this->load_balancer_->get_tasks(); + + // Allocate Device memory + auto* lwd = dynamic_cast(this->local_work_driver_.get() ); + auto rt = detail::as_device_runtime(this->load_balancer_->runtime()); + auto device_data_ptr = + this->timer_.time_op("XCIntegrator.DeviceAlloc", + [&](){ return lwd->create_device_data(rt); }); + + const auto& mol = this->load_balancer_->molecule(); + const auto natoms = mol.size(); + if( this->reduction_driver_->takes_device_memory() ) { + GAUXC_GENERIC_EXCEPTION("Device Reduction + EXC Grad NYI"); + } else { + + // Compute local contributions to EXC Gradient and retrieve + // data from device + this->timer_.time_op("XCIntegrator.LocalWork", [&](){ + eval_exc_grad_local_work_( basis, Ps, ldps, Pz, ldpz, EXC_GRAD, tasks.begin(), + tasks.end(), *device_data_ptr, settings ); }); GAUXC_MPI_CODE( @@ -71,15 +132,25 @@ void IncoreReplicatedXCDeviceIntegrator:: template void IncoreReplicatedXCDeviceIntegrator:: eval_exc_grad_local_work_( const basis_type& basis, - const value_type* P, int64_t ldp, + const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, host_task_iterator task_begin, host_task_iterator task_end, - XCDeviceData& device_data ) { + XCDeviceData& device_data, const IntegratorSettingsXC& settings ) { + + const bool is_uks = Pz != nullptr; + const bool is_rks = not is_uks; auto* lwd = dynamic_cast(this->local_work_driver_.get() ); // Setup Aliases const auto& func = *this->func_; const auto& mol = this->load_balancer_->molecule(); + const auto& meta = this->load_balancer_->molmeta(); + + // Sanity gates + if(func.needs_laplacian()) { + GAUXC_GENERIC_EXCEPTION("Device EXC Gradients + Laplacian Dependent MGGAs Not Yet Implemented"); + } // Get basis map BasisSetMap basis_map(basis,mol); @@ -93,33 +164,47 @@ void 
IncoreReplicatedXCDeviceIntegrator:: }; std::sort( task_begin, task_end, task_comparator ); - - + // Misc KS settings + IntegratorSettingsEXC_GRAD exc_grad_settings; + if( auto* tmp = dynamic_cast(&settings) ) { + exc_grad_settings = *tmp; + } // Check that Partition Weights have been calculated auto& lb_state = this->load_balancer_->state(); if( not lb_state.modified_weights_are_stored ) { GAUXC_GENERIC_EXCEPTION("Weights Have Not Been Modified"); } + XCWeightAlg& weight_alg = lb_state.weight_alg; + + + // Processes batches in groups that saturadate available device memory + integrator_term_tracker enabled_terms; + enabled_terms.exc_grad = true; + enabled_terms.weights = true; + + if (is_rks) enabled_terms.ks_scheme = RKS; + else if (is_uks) enabled_terms.ks_scheme = UKS; + + if( func.is_lda() ) enabled_terms.xc_approx = integrator_xc_approx::LDA; + else if( func.is_gga() ) enabled_terms.xc_approx = integrator_xc_approx::GGA; + else if( func.needs_laplacian() ) enabled_terms.xc_approx = integrator_xc_approx::MGGA_LAPL; + else enabled_terms.xc_approx = integrator_xc_approx::MGGA_TAU; // Do XC integration in task batches const auto nbf = basis.nbf(); const auto nshells = basis.nshells(); const auto natoms = mol.size(); device_data.reset_allocations(); - device_data.allocate_static_data_exc_grad( nbf, nshells, natoms ); - device_data.send_static_data_density_basis( P, ldp, nullptr, 0, nullptr, 0, nullptr, 0, basis ); + device_data.allocate_static_data_exc_grad( nbf, nshells, natoms, enabled_terms ); + device_data.send_static_data_density_basis( Ps, ldps, Pz, ldpz, nullptr, 0, nullptr, 0, basis ); + // for weight contribution + device_data.allocate_static_data_weights( natoms ); + device_data.send_static_data_weights( mol, meta ); // Zero integrands device_data.zero_exc_grad_integrands(); - // Processes batches in groups that saturadate available device memory - integrator_term_tracker enabled_terms; - enabled_terms.exc_grad = true; - enabled_terms.ks_scheme = RKS; - if( func.is_lda() ) enabled_terms.xc_approx = integrator_xc_approx::LDA; - else if( func.is_gga() ) enabled_terms.xc_approx = integrator_xc_approx::GGA; - else GAUXC_GENERIC_EXCEPTION("XC Approx NYI"); auto task_it = task_begin; while( task_it != task_end ) { @@ -131,30 +216,51 @@ void IncoreReplicatedXCDeviceIntegrator:: /*** Process the batches ***/ // Evaluate collocation - if( func.is_gga() ) lwd->eval_collocation_hessian ( &device_data ); - else lwd->eval_collocation_gradient( &device_data ); - - // Evaluate X matrix - const bool do_xmat_grad = func.is_gga(); - lwd->eval_xmat( 2.0, &device_data, do_xmat_grad, DEN_S ); - - // Evaluate V variable - lwd->eval_vvar( &device_data, DEN_S, do_xmat_grad ); + if( func.needs_laplacian() ) lwd->eval_collocation_lapgrad ( &device_data ); + else if( !func.is_lda() ) lwd->eval_collocation_hessian ( &device_data ); + else lwd->eval_collocation_gradient( &device_data ); + + // Evaluate X matrix and V vars + const auto xmat_fac = is_rks ? 
2.0 : 1.0; + const auto need_lapl = func.needs_laplacian(); + const auto need_xmat_grad = not func.is_lda(); + auto do_xmat_vvar = [&](density_id den_id) { + lwd->eval_xmat( xmat_fac, &device_data, need_xmat_grad, den_id ); + if(func.is_lda()) lwd->eval_vvars_lda( &device_data, den_id ); + else if(func.is_gga()) lwd->eval_vvars_gga( &device_data, den_id ); + else lwd->eval_vvars_mgga( &device_data, den_id, need_lapl ); + + // Save XMat for EXC gradient assembly + if(is_uks) lwd->save_xmat( &device_data, need_xmat_grad, den_id ); + }; + + do_xmat_vvar(DEN_S); + if (not is_rks) { + do_xmat_vvar(DEN_Z); + } // Evaluate U variables - if( func.is_gga() ) lwd->eval_uvars_gga( &device_data, enabled_terms.ks_scheme ); - else lwd->eval_uvars_lda( &device_data, enabled_terms.ks_scheme ); + if( func.is_mgga() ) lwd->eval_uvars_mgga( &device_data, enabled_terms.ks_scheme, need_lapl ); + else if( func.is_gga() ) lwd->eval_uvars_gga ( &device_data, enabled_terms.ks_scheme ); + else lwd->eval_uvars_lda ( &device_data, enabled_terms.ks_scheme ); // Evaluate XC functional (we need VXC for EXC Gradient) - if( func.is_gga() ) lwd->eval_kern_exc_vxc_gga( func, &device_data ); - else lwd->eval_kern_exc_vxc_lda( func, &device_data ); + if( func.is_mgga() ) lwd->eval_kern_exc_vxc_mgga( func, &device_data ); + else if( func.is_gga() ) lwd->eval_kern_exc_vxc_gga ( func, &device_data ); + else lwd->eval_kern_exc_vxc_lda ( func, &device_data ); - // Do scalar N_EL integration + + // Do scalar N_EL integration lwd->inc_nel( &device_data ); // Increment EXC Gradient - if( func.is_gga() ) lwd->inc_exc_grad_gga( &device_data ); - else lwd->inc_exc_grad_lda( &device_data ); + if( func.is_mgga() ) lwd->inc_exc_grad_mgga( &device_data, enabled_terms.ks_scheme, need_lapl, exc_grad_settings.include_weight_derivatives ); + else if( func.is_gga() ) lwd->inc_exc_grad_gga ( &device_data, enabled_terms.ks_scheme, exc_grad_settings.include_weight_derivatives ); + else lwd->inc_exc_grad_lda ( &device_data, enabled_terms.ks_scheme, exc_grad_settings.include_weight_derivatives ); + + // weight contribution + if(exc_grad_settings.include_weight_derivatives) + lwd->eval_weight_1st_deriv_contracted( &device_data, weight_alg ); } // Loop over batches of batches @@ -163,12 +269,14 @@ void IncoreReplicatedXCDeviceIntegrator:: template void IncoreReplicatedXCDeviceIntegrator:: eval_exc_grad_local_work_( const basis_type& basis, - const value_type* P, int64_t ldp, value_type* EXC_GRAD, + const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, + value_type* EXC_GRAD, host_task_iterator task_begin, host_task_iterator task_end, - XCDeviceData& device_data ) { + XCDeviceData& device_data, const IntegratorSettingsXC& settings ) { // Compute XC gradient and keep data on the device - eval_exc_grad_local_work_( basis, P, ldp, task_begin, task_end, device_data ); + eval_exc_grad_local_work_( basis, Ps, ldps, Pz, ldpz, task_begin, task_end, device_data, settings ); // Receive XC gradient from host double N_EL; diff --git a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_vxc.hpp b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_vxc.hpp index c8cae61d..416a49c5 100644 --- a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_vxc.hpp +++ b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exc_vxc.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley 
National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -237,7 +241,7 @@ void IncoreReplicatedXCDeviceIntegrator:: const auto& func = *this->func_; const auto& mol = this->load_balancer_->molecule(); - if( func.is_mgga() and (is_uks or is_gks) ) GAUXC_GENERIC_EXCEPTION("Device + Polarized mGGAs NYI!"); + if( func.is_mgga() and is_gks ) GAUXC_GENERIC_EXCEPTION("GKS mGGAs NYI!"); // Get basis map BasisSetMap basis_map(basis,mol); @@ -310,12 +314,13 @@ void IncoreReplicatedXCDeviceIntegrator:: const double xmat_fac = is_rks ? 2.0 : 1.0; const bool need_xmat_grad = func.is_mgga(); - const bool need_vvar_grad = func.is_mgga() or func.is_gga(); // Evaluate X matrix and V vars auto do_xmat_vvar = [&](density_id den_id) { lwd->eval_xmat( xmat_fac, &device_data, need_xmat_grad, den_id ); - lwd->eval_vvar( &device_data, den_id, need_vvar_grad ); + if(func.is_lda()) lwd->eval_vvars_lda( &device_data, den_id ); + else if(func.is_gga()) lwd->eval_vvars_gga( &device_data, den_id ); + else lwd->eval_vvars_mgga( &device_data, den_id, need_lapl ); }; do_xmat_vvar(DEN_S); @@ -329,25 +334,25 @@ void IncoreReplicatedXCDeviceIntegrator:: // Evaluate U variables - if( func.is_mgga() ) lwd->eval_uvars_mgga( &device_data, need_lapl ); //<<< TODO: Fn call is different because MGGA U/GKS NYI - else if( func.is_gga() ) lwd->eval_uvars_gga( &device_data, enabled_terms.ks_scheme ); - else lwd->eval_uvars_lda( &device_data, enabled_terms.ks_scheme ); + if( func.is_mgga() ) lwd->eval_uvars_mgga( &device_data, enabled_terms.ks_scheme, need_lapl ); + else if( func.is_gga() ) lwd->eval_uvars_gga ( &device_data, enabled_terms.ks_scheme ); + else lwd->eval_uvars_lda ( &device_data, enabled_terms.ks_scheme ); // Evaluate XC functional if( func.is_mgga() ) lwd->eval_kern_exc_vxc_mgga( func, &device_data ); - else if( func.is_gga() ) lwd->eval_kern_exc_vxc_gga( func, &device_data ); - else lwd->eval_kern_exc_vxc_lda( func, &device_data ); + else if( func.is_gga() ) lwd->eval_kern_exc_vxc_gga ( func, &device_data ); + else lwd->eval_kern_exc_vxc_lda ( func, &device_data ); // Do scalar EXC/N_EL integrations lwd->inc_exc( &device_data ); lwd->inc_nel( &device_data ); - if( not do_vxc ) continue; + if( not do_vxc) continue; auto do_zmat_vxc = [&](density_id den_id) { if( func.is_mgga() ) { - lwd->eval_zmat_mgga_vxc( &device_data, need_lapl); - lwd->eval_mmat_mgga_vxc( &device_data, need_lapl); + lwd->eval_zmat_mgga_vxc( &device_data, enabled_terms.ks_scheme, need_lapl, den_id); + lwd->eval_mmat_mgga_vxc( &device_data, enabled_terms.ks_scheme, need_lapl, den_id); } else if( func.is_gga() ) lwd->eval_zmat_gga_vxc( &device_data, enabled_terms.ks_scheme, den_id ); diff --git a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exx.hpp b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exx.hpp index 1aec0077..c19f5d5b 100644 --- a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exx.hpp +++ b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_exx.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. 
+ * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -130,8 +134,6 @@ void IncoreReplicatedXCDeviceIntegrator:: // Get basis map and shell pairs - //BasisSetMap basis_map(basis,mol); - //ShellPairCollection shell_pairs(basis); auto& basis_map = this->load_balancer_->basis_map(); auto& shell_pairs = this->load_balancer_->shell_pairs(); @@ -257,8 +259,8 @@ void IncoreReplicatedXCDeviceIntegrator:: GAUXC_GENERIC_EXCEPTION("Weights Have Not Been Modified"); } - task_end = std::stable_partition( task_begin, task_end, - []( const auto& t ) { return t.cou_screening.shell_list.size() > 0; } ); + task_end = std::stable_partition( task_begin, task_end, + []( const auto& t ) { return t.cou_screening.shell_list.size() > 0; } ); #if 0 // Lexicographic ordering of tasks diff --git a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_fxc_contraction.hpp b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_fxc_contraction.hpp new file mode 100644 index 00000000..ffc0ca41 --- /dev/null +++ b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_fxc_contraction.hpp @@ -0,0 +1,343 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#pragma once +#include "incore_replicated_xc_device_integrator.hpp" +#include +#include + +namespace GauXC::detail { + + template + void IncoreReplicatedXCDeviceIntegrator:: + eval_fxc_contraction_( int64_t m, int64_t n, + const value_type* P, int64_t ldp, + const value_type* tP, int64_t ldtp, + value_type* FXC, int64_t ldfxc, + const IntegratorSettingsXC& ks_settings ) { + + eval_fxc_contraction_( m, n, P, ldp, nullptr, 0, tP, ldtp, nullptr, 0, + FXC, ldfxc, nullptr, 0, ks_settings ); + } + + + template + void IncoreReplicatedXCDeviceIntegrator:: + eval_fxc_contraction_( int64_t m, int64_t n, + const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, + const value_type* tPs, int64_t ldtps, + const value_type* tPz, int64_t ldtpz, + value_type* FXCs, int64_t ldfxcs, + value_type* FXCz, int64_t ldfxcz, + const IntegratorSettingsXC& ks_settings ) { + const bool is_uks = (Pz != nullptr); + const bool is_rks = !is_uks; + + const auto& basis = this->load_balancer_->basis(); + + // Check that P / FXC are sane + const int64_t nbf = basis.nbf(); + if( m != n ) + GAUXC_GENERIC_EXCEPTION("P/FXC Must Be Square"); + if( m != nbf ) + GAUXC_GENERIC_EXCEPTION("P/FXC Must Have Same Dimension as Basis"); + if( ldps < nbf ) + GAUXC_GENERIC_EXCEPTION("Invalid LDPs"); + if( ldtps < nbf ) + GAUXC_GENERIC_EXCEPTION("Invalid LDTps"); + if( ldfxcs < nbf ) + GAUXC_GENERIC_EXCEPTION("Invalid LDFXCs"); + + if( not is_rks ) { + if( ldpz < nbf ) + GAUXC_GENERIC_EXCEPTION("Invalid LDPz"); + if( ldtpz < nbf ) + GAUXC_GENERIC_EXCEPTION("Invalid LDTpz"); + if( ldfxcz < nbf ) + GAUXC_GENERIC_EXCEPTION("Invalid LDFXCz"); + } + + // Get Tasks + auto& tasks = this->load_balancer_->get_tasks(); + + // Allocate Device memory + auto* lwd = dynamic_cast(this->local_work_driver_.get() ); + auto rt = detail::as_device_runtime(this->load_balancer_->runtime()); + auto device_data_ptr = lwd->create_device_data(rt); + + GAUXC_MPI_CODE( 
MPI_Barrier(rt.comm());) + + // Temporary electron count to judge integrator accuracy + value_type N_EL; + + if( this->reduction_driver_->takes_device_memory() ) { + + // If we can do reductions on the device (e.g. NCCL) + // Don't communicate data back to the host before reduction + this->timer_.time_op("XCIntegrator.LocalWork_FXC", [&](){ + fxc_contraction_local_work_( basis, Ps, ldps, Pz, ldpz, tPs, ldtps, tPz, ldtpz, + tasks.begin(), tasks.end(), *device_data_ptr); + }); + + GAUXC_MPI_CODE( + this->timer_.time_op("XCIntegrator.ImbalanceWait_FXC",[&](){ + MPI_Barrier(this->load_balancer_->runtime().comm()); + }); + ) + + // Reduce results in device memory + double* fxc_s_device = device_data_ptr->fxc_s_device_data(); + double* fxc_z_device; + auto nel_device = device_data_ptr->nel_device_data(); + auto queue = device_data_ptr->queue(); + + if( not is_rks ) { + fxc_z_device = device_data_ptr->fxc_z_device_data(); + // UKS + this->timer_.time_op("XCIntegrator.Allreduce_FXC", [&](){ + this->reduction_driver_->allreduce_inplace( fxc_s_device, nbf*nbf, ReductionOp::Sum, queue ); + this->reduction_driver_->allreduce_inplace( fxc_z_device, nbf*nbf, ReductionOp::Sum, queue ); + this->reduction_driver_->allreduce_inplace( nel_device, 1, ReductionOp::Sum, queue ); + }); + } else { + // RKS + this->timer_.time_op("XCIntegrator.Allreduce_FXC", [&](){ + this->reduction_driver_->allreduce_inplace( fxc_s_device, nbf*nbf, ReductionOp::Sum, queue ); + this->reduction_driver_->allreduce_inplace( nel_device, 1, ReductionOp::Sum, queue ); + }); + } + + // Retrieve data to host + this->timer_.time_op("XCIntegrator.DeviceToHostCopy_FXC",[&](){ + device_data_ptr->retrieve_fxc_contraction_integrands(&N_EL, FXCs, ldfxcs, FXCz, ldfxcz, nullptr, 0, nullptr, 0); + }); + + } else { + + // Compute local contributions to FXC and retrieve + // data from device + this->timer_.time_op("XCIntegrator.LocalWork_FXC", [&](){ + fxc_contraction_local_work_( basis, Ps, ldps, Pz, ldpz, tPs, ldtps, tPz, ldtpz, &N_EL, + FXCs, ldfxcs, FXCz, ldfxcz, tasks.begin(), tasks.end(), *device_data_ptr); + }); + + GAUXC_MPI_CODE( + this->timer_.time_op("XCIntegrator.ImbalanceWait_FXC",[&](){ + MPI_Barrier(this->load_balancer_->runtime().comm()); + }); + ) + + // Reduce Results in host mem + if( is_rks ) { + this->timer_.time_op("XCIntegrator.Allreduce_FXC", [&](){ + this->reduction_driver_->allreduce_inplace( FXCs, nbf*nbf, ReductionOp::Sum ); + this->reduction_driver_->allreduce_inplace( &N_EL, 1, ReductionOp::Sum ); + }); + } else { + // UKS + this->timer_.time_op("XCIntegrator.Allreduce_FXC", [&](){ + this->reduction_driver_->allreduce_inplace( FXCs, nbf*nbf, ReductionOp::Sum ); + this->reduction_driver_->allreduce_inplace( FXCz, nbf*nbf, ReductionOp::Sum ); + this->reduction_driver_->allreduce_inplace( &N_EL, 1, ReductionOp::Sum ); + }); + } + } + } + + template + void IncoreReplicatedXCDeviceIntegrator:: + fxc_contraction_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, + const value_type* tPs, int64_t ldtps, + const value_type* tPz, int64_t ldtpz, + host_task_iterator task_begin, host_task_iterator task_end, + XCDeviceData& device_data) { + const bool is_uks = (Pz != nullptr); + const bool is_rks = !is_uks; + if (not is_rks and not is_uks) { + GAUXC_GENERIC_EXCEPTION("MUST BE EITHER RKS OR UKS!"); + } + + + // Cast LWD to LocalDeviceWorkDriver + auto* lwd = dynamic_cast(this->local_work_driver_.get() ); + + // Setup Aliases + const auto& func = *this->func_; + const auto& 
mol = this->load_balancer_->molecule(); + + // Get basis map + BasisSetMap basis_map(basis,mol); + + // Populate submat maps + device_data.populate_submat_maps( basis.nbf(), task_begin, task_end, basis_map ); + + + // Sort tasks + auto task_comparator = []( const XCTask& a, const XCTask& b ) { + return (a.points.size() * a.bfn_screening.nbe) > (b.points.size() * b.bfn_screening.nbe); + }; + std::sort( task_begin, task_end, task_comparator ); + + + // Check that Partition Weights have been calculated + auto& lb_state = this->load_balancer_->state(); + if( not lb_state.modified_weights_are_stored ) { + GAUXC_GENERIC_EXCEPTION("Weights Have Not Been Modified"); + } + + + integrator_term_tracker enabled_terms; + enabled_terms.fxc_contraction = true; + + if (is_rks) enabled_terms.ks_scheme = RKS; + else if (is_uks) enabled_terms.ks_scheme = UKS; + + if( func.is_lda() ) + enabled_terms.xc_approx = integrator_xc_approx::LDA; + else if( func.is_gga() ) + enabled_terms.xc_approx = integrator_xc_approx::GGA; + else if( func.needs_laplacian() ) + GAUXC_GENERIC_EXCEPTION("FXC contraction does not support MGGA with Laplacian"); + else + enabled_terms.xc_approx = integrator_xc_approx::MGGA_TAU; + + // Do XC integration in task batches + const auto nbf = basis.nbf(); + const auto nshells = basis.nshells(); + device_data.reset_allocations(); + device_data.allocate_static_data_fxc_contraction( nbf, nshells, enabled_terms); + + device_data.send_static_data_density_basis( Ps, ldps, Pz, ldpz, nullptr, 0, nullptr, 0, basis ); + device_data.send_static_data_trial_density( tPs, ldtps, tPz, ldtpz, nullptr, 0, nullptr, 0 ); + + + // Zero integrands + device_data.zero_fxc_contraction_integrands(); + + + auto task_it = task_begin; + while( task_it != task_end ) { + + // Determine next task batch, send relevant data to device (FXC only) + task_it = + device_data.generate_buffers( enabled_terms, basis_map, task_it, task_end ); + + /*** Process the batches ***/ + + const bool need_lapl = func.needs_laplacian(); + // Evaluate collocation + if( func.is_mgga() ) { + if(need_lapl) lwd->eval_collocation_laplacian( &device_data ); + else lwd->eval_collocation_gradient( &device_data ); + } + else if( func.is_gga() ) lwd->eval_collocation_gradient( &device_data ); + else lwd->eval_collocation( &device_data ); + + const double xmat_fac = is_rks ? 
2.0 : 1.0; + const bool need_xmat_grad = func.is_mgga(); + + // Evaluate X matrix and V vars + auto do_xmat_vvar = [&](density_id den_id) { + lwd->eval_xmat( xmat_fac, &device_data, need_xmat_grad, den_id ); + if(func.is_lda()) lwd->eval_vvars_lda( &device_data, den_id ); + else if(func.is_gga()) lwd->eval_vvars_gga( &device_data, den_id ); + else lwd->eval_vvars_mgga( &device_data, den_id, need_lapl ); + }; + + do_xmat_vvar(DEN_S); + if (not is_rks) { + do_xmat_vvar(DEN_Z); + } + + // Evaluate U variables + if( func.is_mgga() ) lwd->eval_uvars_mgga( &device_data, enabled_terms.ks_scheme, need_lapl ); + else if( func.is_gga() ) lwd->eval_uvars_gga ( &device_data, enabled_terms.ks_scheme ); + else lwd->eval_uvars_lda ( &device_data, enabled_terms.ks_scheme ); + + // Evaluate XC functional + if( func.is_mgga() ) lwd->eval_kern_vxc_fxc_mgga( func, &device_data ); + else if( func.is_gga() ) lwd->eval_kern_vxc_fxc_gga ( func, &device_data ); + else lwd->eval_kern_vxc_fxc_lda ( func, &device_data ); + + // Do scalar N_EL integrations + lwd->inc_nel( &device_data ); + + + // Evaluate X matrix and V vars from trial density + auto do_xmat_vvar_trial = [&](density_id den_id) { + lwd->eval_xmat_trial( xmat_fac, &device_data, need_xmat_grad, den_id ); + if(func.is_lda()) lwd->eval_vvars_lda_trial( &device_data, den_id ); + else if(func.is_gga()) lwd->eval_vvars_gga_trial( &device_data, den_id ); + else lwd->eval_vvars_mgga_trial( &device_data, den_id, need_lapl ); + }; + + do_xmat_vvar_trial(DEN_S); + if (not is_rks) { + do_xmat_vvar_trial(DEN_Z); + } + + // Evaluate tmat (it contains the trial u variable evaluation inside) + if( func.is_mgga() ) lwd->eval_tmat_mgga( &device_data, enabled_terms.ks_scheme, need_lapl ); + else if( func.is_gga() ) lwd->eval_tmat_gga ( &device_data, enabled_terms.ks_scheme ); + else lwd->eval_tmat_lda ( &device_data, enabled_terms.ks_scheme ); + + auto do_zmat_fxc = [&](density_id den_id) { + if( func.is_mgga() ) { + lwd->eval_zmat_mgga_fxc( &device_data, need_lapl, den_id); + lwd->eval_mmat_mgga_fxc( &device_data, need_lapl, den_id); + } + else if( func.is_gga() ) + lwd->eval_zmat_gga_fxc( &device_data, den_id ); + else + lwd->eval_zmat_lda_fxc( &device_data, den_id ); + lwd->inc_fxc( &device_data, den_id, func.is_mgga() ); + }; + + do_zmat_fxc(DEN_S); + if(not is_rks) { + do_zmat_fxc(DEN_Z); + } + + } // Loop over batches of batches + + // Symmetrize FXC in device memory + lwd->symmetrize_fxc( &device_data, DEN_S ); + if (not is_rks) { + lwd->symmetrize_fxc( &device_data, DEN_Z ); + } + } + + template + void IncoreReplicatedXCDeviceIntegrator:: + fxc_contraction_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, + const value_type* tPs, int64_t ldtps, + const value_type* tPz, int64_t ldtpz, + value_type *N_EL, + value_type* FXCs, int64_t ldfxcs, + value_type* FXCz, int64_t ldfxcz, + host_task_iterator task_begin, host_task_iterator task_end, + XCDeviceData& device_data ) { + + // Get integrate and keep data on device + fxc_contraction_local_work_( basis, Ps, ldps, Pz, ldpz, tPs, ldtps, tPz, ldtpz, + task_begin, task_end, device_data); + auto rt = detail::as_device_runtime(this->load_balancer_->runtime()); + rt.device_backend()->master_queue_synchronize(); + + // Receive FXC terms from host + this->timer_.time_op("XCIntegrator.DeviceToHostCopy_FXC",[&](){ + device_data.retrieve_fxc_contraction_integrands( N_EL, FXCs, ldfxcs, FXCz, ldfxcz, nullptr, 0, nullptr, 0 ); + }); + } +} diff --git 
a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_integrate_den.hpp b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_integrate_den.hpp index 764bf3c0..d7f224c1 100644 --- a/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_integrate_den.hpp +++ b/src/xc_integrator/replicated/device/incore_replicated_xc_device_integrator_integrate_den.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -130,7 +134,7 @@ void IncoreReplicatedXCDeviceIntegrator:: // Evaluate the density const bool do_vvar_grad = false; - lwd->eval_vvar( &device_data, DEN_S, do_vvar_grad ); + lwd->eval_vvars_lda( &device_data, DEN_S ); // Do scalar N_EL integration lwd->inc_nel( &device_data ); diff --git a/src/xc_integrator/replicated/device/replicated_xc_device_integrator.cxx b/src/xc_integrator/replicated/device/replicated_xc_device_integrator.cxx index 605de1c6..082b74ea 100644 --- a/src/xc_integrator/replicated/device/replicated_xc_device_integrator.cxx +++ b/src/xc_integrator/replicated/device/replicated_xc_device_integrator.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/replicated/device/shell_batched_replicated_xc_device_integrator.cxx b/src/xc_integrator/replicated/device/shell_batched_replicated_xc_device_integrator.cxx index 1ff70ee9..febcd7aa 100644 --- a/src/xc_integrator/replicated/device/shell_batched_replicated_xc_device_integrator.cxx +++ b/src/xc_integrator/replicated/device/shell_batched_replicated_xc_device_integrator.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -11,6 +15,9 @@ #include "shell_batched_replicated_xc_integrator_exc_vxc.hpp" #include "shell_batched_replicated_xc_integrator_exc_grad.hpp" #include "shell_batched_replicated_xc_integrator_exx.hpp" +#include "shell_batched_replicated_xc_integrator_fxc_contraction.hpp" +#include "shell_batched_replicated_xc_integrator_dd_psi.hpp" +#include "shell_batched_replicated_xc_integrator_dd_psi_potential.hpp" namespace GauXC { namespace detail { diff --git a/src/xc_integrator/replicated/device/shell_batched_replicated_xc_device_integrator.hpp b/src/xc_integrator/replicated/device/shell_batched_replicated_xc_device_integrator.hpp index dd165f64..38c8efdc 100644 --- a/src/xc_integrator/replicated/device/shell_batched_replicated_xc_device_integrator.hpp +++ b/src/xc_integrator/replicated/device/shell_batched_replicated_xc_device_integrator.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/replicated/host/CMakeLists.txt b/src/xc_integrator/replicated/host/CMakeLists.txt index ae47dc6d..2b878b68 100644 --- a/src/xc_integrator/replicated/host/CMakeLists.txt +++ b/src/xc_integrator/replicated/host/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator.cxx b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator.cxx index 731eaf84..6695d912 100644 --- a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator.cxx +++ b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -10,7 +14,10 @@ #include "reference_replicated_xc_host_integrator_exc_vxc.hpp" #include "reference_replicated_xc_host_integrator_exc_grad.hpp" #include "reference_replicated_xc_host_integrator_exx.hpp" - +#include "reference_replicated_xc_host_integrator_fxc_contraction.hpp" +#include "reference_replicated_xc_host_integrator_dd_psi.hpp" +#include "reference_replicated_xc_host_integrator_dd_psi_potential.hpp" + namespace GauXC::detail { template diff --git a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator.hpp b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator.hpp index bf5d4d61..a32748eb 100644 --- a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator.hpp +++ b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -72,15 +76,40 @@ class ReferenceReplicatedXCHostIntegrator : /// RKS EXC Gradient - void eval_exc_grad_( int64_t m, int64_t n, const value_type* P, - int64_t ldp, value_type* EXC_GRAD ) override; + void eval_exc_grad_( int64_t m, int64_t n, const value_type* P, int64_t ldp, + value_type* EXC_GRAD, const IntegratorSettingsXC& settings ) override; + /// UKS EXC Gradient + void eval_exc_grad_( int64_t m, int64_t n, const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t lpdz, value_type* EXC_GRAD, const IntegratorSettingsXC& settings ) override; /// sn-LinK void eval_exx_( int64_t m, int64_t n, const value_type* P, int64_t ldp, value_type* K, int64_t ldk, const IntegratorSettingsEXX& settings ) override; - + /// RKS FXC contraction + void eval_fxc_contraction_( int64_t m, int64_t n, + const value_type* P, int64_t ldp, + const value_type* tP, int64_t ldtp, + value_type* FXC, int64_t ldfxc, + const IntegratorSettingsXC& ks_settings ) override; + + // UKS FXC contraction + void eval_fxc_contraction_( int64_t m, int64_t n, + const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, + const value_type* tPs, int64_t ldtps, + const value_type* tPz, int64_t ldtpz, + value_type* FXCs, int64_t ldfxcs, + value_type* FXCz, int64_t ldfxcz, + const IntegratorSettingsXC& ks_settings ) override; + + /// ddX PSi + void eval_dd_psi_( int64_t m, int64_t n, const value_type* P, + int64_t ldp, unsigned max_Ylm, value_type* ddPsi, int64_t ldPsi ) override; + + /// ddX PhiX + void eval_dd_psi_potential_( int64_t m, int64_t n, const value_type* X, unsigned max_Ylm, value_type* Vddx ) override; // Implementation details of integrate_den void integrate_den_local_work_( const value_type* P, int64_t ldp, @@ -99,12 +128,28 @@ class ReferenceReplicatedXCHostIntegrator : task_iterator task_begin, task_iterator task_end ); // Implemetation details of exc_grad - void exc_grad_local_work_( const value_type* P, int64_t ldp, value_type* EXC_GRAD ); + void exc_grad_local_work_( const value_type* Ps, int64_t ldps, const value_type* Pz, int64_t ldpz, + value_type* EXC_GRAD, const IntegratorSettingsXC& ks_settings ); // Implementation details of sn-LinK void exx_local_work_( const value_type* P, int64_t ldp, value_type* K, int64_t ldk, const 
IntegratorSettingsEXX& settings ); + // Implementation details of UKS FXC contraction + void fxc_contraction_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, + const value_type* tPs, int64_t ldtps, + const value_type* tPz, int64_t ldtpz, + value_type* FXCs, int64_t ldfxcs, + value_type* FXCz, int64_t ldfxcz, + value_type *N_EL, const IntegratorSettingsXC& ks_settings, + task_iterator task_begin, task_iterator task_end ); + + // Implementation details of ddX Psi + void dd_psi_local_work_( const value_type* P, int64_t ldp, unsigned max_Ylm, value_type* ddPsi, int64_t ldPsi ); + + void dd_psi_potential_local_work_( const value_type* X, value_type* Vddx, unsigned max_Ylm ); + public: template diff --git a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_dd_psi.hpp b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_dd_psi.hpp new file mode 100644 index 00000000..4a6b1138 --- /dev/null +++ b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_dd_psi.hpp @@ -0,0 +1,176 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#pragma once + +#include "reference_replicated_xc_host_integrator.hpp" +#include "integrator_util/integrator_common.hpp" +#include "integrator_util/spherical_harmonics.hpp" +#include "host/local_host_work_driver.hpp" +#include +#include +#include + +namespace GauXC::detail { +template +void ReferenceReplicatedXCHostIntegrator:: + eval_dd_psi_( int64_t m, int64_t n, const value_type* P, + int64_t ldp, unsigned max_Ylm, value_type* ddPsi, + int64_t ldPsi ) { + + const auto& basis = this->load_balancer_->basis(); + const auto& mol = this->load_balancer_->molecule(); + // Check that P / VXC are sane + const int64_t nbf = basis.nbf(); + if( m != n ) + GAUXC_GENERIC_EXCEPTION("P Must Be Square"); + if( m != nbf ) + GAUXC_GENERIC_EXCEPTION("P Must Have Same Dimension as Basis"); + if( ldp < nbf ) + GAUXC_GENERIC_EXCEPTION("Invalid LDP"); + + // Get Tasks + this->load_balancer_->get_tasks(); + // Compute Local contributions to ddPsi + this->timer_.time_op("XCIntegrator.LocalWork", [&](){ + dd_psi_local_work_( P, ldp, max_Ylm, ddPsi, ldPsi ); + }); + + + // Reduce Results + this->timer_.time_op("XCIntegrator.Allreduce", [&](){ + + if( not this->reduction_driver_->takes_host_memory() ) + GAUXC_GENERIC_EXCEPTION("This Module Only Works With Host Reductions"); + + this->reduction_driver_->allreduce_inplace( ddPsi, ldPsi * mol.size(), ReductionOp::Sum ); + + }); +} + +template +void ReferenceReplicatedXCHostIntegrator:: + dd_psi_local_work_( const value_type* P, int64_t ldp, unsigned max_Ylm, + value_type* dd_Psi, int64_t ldPsi) { + + // Cast LWD to LocalHostWorkDriver + auto* lwd = dynamic_cast(this->local_work_driver_.get()); + + // Setup Aliases + const auto& basis = this->load_balancer_->basis(); + const auto& mol = this->load_balancer_->molecule(); + + // Atom-specific data + int natom = mol.size(); + std::vector radii(natom); + for (int i = 0; i < natom; ++i) { + radii[i] = uff_radius_103(mol[i].Z); + } + // Get basis map + BasisSetMap basis_map(basis,mol); + + const int32_t nbf = basis.nbf(); + // Sort tasks on size (XXX: maybe doesnt matter?) 
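/*
 * A minimal, standalone sketch (assumed names and layouts, not the GauXC API) of the
 * per-atom contraction that the task loop below performs for ddX Psi:
 *
 *   ddPsi(A, lm) += -sum_i w_i * rho(r_i) * Y_lm_scaled(r_i; R_A, radius_A)
 *
 * where the sum runs over the grid points of a task whose parent atom is A. The real
 * code evaluates rho on the batch via the collocation/X-matrix path and performs this
 * contraction with blas::gemm; the plain loops here only illustrate the math.
 */
#include <cstddef>

// ylm[i*nharm + lm] : scaled real spherical harmonic Y_lm at grid point i (harmonic index fastest)
// rho[i], w[i]      : density and quadrature weight at grid point i
// dd_psi_atom[lm]   : accumulated ddPsi coefficients of the task's parent atom
inline void contract_dd_psi(std::size_t npts, std::size_t nharm,
                            const double* ylm, const double* rho,
                            const double* w, double* dd_psi_atom) {
  for (std::size_t lm = 0; lm < nharm; ++lm)
    for (std::size_t i = 0; i < npts; ++i)
      dd_psi_atom[lm] += -w[i] * rho[i] * ylm[i * nharm + lm];
}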
+ auto task_comparator = []( const XCTask& a, const XCTask& b ) { + return (a.points.size() * a.bfn_screening.nbe) > (b.points.size() * b.bfn_screening.nbe); + }; + + auto& tasks = this->load_balancer_->get_tasks(); + std::sort( tasks.begin(), tasks.end(), task_comparator ); + + + // Compute Partition Weights + auto& lb_state = this->load_balancer_->state(); + if( not lb_state.modified_weights_are_stored ) { + GAUXC_GENERIC_EXCEPTION("Weights Have Not Been Modified"); + } + + + // Loop over tasks + const size_t ntasks = tasks.size(); + #pragma omp parallel + { + + XCHostData host_data; // Thread local host data + + #pragma omp for schedule(dynamic) reduction(+:dd_Psi[:natom * ldPsi]) + for( size_t iT = 0; iT < ntasks; ++iT ) { + + // Alias current task + const auto& task = tasks[iT]; + + // Get tasks constants + const int32_t npts = task.points.size(); + const int32_t nbe = task.bfn_screening.nbe; + const int32_t nshells = task.bfn_screening.shell_list.size(); + + const auto* points = task.points.data()->data(); + const auto* weights = task.weights.data(); + const int32_t* shell_list = task.bfn_screening.shell_list.data(); + + // Allocate enough memory for batch + + host_data.nbe_scr .resize( nbe * nbe ); + host_data.zmat .resize( npts * nbe ); + + host_data.basis_eval .resize( npts * nbe ); + host_data.den_scr .resize( npts ); + + + // Alias/Partition out scratch memory + auto* basis_eval = host_data.basis_eval.data(); + auto* den_eval = host_data.den_scr.data(); + auto* nbe_scr = host_data.nbe_scr.data(); + auto* zmat = host_data.zmat.data(); + + int nharmonics = (max_Ylm + 1) * (max_Ylm + 1); + + // Get the submatrix map for batch + std::vector< std::array > submat_map; + std::tie(submat_map, std::ignore) = + gen_compressed_submat_map(basis_map, task.bfn_screening.shell_list, nbf, nbf); + + // Evaluate Collocation + lwd->eval_collocation( npts, nshells, nbe, points, basis, shell_list, + basis_eval ); + + // Evaluate X matrix (P * B) -> store in Z + lwd->eval_xmat( npts, nbf, nbe, submat_map, 1.0, P, ldp, basis_eval, nbe, + zmat, nbe, nbe_scr ); + + // Evaluate density on grid + lwd->eval_uvvar_lda_rks( npts, nbe, basis_eval, zmat, nbe, den_eval ); + + // Populate dd_Psi + const size_t atom_offset = task.iParent * ldPsi; + const double radius = radii[task.iParent]; + const std::array center = {mol[task.iParent].x, mol[task.iParent].y, mol[task.iParent].z}; + + std::vector ylm_matrix(npts * nharmonics); + scaled_ylm_matrix(max_Ylm, points, npts, center, radius, ylm_matrix.data()); + + for (int i = 0; i < npts; ++i) { + den_eval[i] *= -weights[i]; + } + std::vector offset_local_dd_psi(ldPsi, 0.0); + blas::gemm('N', 'N', ldPsi, 1, npts, + 1.0, ylm_matrix.data(), ldPsi, + den_eval, npts, + 0.0, offset_local_dd_psi.data(), ldPsi); + for (int j = 0; j < ldPsi; ++j) { + dd_Psi[atom_offset + j] += offset_local_dd_psi[j]; + } + + } // Loop over tasks + } // End OpenMP region +} +} // namespace GauXC::detail + diff --git a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_dd_psi_potential.hpp b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_dd_psi_potential.hpp new file mode 100644 index 00000000..fad35a29 --- /dev/null +++ b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_dd_psi_potential.hpp @@ -0,0 +1,171 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. 
of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#pragma once + +#include "reference_replicated_xc_host_integrator.hpp" +#include "integrator_util/integrator_common.hpp" +#include "integrator_util/spherical_harmonics.hpp" +#include "host/local_host_work_driver.hpp" +#include +#include +#include "host/blas.hpp" +#include "host/util.hpp" + +namespace GauXC::detail { +template +void ReferenceReplicatedXCHostIntegrator:: + eval_dd_psi_potential_( int64_t m, int64_t n, const value_type* X, unsigned max_Ylm, + value_type* Vddx ) { + + const auto& basis = this->load_balancer_->basis(); + const int32_t nbf = basis.nbf(); + + // Check that m is natom, n is nharmonics + const auto& mol = this->load_balancer_->molecule(); + const size_t natom = mol.size(); + const size_t nharmonics = (max_Ylm + 1) * (max_Ylm + 1); + if (m != nharmonics || n != natom) { + GAUXC_GENERIC_EXCEPTION("m must be nharmonics and n must be natom"); + } + // Get Tasks + this->load_balancer_->get_tasks(); + // Compute Local contributions to EXC / VXC + this->timer_.time_op("XCIntegrator.LocalWork", [&](){ + dd_psi_potential_local_work_( X, Vddx, max_Ylm ); + }); + + // Reduce Results + this->timer_.time_op("XCIntegrator.Allreduce", [&](){ + + if( not this->reduction_driver_->takes_host_memory() ) + GAUXC_GENERIC_EXCEPTION("This Module Only Works With Host Reductions"); + + this->reduction_driver_->allreduce_inplace( Vddx, nbf * nbf, ReductionOp::Sum ); + + }); +} + +template +void ReferenceReplicatedXCHostIntegrator:: + dd_psi_potential_local_work_( const value_type* X, value_type* Vddx, unsigned max_Ylm ) { + + // Cast LWD to LocalHostWorkDriver + auto* lwd = dynamic_cast(this->local_work_driver_.get()); + + // Setup Aliases + const auto& basis = this->load_balancer_->basis(); + const auto& mol = this->load_balancer_->molecule(); + + // Atom-specific data + std::vector radii(mol.size()); + for (int i = 0; i < mol.size(); ++i) { + radii[i] = uff_radius_103(mol[i].Z); + } + + // Get basis map + BasisSetMap basis_map(basis,mol); + + const int32_t nbf = basis.nbf(); + // Sort tasks on size (XXX: maybe doesnt matter?) 
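/*
 * A minimal, standalone sketch (assumed names and layouts, not the GauXC API) of the
 * potential assembly that the task loop below performs for ddX Psi potential:
 *
 *   eta(r_i)  = sum_lm X(A, lm) * Y_lm_scaled(r_i; R_A, radius_A)
 *   V(mu,nu) += sum_i w_i * eta(r_i) * phi_mu(r_i) * phi_nu(r_i)
 *
 * The real code forms eta and the AO contraction with blas::gemm over the screened
 * basis of each batch and scatters the result with inc_by_submat_atomic; the plain
 * loops here only illustrate the math over a full nbf x nbf block.
 */
#include <cstddef>

// bf[i*nbf + mu]    : basis function phi_mu at grid point i (basis index fastest)
// ylm[i*nharm + lm] : scaled real spherical harmonic Y_lm at grid point i
// x[lm]             : harmonic coefficients X of the task's parent atom
// V                 : nbf x nbf potential matrix being accumulated
inline void contract_dd_psi_potential(std::size_t npts, std::size_t nbf, std::size_t nharm,
                                      const double* bf, const double* ylm,
                                      const double* w, const double* x, double* V) {
  for (std::size_t i = 0; i < npts; ++i) {
    double eta = 0.0;
    for (std::size_t lm = 0; lm < nharm; ++lm) eta += x[lm] * ylm[i * nharm + lm];
    const double weta = w[i] * eta;
    for (std::size_t mu = 0; mu < nbf; ++mu)
      for (std::size_t nu = 0; nu < nbf; ++nu)
        V[mu * nbf + nu] += weta * bf[i * nbf + mu] * bf[i * nbf + nu];
  }
}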
+ auto task_comparator = []( const XCTask& a, const XCTask& b ) { + return (a.points.size() * a.bfn_screening.nbe) > (b.points.size() * b.bfn_screening.nbe); + }; + + auto& tasks = this->load_balancer_->get_tasks(); + std::sort( tasks.begin(), tasks.end(), task_comparator ); + + // Compute Partition Weights + auto& lb_state = this->load_balancer_->state(); + if( not lb_state.modified_weights_are_stored ) { + GAUXC_GENERIC_EXCEPTION("Weights Have Not Been Modified"); + } + + // Loop over tasks + const size_t ntasks = tasks.size(); + + #pragma omp parallel + { + + XCHostData host_data; // Thread local host data + + #pragma omp for schedule(dynamic) + for( size_t iT = 0; iT < ntasks; ++iT ) { + + // Alias current task + const auto& task = tasks[iT]; + + // Get tasks constants + const int32_t npts = task.points.size(); + const int32_t nbe = task.bfn_screening.nbe; + const int32_t nshells = task.bfn_screening.shell_list.size(); + + const auto* points = task.points.data()->data(); + const auto* weights = task.weights.data(); + const int32_t* shell_list = task.bfn_screening.shell_list.data(); + + // Allocate enough memory for batch + host_data.basis_eval .resize( npts * nbe ); + auto* basis_eval = host_data.basis_eval.data(); + + host_data.nbe_scr .resize( nbe * nbe ); + auto* vddx_scr = host_data.nbe_scr.data(); + + host_data.den_scr .resize( npts ); + auto etas = host_data.den_scr.data(); + + host_data.zmat .resize( npts * nbe ); + auto* zmat = host_data.zmat.data(); + + int nharmonics = (max_Ylm + 1) * (max_Ylm + 1); + + // Get the submatrix map for batch + std::vector< std::array > submat_map; + std::tie(submat_map, std::ignore) = + gen_compressed_submat_map(basis_map, task.bfn_screening.shell_list, nbf, nbf); + + // Evaluate Collocation + lwd->eval_collocation( npts, nshells, nbe, points, basis, shell_list, + basis_eval ); + + // Project X onto the spherical harmonics basis + const size_t atom_offset = task.iParent * nharmonics; + const double radius = radii[task.iParent]; + std::array center = {mol[task.iParent].x, mol[task.iParent].y, mol[task.iParent].z}; + const value_type* X_i = X + atom_offset; + + std::vector ylm_matrix(npts * nharmonics); + scaled_ylm_matrix(max_Ylm, points, npts, center, radius, ylm_matrix.data()); + + blas::gemm('T', 'N', npts, 1, nharmonics, + 1.0, ylm_matrix.data(), nharmonics, + X_i, nharmonics, + 0.0, etas, npts); + + // zmat = phi * etas + for (int ipt = 0; ipt < npts; ipt++) { + etas[ipt] *= weights[ipt]; + for (int ibe = 0; ibe < nbe; ibe++) { + zmat[ipt * nbe + ibe] = basis_eval[ipt * nbe + ibe] * etas[ipt]; // nbe is fastest, col in column-major + } + } + + // vddx_scr = phi^T * etas * weights * phi + blas::gemm('N', 'T', nbe, nbe, npts, 1.0, basis_eval, nbe, zmat, nbe, 0.0, vddx_scr, nbe); + + detail::inc_by_submat_atomic( nbf, nbf, nbe, nbe, Vddx, nbf, vddx_scr, nbe, + submat_map ); + } // Loop over tasks + } // End OpenMP region +} + +} // namespace GauXC::detail + diff --git a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc.hpp b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc.hpp index da6c3264..de1cb9ef 100644 --- a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc.hpp +++ b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. 
Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_grad.hpp b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_grad.hpp index d8472c13..f04ae24b 100644 --- a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_grad.hpp +++ b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_grad.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -18,7 +22,7 @@ namespace GauXC::detail { template void ReferenceReplicatedXCHostIntegrator:: eval_exc_grad_( int64_t m, int64_t n, const value_type* P, - int64_t ldp, value_type* EXC_GRAD ) { + int64_t ldp, value_type* EXC_GRAD, const IntegratorSettingsXC& ks_settings ) { const auto& basis = this->load_balancer_->basis(); @@ -38,7 +42,49 @@ void ReferenceReplicatedXCHostIntegrator:: // Compute Local contributions to EXC / VXC this->timer_.time_op("XCIntegrator.LocalWork", [&](){ - exc_grad_local_work_( P, ldp, EXC_GRAD ); + exc_grad_local_work_( P, ldp, nullptr, 0, EXC_GRAD, ks_settings ); + }); + + + // Reduce Results + this->timer_.time_op("XCIntegrator.Allreduce", [&](){ + + if( not this->reduction_driver_->takes_host_memory() ) + GAUXC_GENERIC_EXCEPTION("This Module Only Works With Host Reductions"); + + const int natoms = this->load_balancer_->molecule().natoms(); + this->reduction_driver_->allreduce_inplace( EXC_GRAD, 3*natoms, ReductionOp::Sum ); + }); + +} + + +template +void ReferenceReplicatedXCHostIntegrator:: + eval_exc_grad_( int64_t m, int64_t n, const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, value_type* EXC_GRAD, const IntegratorSettingsXC& ks_settings ) { + + + const auto& basis = this->load_balancer_->basis(); + + // Check that P is sane + const int64_t nbf = basis.nbf(); + if( m != n ) + GAUXC_GENERIC_EXCEPTION("P Must Be Square"); + if( m != nbf ) + GAUXC_GENERIC_EXCEPTION("P Must Have Same Dimension as Basis"); + if( ldps < nbf ) + GAUXC_GENERIC_EXCEPTION("Invalid LDPS"); + if( ldpz < nbf ) + GAUXC_GENERIC_EXCEPTION("Invalid LDPZ"); + + + // Get Tasks + this->load_balancer_->get_tasks(); + + // Compute Local contributions to EXC / VXC + this->timer_.time_op("XCIntegrator.LocalWork", [&](){ + exc_grad_local_work_( Ps, ldps, Pz, ldpz, EXC_GRAD, ks_settings ); }); @@ -56,7 +102,10 @@ void ReferenceReplicatedXCHostIntegrator:: template void ReferenceReplicatedXCHostIntegrator:: - exc_grad_local_work_( const value_type* P, int64_t ldp, value_type* EXC_GRAD ) { + exc_grad_local_work_( const value_type* Ps, int64_t ldps, const value_type* Pz, int64_t ldpz, value_type* EXC_GRAD, const IntegratorSettingsXC& settings ) { + + const bool is_uks = Pz != nullptr; + const bool is_rks = not is_uks; // Cast LWD to LocalHostWorkDriver auto* lwd = dynamic_cast(this->local_work_driver_.get()); @@ -65,11 +114,20 @@ void ReferenceReplicatedXCHostIntegrator:: const auto& func = *this->func_; const auto& basis = this->load_balancer_->basis(); const 
auto& mol = this->load_balancer_->molecule(); + const auto& molmeta = this->load_balancer_->molmeta(); // MGGA constants - const size_t mmga_dim_scal = func.is_mgga() ? 4 : 1; - const bool needs_laplacian = func.is_mgga() ? true : false; // TODO: Check for Laplacian dependence - // + const bool needs_laplacian = func.needs_laplacian(); + if(needs_laplacian and is_uks) { + GAUXC_GENERIC_EXCEPTION("UKS Gradients + Laplacian Dependent MGGAs is Not Yet Implemented"); + } + + // Misc KS settings + IntegratorSettingsEXC_GRAD exc_grad_settings; + if( auto* tmp = dynamic_cast(&settings) ) { + exc_grad_settings = *tmp; + } + // Get basis map BasisSetMap basis_map(basis,mol); @@ -90,6 +148,7 @@ void ReferenceReplicatedXCHostIntegrator:: if( not lb_state.modified_weights_are_stored ) { GAUXC_GENERIC_EXCEPTION("Weights Have Not Been Modified"); } + XCWeightAlg& weight_alg = lb_state.weight_alg; // Zero out integrands for( auto i = 0; i < 3*natoms; ++i ) { @@ -107,12 +166,14 @@ void ReferenceReplicatedXCHostIntegrator:: for( size_t iT = 0; iT < ntasks; ++iT ) { // Alias current task - const auto& task = tasks[iT]; + auto& task = tasks[iT]; // Get tasks constants const int32_t npts = task.points.size(); const int32_t nbe = task.bfn_screening.nbe; const int32_t nshells = task.bfn_screening.shell_list.size(); + const size_t spin_dim_scal = is_rks ? 1 : 2; // last case is_uks + const size_t gga_dim_scal = is_rks ? 1 : 3; const auto* points = task.points.data()->data(); const auto* weights = task.weights.data(); @@ -123,69 +184,75 @@ void ReferenceReplicatedXCHostIntegrator:: // Things that every calc needs host_data.nbe_scr .resize( nbe * nbe ); host_data.eps .resize( npts ); - host_data.vrho .resize( npts ); - host_data.den_scr .resize( 4 * npts ); + host_data.vrho .resize( spin_dim_scal * npts ); + host_data.den_scr .resize( 4 * spin_dim_scal * npts ); if( func.is_lda() ) { host_data.basis_eval .resize( 4 * npts * nbe ); - host_data.zmat .resize( npts * nbe ); + host_data.zmat .resize( spin_dim_scal * npts * nbe ); } - if( func.is_gga() ){ + if( func.is_gga() or func.is_mgga() ) { host_data.basis_eval .resize( 10 * npts * nbe ); - host_data.zmat .resize( 4 * npts * nbe ); - host_data.gamma .resize( npts ); - host_data.vgamma .resize( npts ); + host_data.zmat .resize( 4 * spin_dim_scal * npts * nbe ); + host_data.gamma .resize( gga_dim_scal * npts ); + host_data.vgamma .resize( gga_dim_scal * npts ); } -#if 0 if( func.is_mgga() ) { - host_data.basis_eval .resize( 11 * npts * nbe ); // basis + grad(3) + hess(6) + lapl - host_data.zmat .resize( 7 * npts * nbe ); // basis + grad(3) + grad(3) - host_data.mmat .resize( npts * nbe ); - host_data.gamma .resize( npts ); - host_data.vgamma .resize( npts ); - host_data.tau .resize( npts ); - host_data.vtau .resize( npts ); + host_data.tau .resize( spin_dim_scal * npts ); + host_data.vtau.resize( spin_dim_scal * npts ); if ( needs_laplacian ) { - host_data.basis_eval.resize( 24 * npts * nbe ); - host_data.lapl .resize( npts ); - host_data.vlapl .resize( npts ); + host_data.basis_eval.resize( 24 * npts * nbe ); // 11 + lapl_grad(3) + der3(10) + host_data.lapl .resize( spin_dim_scal * npts ); + host_data.vlapl.resize( spin_dim_scal * npts ); } } -#endif // Alias/Partition out scratch memory auto* basis_eval = host_data.basis_eval.data(); auto* den_eval = host_data.den_scr.data(); auto* nbe_scr = host_data.nbe_scr.data(); - auto* zmat = host_data.zmat.data(); - auto* zmat_x = zmat + npts*nbe; - auto* zmat_y = zmat_x + npts*nbe; - auto* zmat_z = zmat_y + npts*nbe; + 
double* xNmat = nullptr; + double* xNmat_x = nullptr; + double* xNmat_y = nullptr; + double* xNmat_z = nullptr; + double* xZmat = nullptr; + double* xZmat_x = nullptr; + double* xZmat_y = nullptr; + double* xZmat_z = nullptr; auto* eps = host_data.eps.data(); auto* gamma = host_data.gamma.data(); auto* vrho = host_data.vrho.data(); auto* vgamma = host_data.vgamma.data(); -#if 0 auto* tau = host_data.tau.data(); auto* lapl = host_data.lapl.data(); auto* vtau = host_data.vtau.data(); auto* vlapl = host_data.vlapl.data(); - auto* mmat_x = mmat; - auto* mmat_y = mmat_x + npts * nbe; - auto* mmat_z = mmat_y + npts * nbe; -#endif auto* dbasis_x_eval = basis_eval + npts * nbe; auto* dbasis_y_eval = dbasis_x_eval + npts * nbe; auto* dbasis_z_eval = dbasis_y_eval + npts * nbe; - auto* dden_x_eval = den_eval + npts; - auto* dden_y_eval = dden_x_eval + npts; - auto* dden_z_eval = dden_y_eval + npts; + auto* dden_x_eval = den_eval + spin_dim_scal * npts; + auto* dden_y_eval = dden_x_eval + spin_dim_scal * npts; + auto* dden_z_eval = dden_y_eval + spin_dim_scal * npts; + + + xNmat = host_data.zmat.data(); + if(func.is_lda()) { + xZmat = xNmat + npts*nbe; + } else { + xNmat_x = xNmat + npts*nbe; + xNmat_y = xNmat_x + npts*nbe; + xNmat_z = xNmat_y + npts*nbe; + xZmat = xNmat_z + npts*nbe; + xZmat_x = xZmat + npts*nbe; + xZmat_y = xZmat_x + npts*nbe; + xZmat_z = xZmat_y + npts*nbe; + } value_type* d2basis_xx_eval = nullptr; value_type* d2basis_xy_eval = nullptr; @@ -193,24 +260,23 @@ void ReferenceReplicatedXCHostIntegrator:: value_type* d2basis_yy_eval = nullptr; value_type* d2basis_yz_eval = nullptr; value_type* d2basis_zz_eval = nullptr; -#if 0 - value_type* lbasis_eval = nullptr; - value_type* d3basis_xxx_eval = nullptr; - value_type* d3basis_xxy_eval = nullptr; - value_type* d3basis_xxz_eval = nullptr; - value_type* d3basis_xyy_eval = nullptr; - value_type* d3basis_xyz_eval = nullptr; - value_type* d3basis_xzz_eval = nullptr; - value_type* d3basis_yyy_eval = nullptr; - value_type* d3basis_yyz_eval = nullptr; - value_type* d3basis_yzz_eval = nullptr; - value_type* d3basis_zzz_eval = nullptr; - value_type* dlbasis_x_eval = nullptr; - value_type* dlbasis_y_eval = nullptr; - value_type* dlbasis_z_eval = nullptr; -#endif - - if( func.is_gga() ) { + + value_type* lbasis_eval = nullptr; + value_type* d3basis_xxx_eval = nullptr; + value_type* d3basis_xxy_eval = nullptr; + value_type* d3basis_xxz_eval = nullptr; + value_type* d3basis_xyy_eval = nullptr; + value_type* d3basis_xyz_eval = nullptr; + value_type* d3basis_xzz_eval = nullptr; + value_type* d3basis_yyy_eval = nullptr; + value_type* d3basis_yyz_eval = nullptr; + value_type* d3basis_yzz_eval = nullptr; + value_type* d3basis_zzz_eval = nullptr; + value_type* dlgradbasis_x_eval = nullptr; + value_type* dlgradbasis_y_eval = nullptr; + value_type* dlgradbasis_z_eval = nullptr; + + if( func.is_gga() or func.is_mgga() ) { d2basis_xx_eval = dbasis_z_eval + npts * nbe; d2basis_xy_eval = d2basis_xx_eval + npts * nbe; d2basis_xz_eval = d2basis_xy_eval + npts * nbe; @@ -219,32 +285,24 @@ void ReferenceReplicatedXCHostIntegrator:: d2basis_zz_eval = d2basis_yz_eval + npts * nbe; } -#if 0 - if( func.is_mgga() ) { - d2basis_xx_eval = dbasis_z_eval + npts * nbe; - d2basis_xy_eval = d2basis_xx_eval + npts * nbe; - d2basis_xz_eval = d2basis_xy_eval + npts * nbe; - d2basis_yy_eval = d2basis_xz_eval + npts * nbe; - d2basis_yz_eval = d2basis_yy_eval + npts * nbe; - d2basis_zz_eval = d2basis_yz_eval + npts * nbe; - if ( true ) { - lbasis_eval = d2basis_zz_eval + npts * nbe; 
- d3basis_xxx_eval = lbasis_eval + npts * nbe; - d3basis_xxy_eval = d3basis_xxx_eval + npts * nbe; - d3basis_xxz_eval = d3basis_xxy_eval + npts * nbe; - d3basis_xyy_eval = d3basis_xxz_eval + npts * nbe; - d3basis_xyz_eval = d3basis_xyy_eval + npts * nbe; - d3basis_xzz_eval = d3basis_xyz_eval + npts * nbe; - d3basis_yyy_eval = d3basis_xzz_eval + npts * nbe; - d3basis_yyz_eval = d3basis_yyy_eval + npts * nbe; - d3basis_yzz_eval = d3basis_yyz_eval + npts * nbe; - d3basis_zzz_eval = d3basis_yzz_eval + npts * nbe; - dlbasis_x_eval = d3basis_zzz_eval + npts * nbe; - dlbasis_y_eval = dlbasis_x_eval + npts * nbe; - dlbasis_z_eval = dlbasis_y_eval + npts * nbe; - } + if( needs_laplacian ) { + lbasis_eval = d2basis_zz_eval + npts * nbe; + // TODO - this should not be needed once Gau2Grid + // can evaluate the laplacian gradients directly. + d3basis_xxx_eval = lbasis_eval + npts * nbe; + d3basis_xxy_eval = d3basis_xxx_eval + npts * nbe; + d3basis_xxz_eval = d3basis_xxy_eval + npts * nbe; + d3basis_xyy_eval = d3basis_xxz_eval + npts * nbe; + d3basis_xyz_eval = d3basis_xyy_eval + npts * nbe; + d3basis_xzz_eval = d3basis_xyz_eval + npts * nbe; + d3basis_yyy_eval = d3basis_xzz_eval + npts * nbe; + d3basis_yyz_eval = d3basis_yyy_eval + npts * nbe; + d3basis_yzz_eval = d3basis_yyz_eval + npts * nbe; + d3basis_zzz_eval = d3basis_yzz_eval + npts * nbe; + dlgradbasis_x_eval = d3basis_zzz_eval + npts * nbe; + dlgradbasis_y_eval = dlgradbasis_x_eval + npts * nbe; + dlgradbasis_z_eval = dlgradbasis_y_eval + npts * nbe; } -#endif // Get the submatrix map for batch @@ -252,84 +310,97 @@ void ReferenceReplicatedXCHostIntegrator:: gen_compressed_submat_map( basis_map, task.bfn_screening.shell_list, nbf, nbf ); // Evaluate Collocation Gradient (+ Hessian) -#if 0 - if( func.is_mgga() ) { + if( needs_laplacian ) { lwd->eval_collocation_der3( npts, nshells, nbe, points, basis, shell_list, basis_eval, dbasis_x_eval, dbasis_y_eval, dbasis_z_eval, d2basis_xx_eval, d2basis_xy_eval, d2basis_xz_eval, d2basis_yy_eval, d2basis_yz_eval, d2basis_zz_eval, d3basis_xxx_eval, d3basis_xxy_eval, d3basis_xxz_eval, d3basis_xyy_eval, d3basis_xyz_eval, d3basis_xzz_eval, d3basis_yyy_eval, d3basis_yyz_eval, d3basis_yzz_eval, d3basis_zzz_eval); - - } - else if( func.is_gga() ) -#endif - if( func.is_gga() ) + } else if( func.is_gga() or func.is_mgga() ) { lwd->eval_collocation_hessian( npts, nshells, nbe, points, basis, shell_list, basis_eval, dbasis_x_eval, dbasis_y_eval, dbasis_z_eval, d2basis_xx_eval, d2basis_xy_eval, d2basis_xz_eval, d2basis_yy_eval, d2basis_yz_eval, d2basis_zz_eval ); - else + } else { lwd->eval_collocation_gradient( npts, nshells, nbe, points, basis, shell_list, basis_eval, dbasis_x_eval, dbasis_y_eval, dbasis_z_eval ); + } // Evaluate X matrix (2 * P * B/Bx/By/Bz) -> store in Z // XXX: This assumes that bfn + gradients are contiguous in memory - if( func.is_gga() or func.is_mgga() ) { - lwd->eval_xmat( 4*npts, nbf, nbe, submat_map, 2.0, P, ldp, basis_eval, nbe, - zmat, nbe, nbe_scr ); - } else { - lwd->eval_xmat( npts, nbf, nbe, submat_map, 2.0, P, ldp, basis_eval, nbe, - zmat, nbe, nbe_scr ); + const auto xmat_fac = is_rks ? 2.0 : 1.0; + const int xmat_len = func.is_lda() ? 
1 : 4; + lwd->eval_xmat( xmat_len*npts, nbf, nbe, submat_map, xmat_fac, Ps, ldps, basis_eval, nbe, + xNmat, nbe, nbe_scr ); + if(is_uks) { + lwd->eval_xmat( xmat_len*npts, nbf, nbe, submat_map, xmat_fac, Pz, ldpz, basis_eval, nbe, + xZmat, nbe, nbe_scr ); } // Evaluate U and V variables -#if 0 if( func.is_mgga() ) { if ( needs_laplacian ) { blas::lacpy( 'A', nbe, npts, d2basis_xx_eval, nbe, lbasis_eval, nbe ); blas::axpy( nbe * npts, 1., d2basis_yy_eval, 1, lbasis_eval, 1); blas::axpy( nbe * npts, 1., d2basis_zz_eval, 1, lbasis_eval, 1); - blas::lacpy( 'A', nbe, npts, d3basis_xxx_eval, nbe, dlbasis_x_eval, nbe ); - blas::axpy( nbe * npts, 1., d3basis_xyy_eval, 1, dlbasis_x_eval, 1); - blas::axpy( nbe * npts, 1., d3basis_xzz_eval, 1, dlbasis_x_eval, 1); + // TODO - this should be done directly in Gau2Grid + blas::lacpy( 'A', nbe, npts, d3basis_xxx_eval, nbe, dlgradbasis_x_eval, nbe ); + blas::axpy( nbe * npts, 1., d3basis_xyy_eval, 1, dlgradbasis_x_eval, 1); + blas::axpy( nbe * npts, 1., d3basis_xzz_eval, 1, dlgradbasis_x_eval, 1); - blas::lacpy( 'A', nbe, npts, d3basis_xxy_eval, nbe, dlbasis_y_eval, nbe ); - blas::axpy( nbe * npts, 1., d3basis_yyy_eval, 1, dlbasis_y_eval, 1); - blas::axpy( nbe * npts, 1., d3basis_yzz_eval, 1, dlbasis_y_eval, 1); + blas::lacpy( 'A', nbe, npts, d3basis_xxy_eval, nbe, dlgradbasis_y_eval, nbe ); + blas::axpy( nbe * npts, 1., d3basis_yyy_eval, 1, dlgradbasis_y_eval, 1); + blas::axpy( nbe * npts, 1., d3basis_yzz_eval, 1, dlgradbasis_y_eval, 1); - blas::lacpy( 'A', nbe, npts, d3basis_xxz_eval, nbe, dlbasis_z_eval, nbe ); - blas::axpy( nbe * npts, 1., d3basis_yyz_eval, 1, dlbasis_z_eval, 1); - blas::axpy( nbe * npts, 1., d3basis_zzz_eval, 1, dlbasis_z_eval, 1); + blas::lacpy( 'A', nbe, npts, d3basis_xxz_eval, nbe, dlgradbasis_z_eval, nbe ); + blas::axpy( nbe * npts, 1., d3basis_yyz_eval, 1, dlgradbasis_z_eval, 1); + blas::axpy( nbe * npts, 1., d3basis_zzz_eval, 1, dlgradbasis_z_eval, 1); } - lwd->eval_uvvar_mgga_rks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, - dbasis_z_eval, lbasis_eval, zmat, nbe, mmat_x, mmat_y, mmat_z, nbe, - den_eval, dden_x_eval, dden_y_eval, dden_z_eval, - gamma, tau, lapl ); + if(is_rks) + lwd->eval_uvvar_mgga_rks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, + dbasis_z_eval, lbasis_eval, xNmat, nbe, xNmat_x, xNmat_y, xNmat_z, nbe, + den_eval, dden_x_eval, dden_y_eval, dden_z_eval, gamma, tau, lapl ); + else + lwd->eval_uvvar_mgga_uks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, + dbasis_z_eval, lbasis_eval, xNmat, nbe, xZmat, nbe, xNmat_x, xNmat_y, xNmat_z, nbe, + xZmat_x, xZmat_y, xZmat_z, nbe, + den_eval, dden_x_eval, dden_y_eval, dden_z_eval, gamma, tau, lapl ); + } else if( func.is_gga() ) { + if(is_rks) + lwd->eval_uvvar_gga_rks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, + dbasis_z_eval, xNmat, nbe, den_eval, dden_x_eval, dden_y_eval, dden_z_eval, + gamma ); + else + lwd->eval_uvvar_gga_uks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, + dbasis_z_eval, xNmat, nbe, xZmat, nbe, den_eval, dden_x_eval, dden_y_eval, + dden_z_eval, gamma ); + } else { + if(is_rks) lwd->eval_uvvar_lda_rks( npts, nbe, basis_eval, xNmat, nbe, den_eval ); + else lwd->eval_uvvar_lda_uks( npts, nbe, basis_eval, xNmat, nbe, xZmat, nbe, den_eval ); } - else if( func.is_gga() ) -#endif - if( func.is_gga() ) - lwd->eval_uvvar_gga_rks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, - dbasis_z_eval, zmat, nbe, den_eval, dden_x_eval, dden_y_eval, dden_z_eval, - gamma ); - else - lwd->eval_uvvar_lda_rks( npts, nbe, 
basis_eval, zmat, nbe, den_eval ); - + // Evaluate XC functional -#if 0 if( func.is_mgga() ) func.eval_exc_vxc( npts, den_eval, gamma, lapl, tau, eps, vrho, vgamma, vlapl, vtau ); else if(func.is_gga() ) -#endif - if( func.is_gga() ) func.eval_exc_vxc( npts, den_eval, gamma, eps, vrho, vgamma ); else func.eval_exc_vxc( npts, den_eval, eps, vrho ); + if(exc_grad_settings.include_weight_derivatives){ + // grid weight contribution to exc grad + for( int ipt = 0; ipt < npts; ++ipt ) { + const auto den = is_rks ? den_eval[ipt] : (den_eval[2*ipt] + den_eval[2*ipt+1]); + eps[ipt] *= den * weights[ipt]; + } + lwd->eval_weight_1st_deriv_contracted( weight_alg, mol, molmeta, + task, eps, EXC_GRAD); + } + // Increment EXC Gradient size_t bf_off = 0; @@ -337,97 +408,169 @@ void ReferenceReplicatedXCHostIntegrator:: const int sh_idx = shell_list[ish]; const int sh_sz = basis[sh_idx].size(); const int iAt = basis_map.shell_to_center( sh_idx ); + if(iAt == task.iParent and exc_grad_settings.include_weight_derivatives) { + bf_off += sh_sz; // Increment basis offset + continue; + } double g_acc_x(0), g_acc_y(0), g_acc_z(0); for( int ibf = 0, mu = bf_off; ibf < sh_sz; ++ibf, ++mu ) for( int ipt = 0; ipt < npts; ++ipt ) { - const int32_t mu_i = mu + ipt*nbe; - - // LDA Contributions - const double vrho_ipt = weights[ipt] * vrho[ipt]; - - const double z = zmat[mu_i]; // Z = N * B - - const double dbx = dbasis_x_eval[mu_i]; // B_x - const double dby = dbasis_y_eval[mu_i]; // B_y - const double dbz = dbasis_z_eval[mu_i]; // B_z - - g_acc_x += vrho_ipt * z * dbx; - g_acc_y += vrho_ipt * z * dby; - g_acc_z += vrho_ipt * z * dbz; - - if( func.is_gga() or func.is_mgga() ) { - // GGA Contributions - const double vgamma_ipt = weights[ipt] * vgamma[ipt]; - - const double dden_x = dden_x_eval[ipt]; - const double dden_y = dden_y_eval[ipt]; - const double dden_z = dden_z_eval[ipt]; - - const double zx = zmat_x[mu_i]; // Z_x = N * B_x - const double zy = zmat_y[mu_i]; // Z_y = N * B_y - const double zz = zmat_z[mu_i]; // Z_z = N * B_z - - const double d2bxx = d2basis_xx_eval[mu_i]; // B^2_xx - const double d2bxy = d2basis_xy_eval[mu_i]; // B^2_xy - const double d2bxz = d2basis_xz_eval[mu_i]; // B^2_xz - const double d2byy = d2basis_yy_eval[mu_i]; // B^2_yy - const double d2byz = d2basis_yz_eval[mu_i]; // B^2_yz - const double d2bzz = d2basis_zz_eval[mu_i]; // B^2_zz - - // sum_j B^2_{ij} * d_j n - double d2_term_x = d2bxx * dden_x + d2bxy * dden_y + d2bxz * dden_z; - double d2_term_y = d2bxy * dden_x + d2byy * dden_y + d2byz * dden_z; - double d2_term_z = d2bxz * dden_x + d2byz * dden_y + d2bzz * dden_z; - - // sum_j (d_j n) * Z^j - double d11_zmat_term = dden_x * zx + dden_y * zy + dden_z * zz; - - g_acc_x += 2 * vgamma_ipt * ( z * d2_term_x + dbx * d11_zmat_term ); - g_acc_y += 2 * vgamma_ipt * ( z * d2_term_y + dby * d11_zmat_term ); - g_acc_z += 2 * vgamma_ipt * ( z * d2_term_z + dbz * d11_zmat_term ); - } -#if 0 - if( func.is_mgga() ) { - - const double vtau_ipt = 0.5 * weights[ipt] * vtau[ipt]; - const double zx = zmat_x[mu_i]; // Z_x = N * B_x - const double zy = zmat_y[mu_i]; // Z_y = N * B_y - const double zz = zmat_z[mu_i]; // Z_z = N * B_z - const double d2bxx = d2basis_xx_eval[mu_i]; // B^2_xx - const double d2bxy = d2basis_xy_eval[mu_i]; // B^2_xy - const double d2bxz = d2basis_xz_eval[mu_i]; // B^2_xz - const double d2byy = d2basis_yy_eval[mu_i]; // B^2_yy - const double d2byz = d2basis_yz_eval[mu_i]; // B^2_yz - const double d2bzz = d2basis_zz_eval[mu_i]; // B^2_zz - double d2_term_x = d2bxx * zx + d2bxy * 
zy + d2bxz * zz; - double d2_term_y = d2bxy * zx + d2byy * zy + d2byz * zz; - double d2_term_z = d2bxz * zx + d2byz * zy + d2bzz * zz; - - g_acc_x += vtau_ipt * d2_term_x; - g_acc_y += vtau_ipt * d2_term_y; - g_acc_z += vtau_ipt * d2_term_z; - - if ( needs_laplacian ) { - const double vlapl_ipt = weights[ipt] * vlapl[ipt]; - const double lbf = lbasis_eval[mu_i]; - const double dlbx = dlbasis_x_eval[mu_i]; - const double dlby = dlbasis_y_eval[mu_i]; - const double dlbz = dlbasis_z_eval[mu_i]; - d2_term_x = z * dlbx + zx * lbf + 2.0*d2_term_x; - d2_term_y = z * dlby + zy * lbf + 2.0*d2_term_y; - d2_term_z = z * dlbz + zz * lbf + 2.0*d2_term_z; - - g_acc_x += vlapl_ipt * d2_term_x; - g_acc_y += vlapl_ipt * d2_term_y; - g_acc_z += vlapl_ipt * d2_term_z; - - } - - } -#endif - + const int32_t mu_i = mu + ipt*nbe; + + // LDA Contributions + // vrhop is actually vrhon for RKS + const double vrhop_ipt = weights[ipt] * vrho[spin_dim_scal * ipt]; + const double vrhom_ipt = is_uks ? weights[ipt] * vrho[spin_dim_scal * ipt + 1] : 0.0; + + const double xN = xNmat[mu_i]; // X = N * B + const double xZ = is_uks ? xZmat[mu_i] : 0.0; + + const double dbx = dbasis_x_eval[mu_i]; // B_x + const double dby = dbasis_y_eval[mu_i]; // B_y + const double dbz = dbasis_z_eval[mu_i]; // B_z + + if(is_rks) { + g_acc_x += vrhop_ipt * xN * dbx; + g_acc_y += vrhop_ipt * xN * dby; + g_acc_z += vrhop_ipt * xN * dbz; + } else { + const auto vrhon_ipt = vrhop_ipt + vrhom_ipt; + const auto vrhoz_ipt = vrhop_ipt - vrhom_ipt; + g_acc_x += 0.5 * vrhon_ipt * xN * dbx; + g_acc_y += 0.5 * vrhon_ipt * xN * dby; + g_acc_z += 0.5 * vrhon_ipt * xN * dbz; + + g_acc_x += 0.5 * vrhoz_ipt * xZ * dbx; + g_acc_y += 0.5 * vrhoz_ipt * xZ * dby; + g_acc_z += 0.5 * vrhoz_ipt * xZ * dbz; + } + + + if( func.is_gga() or func.is_mgga() ) { + // GGA Contributions + const double vgammapp_ipt = weights[ipt] * vgamma[gga_dim_scal * ipt + 0]; + const double vgammapm_ipt = is_uks ? weights[ipt] * vgamma[gga_dim_scal * ipt + 1] : 0.0; + const double vgammamm_ipt = is_uks ? weights[ipt] * vgamma[gga_dim_scal * ipt + 2] : 0.0; + + const double ddenn_x = dden_x_eval[spin_dim_scal * ipt]; + const double ddenn_y = dden_y_eval[spin_dim_scal * ipt]; + const double ddenn_z = dden_z_eval[spin_dim_scal * ipt]; + const double ddenz_x = is_uks ? dden_x_eval[spin_dim_scal * ipt + 1] : 0.0; + const double ddenz_y = is_uks ? dden_y_eval[spin_dim_scal * ipt + 1] : 0.0; + const double ddenz_z = is_uks ? dden_z_eval[spin_dim_scal * ipt + 1] : 0.0; + + const double xNx = xNmat_x[mu_i]; // XN_x = N * B_x + const double xNy = xNmat_y[mu_i]; // XN_y = N * B_y + const double xNz = xNmat_z[mu_i]; // XN_z = N * B_z + + const double xZx = is_uks ? xZmat_x[mu_i] : 0.0; + const double xZy = is_uks ? xZmat_y[mu_i] : 0.0; + const double xZz = is_uks ? 
xZmat_z[mu_i] : 0.0; + + const double d2bxx = d2basis_xx_eval[mu_i]; // B^2_xx + const double d2bxy = d2basis_xy_eval[mu_i]; // B^2_xy + const double d2bxz = d2basis_xz_eval[mu_i]; // B^2_xz + const double d2byy = d2basis_yy_eval[mu_i]; // B^2_yy + const double d2byz = d2basis_yz_eval[mu_i]; // B^2_yz + const double d2bzz = d2basis_zz_eval[mu_i]; // B^2_zz + + if(is_rks) { + // sum_j B^2_{ij} * d_j n + const auto d2_term_x = d2bxx * ddenn_x + d2bxy * ddenn_y + d2bxz * ddenn_z; + const auto d2_term_y = d2bxy * ddenn_x + d2byy * ddenn_y + d2byz * ddenn_z; + const auto d2_term_z = d2bxz * ddenn_x + d2byz * ddenn_y + d2bzz * ddenn_z; + + // sum_j (d_j n) * xN^j + const double d11_xmat_term = ddenn_x * xNx + ddenn_y * xNy + ddenn_z * xNz; + + g_acc_x += 2 * vgammapp_ipt * ( xN * d2_term_x + dbx * d11_xmat_term ); + g_acc_y += 2 * vgammapp_ipt * ( xN * d2_term_y + dby * d11_xmat_term ); + g_acc_z += 2 * vgammapp_ipt * ( xN * d2_term_z + dbz * d11_xmat_term ); + } else { + // sum_j B^2_{ij} * d_j n + const auto d2n_term_x = d2bxx * ddenn_x + d2bxy * ddenn_y + d2bxz * ddenn_z; + const auto d2n_term_y = d2bxy * ddenn_x + d2byy * ddenn_y + d2byz * ddenn_z; + const auto d2n_term_z = d2bxz * ddenn_x + d2byz * ddenn_y + d2bzz * ddenn_z; + + // sum_j B^2_{ij} * d_j m_z + const auto d2z_term_x = d2bxx * ddenz_x + d2bxy * ddenz_y + d2bxz * ddenz_z; + const auto d2z_term_y = d2bxy * ddenz_x + d2byy * ddenz_y + d2byz * ddenz_z; + const auto d2z_term_z = d2bxz * ddenz_x + d2byz * ddenz_y + d2bzz * ddenz_z; + + // sum_j (d_j n) * xN^j + const double d11nn_xmat_term = ddenn_x * xNx + ddenn_y * xNy + ddenn_z * xNz; + // sum_j (d_j n) * xZ^j + const double d11nz_xmat_term = ddenn_x * xZx + ddenn_y * xZy + ddenn_z * xZz; + // sum_j (d_j m_z) * xN^j + const double d11zn_xmat_term = ddenz_x * xNx + ddenz_y * xNy + ddenz_z * xNz; + // sum_j (d_j m_z) * xZ^j + const double d11zz_xmat_term = ddenz_x * xZx + ddenz_y * xZy + ddenz_z * xZz; + + + g_acc_x += 0.5 * (vgammapp_ipt + vgammapm_ipt + vgammamm_ipt) * (d2n_term_x * xN + d11nn_xmat_term * dbx); + g_acc_x += 0.5 * (vgammapp_ipt - vgammamm_ipt) * (d2z_term_x * xN + d11zn_xmat_term * dbx); + g_acc_x += 0.5 * (vgammapp_ipt - vgammamm_ipt) * (d2n_term_x * xZ + d11nz_xmat_term * dbx); + g_acc_x += 0.5 * (vgammapp_ipt - vgammapm_ipt + vgammamm_ipt) * (d2z_term_x * xZ + d11zz_xmat_term * dbx); + + g_acc_y += 0.5 * (vgammapp_ipt + vgammapm_ipt + vgammamm_ipt) * (d2n_term_y * xN + d11nn_xmat_term * dby); + g_acc_y += 0.5 * (vgammapp_ipt - vgammamm_ipt) * (d2z_term_y * xN + d11zn_xmat_term * dby); + g_acc_y += 0.5 * (vgammapp_ipt - vgammamm_ipt) * (d2n_term_y * xZ + d11nz_xmat_term * dby); + g_acc_y += 0.5 * (vgammapp_ipt - vgammapm_ipt + vgammamm_ipt) * (d2z_term_y * xZ + d11zz_xmat_term * dby); + + g_acc_z += 0.5 * (vgammapp_ipt + vgammapm_ipt + vgammamm_ipt) * (d2n_term_z * xN + d11nn_xmat_term * dbz); + g_acc_z += 0.5 * (vgammapp_ipt - vgammamm_ipt) * (d2z_term_z * xN + d11zn_xmat_term * dbz); + g_acc_z += 0.5 * (vgammapp_ipt - vgammamm_ipt) * (d2n_term_z * xZ + d11nz_xmat_term * dbz); + g_acc_z += 0.5 * (vgammapp_ipt - vgammapm_ipt + vgammamm_ipt) * (d2z_term_z * xZ + d11zz_xmat_term * dbz); + + } + + if( func.is_mgga() ) { + // vtaup is actually vtaun for RKS + const double vtaup_ipt = 0.5 * weights[ipt] * vtau[spin_dim_scal * ipt + 0]; + const double vtaum_ipt = is_uks ? 
0.5 * weights[ipt] * vtau[spin_dim_scal * ipt + 1] : 0.0; + + auto d2_term_x = d2bxx * xNx + d2bxy * xNy + d2bxz * xNz; + auto d2_term_y = d2bxy * xNx + d2byy * xNy + d2byz * xNz; + auto d2_term_z = d2bxz * xNx + d2byz * xNy + d2bzz * xNz; + + if(is_rks) { + g_acc_x += vtaup_ipt * d2_term_x; + g_acc_y += vtaup_ipt * d2_term_y; + g_acc_z += vtaup_ipt * d2_term_z; + } else { + const auto vtaun_ipt = vtaup_ipt + vtaum_ipt; + const auto vtauz_ipt = vtaup_ipt - vtaum_ipt; + g_acc_x += 0.5 * vtaun_ipt * d2_term_x; + g_acc_y += 0.5 * vtaun_ipt * d2_term_y; + g_acc_z += 0.5 * vtaun_ipt * d2_term_z; + + d2_term_x = d2bxx * xZx + d2bxy * xZy + d2bxz * xZz; + d2_term_y = d2bxy * xZx + d2byy * xZy + d2byz * xZz; + d2_term_z = d2bxz * xZx + d2byz * xZy + d2bzz * xZz; + + g_acc_x += 0.5 * vtauz_ipt * d2_term_x; + g_acc_y += 0.5 * vtauz_ipt * d2_term_y; + g_acc_z += 0.5 * vtauz_ipt * d2_term_z; + } + + if( needs_laplacian ) { + const double vlapl_ipt = weights[ipt] * vlapl[ipt]; + const double lbf = lbasis_eval[mu_i]; + const double dlbx = dlgradbasis_x_eval[mu_i]; + const double dlby = dlgradbasis_y_eval[mu_i]; + const double dlbz = dlgradbasis_z_eval[mu_i]; + d2_term_x = xN * dlbx + xNx * lbf + 2.0*d2_term_x; + d2_term_y = xN * dlby + xNy * lbf + 2.0*d2_term_y; + d2_term_z = xN * dlbz + xNz * lbf + 2.0*d2_term_z; + + g_acc_x += vlapl_ipt * d2_term_x; + g_acc_y += vlapl_ipt * d2_term_y; + g_acc_z += vlapl_ipt * d2_term_z; + } + } + } } // loop over bfns + grid points #pragma omp atomic @@ -437,10 +580,19 @@ void ReferenceReplicatedXCHostIntegrator:: #pragma omp atomic EXC_GRAD[3*iAt + 2] += -2 * g_acc_z; + if(exc_grad_settings.include_weight_derivatives){ + #pragma omp atomic + EXC_GRAD[3*task.iParent + 0] -= -2 * g_acc_x; + #pragma omp atomic + EXC_GRAD[3*task.iParent + 1] -= -2 * g_acc_y; + #pragma omp atomic + EXC_GRAD[3*task.iParent + 2] -= -2 * g_acc_z; + } + bf_off += sh_sz; // Increment basis offset } // End loop over shells - + } // End loop over tasks } // OpenMP Region diff --git a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_vxc.hpp b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_vxc.hpp index e62ae760..141085c9 100644 --- a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_vxc.hpp +++ b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exc_vxc.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exx.hpp b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exx.hpp index 117bbb5c..7cce12de 100644 --- a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exx.hpp +++ b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_exx.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). 
+ *
+ * (c) 2024-2025, Microsoft Corporation
+ *
+ * All rights reserved.
  *
  * See LICENSE.txt for details
  */
diff --git a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_fxc_contraction.hpp b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_fxc_contraction.hpp
new file mode 100644
index 00000000..192fe0f8
--- /dev/null
+++ b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_fxc_contraction.hpp
@@ -0,0 +1,620 @@
+/**
+ * GauXC Copyright (c) 2020-2024, The Regents of the University of California,
+ * through Lawrence Berkeley National Laboratory (subject to receipt of
+ * any required approvals from the U.S. Dept. of Energy).
+ *
+ * (c) 2024-2025, Microsoft Corporation
+ *
+ * All rights reserved.
+ *
+ * See LICENSE.txt for details
+ */
+#pragma once
+
+#include "reference_replicated_xc_host_integrator.hpp"
+#include "integrator_util/integrator_common.hpp"
+#include "host/local_host_work_driver.hpp"
+#include "host/blas.hpp"
+#include 
+
+namespace GauXC::detail {
+
+/**
+ * Generic implementation of FXC contraction for RKS/UKS/GKS
+ *
+ */
+template 
+void ReferenceReplicatedXCHostIntegrator::
+  eval_fxc_contraction_( int64_t m, int64_t n,
+                         const value_type* Ps, int64_t ldps,
+                         const value_type* Pz, int64_t ldpz,
+                         const value_type* tPs, int64_t ldtps,
+                         const value_type* tPz, int64_t ldtpz,
+                         value_type* FXCs, int64_t ldfxcs,
+                         value_type* FXCz, int64_t ldfxcz,
+                         const IntegratorSettingsXC& ks_settings ){
+
+  const auto& basis = this->load_balancer_->basis();
+
+  // Check that P / FXC are sane
+  const int64_t nbf = basis.nbf();
+  if( m != n )
+    GAUXC_GENERIC_EXCEPTION("P/FXC Must Be Square");
+  if( m != nbf )
+    GAUXC_GENERIC_EXCEPTION("P/FXC Must Have Same Dimension as Basis");
+
+  if( ldps < nbf )
+    GAUXC_GENERIC_EXCEPTION("Invalid LDPS");
+  if( ldpz and ldpz < nbf )
+    GAUXC_GENERIC_EXCEPTION("Invalid LDPZ");
+  if( ldtps and ldtps < nbf )
+    GAUXC_GENERIC_EXCEPTION("Invalid LDTPS");
+  if( ldtpz and ldtpz < nbf )
+    GAUXC_GENERIC_EXCEPTION("Invalid LDTPZ");
+  if( ldfxcs < nbf )
+    GAUXC_GENERIC_EXCEPTION("Invalid LDFXCS");
+  if( ldfxcz and ldfxcz < nbf )
+    GAUXC_GENERIC_EXCEPTION("Invalid LDFXCZ");
+
+
+  // Get Tasks
+  auto& tasks = this->load_balancer_->get_tasks();
+
+  // Temporary electron count to judge integrator accuracy
+  value_type N_EL;
+
+  // Compute Local contributions to FXC contraction
+  this->timer_.time_op("XCIntegrator.LocalWork", [&](){
+    fxc_contraction_local_work_( basis, Ps, ldps, Pz, ldpz,
+                                 tPs, ldtps, tPz, ldtpz,
+                                 FXCs, ldfxcs, FXCz, ldfxcz,
+                                 &N_EL, ks_settings,
+                                 tasks.begin(), tasks.end() );
+  });
+
+
+  // Reduce Results
+  this->timer_.time_op("XCIntegrator.Allreduce", [&](){
+
+    if( not this->reduction_driver_->takes_host_memory() )
+      GAUXC_GENERIC_EXCEPTION("This Module Only Works With Host Reductions");
+
+    this->reduction_driver_->allreduce_inplace( FXCs, nbf*nbf, ReductionOp::Sum );
+    if( FXCz ) this->reduction_driver_->allreduce_inplace( FXCz, nbf*nbf, ReductionOp::Sum );
+
+    this->reduction_driver_->allreduce_inplace( &N_EL, 1 , ReductionOp::Sum );
+
+  });
+
+
+}
+
+template 
+void ReferenceReplicatedXCHostIntegrator::
+  fxc_contraction_local_work_( const basis_type& basis, const value_type* Ps, int64_t ldps,
+                               const value_type* Pz, int64_t ldpz,
+                               const value_type* tPs, int64_t ldtps,
+                               const value_type* tPz, int64_t ldtpz,
+                               value_type* FXCs, int64_t ldfxcs,
+                               value_type* FXCz, int64_t ldfxcz,
+                               value_type *N_EL, const IntegratorSettingsXC& settings,
+                               task_iterator task_begin, task_iterator task_end ) {
+
+  const bool is_uks = Pz != nullptr;
+  const bool is_rks = not is_uks;
+
+  // Misc KS settings
+  IntegratorSettingsKS ks_settings;
+  if( auto* tmp = dynamic_cast(&settings) ) {
+    ks_settings = *tmp;
+  }
+
+  // Cast LWD to LocalHostWorkDriver
+  auto* lwd = dynamic_cast(this->local_work_driver_.get());
+
+  // Setup Aliases
+  const auto& func = *this->func_;
+  const auto& mol  = this->load_balancer_->molecule();
+
+  const bool needs_laplacian = func.needs_laplacian();
+  // Laplacian is not supported yet
+  if( needs_laplacian ) {
+    GAUXC_GENERIC_EXCEPTION("Laplacian Not Supported Yet for FXC Contraction");
+  }
+
+  // Get basis map
+  BasisSetMap basis_map(basis,mol);
+
+  const int32_t nbf = basis.nbf();
+
+  // Sort tasks on size (XXX: maybe doesn't matter?)
+  auto task_comparator = []( const XCTask& a, const XCTask& b ) {
+    return (a.points.size() * a.bfn_screening.nbe) > (b.points.size() * b.bfn_screening.nbe);
+  };
+
+  auto& tasks = this->load_balancer_->get_tasks();
+  std::sort( task_begin, task_end, task_comparator );
+
+  // Check that Partition Weights have been calculated
+  auto& lb_state = this->load_balancer_->state();
+  if( not lb_state.modified_weights_are_stored ) {
+    GAUXC_GENERIC_EXCEPTION("Weights Have Not Been Modified");
+  }
+
+
+  // Zero out integrands
+  for( auto j = 0; j < nbf; ++j )
+    for( auto i = 0; i < nbf; ++i )
+      FXCs[i + j*ldfxcs] = 0.;
+
+  if(FXCz)
+    for( auto j = 0; j < nbf; ++j )
+      for( auto i = 0; i < nbf; ++i )
+        FXCz[i + j*ldfxcz] = 0.;
+
+
+  // Use FXCs and FXCz to store FXCa and FXCb temporarily
+  value_type* FXCa = FXCs;
+  value_type* FXCb = FXCz;
+  int64_t ldfxca = ldfxcs;
+  int64_t ldfxcb = ldfxcz;
+
+  double NEL_WORK = 0.0;
+
+  // Loop over tasks
+  const size_t ntasks = std::distance(task_begin, task_end);
+
+  #pragma omp parallel
+  {
+
+  XCHostData host_data; // Thread local host data
+
+  #pragma omp for schedule(dynamic)
+  for( size_t iT = 0; iT < ntasks; ++iT ) {
+
+    //std::cout << iT << "/" << ntasks << std::endl;
+    //if(is_exc_only) printf("%lu / %lu\n", iT, ntasks);
+    // Alias current task
+    const auto& task = *(task_begin + iT);
+
+    // Get task constants
+    const int32_t npts    = task.points.size();
+    const int32_t nbe     = task.bfn_screening.nbe;
+    const int32_t nshells = task.bfn_screening.shell_list.size();
+
+    const auto* points  = task.points.data()->data();
+    const auto* weights = task.weights.data();
+    const int32_t* shell_list = task.bfn_screening.shell_list.data();
+
+    // Allocate enough memory for batch
+
+    const size_t spin_dim_scal = is_rks ? 1 : 2;
+    const size_t sds           = is_rks ? 1 : 2;
+    const size_t mgga_dim_scal = func.is_mgga() ? 4 : 1; // basis + d1basis
+    // for second derivatives
+    const size_t spin_dim_rhorho     = is_rks ? 1 : 3;
+    const size_t spin_dim_gammagamma = is_rks ? 1 : 6;
+    const size_t spin_dim_rhogamma   = is_rks ? 1 : 6;
+    const size_t spin_dim_rhotau     = is_rks ? 1 : 4;
+
+    // Things that every calc needs
+    host_data.nbe_scr .resize(nbe * nbe);
+    host_data.zmat    .resize(npts * nbe * spin_dim_scal * mgga_dim_scal);
+    host_data.vrho    .resize(npts * spin_dim_scal);
+    host_data.v2rho2  .resize(npts * spin_dim_rhorho);
+    host_data.FXC_A   .resize(npts * spin_dim_scal);
+
+    // LDA data requirements
+    if( func.is_lda() ){
+      host_data.basis_eval .resize( npts * nbe );
+      host_data.den_scr    .resize( npts * spin_dim_scal);
+      host_data.tden_scr   .resize( npts * spin_dim_scal);
+    }
+
+    // GGA data requirements
+    const size_t gga_dim_scal = is_rks ? 
1 : 3; + if( func.is_gga() ){ + host_data.basis_eval .resize( 4 * npts * nbe ); + host_data.den_scr .resize( spin_dim_scal * 4 * npts ); + host_data.tden_scr .resize( spin_dim_scal * 4 * npts ); + host_data.gamma .resize( gga_dim_scal * npts ); + host_data.vgamma .resize( gga_dim_scal * npts ); + + // second derivatives + host_data.v2rhogamma .resize(npts * spin_dim_rhogamma); + host_data.v2gamma2 .resize(npts * spin_dim_gammagamma); + host_data.FXC_B .resize(npts * 3 * spin_dim_scal); + } + + if( func.is_mgga() ){ + + host_data.den_scr .resize( spin_dim_scal * 4 * npts ); + host_data.tden_scr .resize( spin_dim_scal * 4 * npts ); + host_data.gamma .resize( gga_dim_scal * npts ); + host_data.vgamma .resize( gga_dim_scal * npts ); + host_data.tau .resize( npts * spin_dim_scal ); + host_data.vtau .resize( npts * spin_dim_scal ); + + // second derivatives + host_data.v2rhogamma .resize(npts * spin_dim_rhogamma); + host_data.v2rhotau .resize(npts * spin_dim_rhotau); + host_data.v2gamma2 .resize(npts * spin_dim_gammagamma); + host_data.v2gammatau .resize(npts * spin_dim_rhogamma); + host_data.v2tau2 .resize(npts * spin_dim_rhorho); + host_data.ttau .resize(npts * spin_dim_scal); + host_data.FXC_B .resize(npts * 3 * spin_dim_scal); + host_data.FXC_C .resize(npts * spin_dim_scal); + + if ( needs_laplacian ) { + host_data.basis_eval .resize( 11 * npts * nbe ); // basis + grad (3) + hess (6) + lapl + host_data.lapl .resize( spin_dim_scal * npts ); + host_data.vlapl .resize( spin_dim_scal * npts ); + host_data.v2lapl2 .resize(npts * spin_dim_rhorho); + host_data.v2rholapl .resize(npts * spin_dim_rhotau); + host_data.v2gammalapl.resize(npts * spin_dim_rhogamma); + host_data.v2lapltau .resize(npts * spin_dim_rhotau); + host_data.tlapl .resize(npts * spin_dim_scal); + + } else { + host_data.basis_eval .resize( 4 * npts * nbe ); // basis + grad (3) + } + } + + + // Alias/Partition out scratch memory + auto* basis_eval = host_data.basis_eval.data(); + auto* den_eval = host_data.den_scr.data(); + auto* tden_eval = host_data.tden_scr.data(); // trial density and gradient + auto* nbe_scr = host_data.nbe_scr.data(); + auto* zmat = host_data.zmat.data(); + + decltype(zmat) zmat_z = nullptr; + if(!is_rks) { + zmat_z = zmat + mgga_dim_scal * nbe * npts; + } + + auto* eps = host_data.eps.data(); + auto* gamma = host_data.gamma.data(); + auto* tau = host_data.tau.data(); + auto* lapl = host_data.lapl.data(); + auto* vrho = host_data.vrho.data(); + auto* vgamma = host_data.vgamma.data(); + auto* vtau = host_data.vtau.data(); + auto* vlapl = host_data.vlapl.data(); + + // second derivatives + auto* v2rho2 = host_data.v2rho2.data(); + auto* v2rhogamma = host_data.v2rhogamma.data(); + auto* v2gamma2 = host_data.v2gamma2.data(); + auto* v2gammatau = host_data.v2gammatau.data(); + auto* v2rhotau = host_data.v2rhotau.data(); + auto* v2lapl2 = host_data.v2lapl2.data(); + auto* v2rholapl = host_data.v2rholapl.data(); + auto* v2gammalapl= host_data.v2gammalapl.data(); + auto* v2lapltau = host_data.v2lapltau.data(); + auto* v2tau2 = host_data.v2tau2.data(); + auto* ttau = host_data.ttau.data(); + auto* tlapl = host_data.tlapl.data(); + auto* FXC_A = host_data.FXC_A.data(); + auto* FXC_B = host_data.FXC_B.data(); + auto* FXC_C = host_data.FXC_C.data(); + + + value_type* dbasis_x_eval = nullptr; + value_type* dbasis_y_eval = nullptr; + value_type* dbasis_z_eval = nullptr; + value_type* d2basis_xx_eval = nullptr; + value_type* d2basis_xy_eval = nullptr; + value_type* d2basis_xz_eval = nullptr; + value_type* d2basis_yy_eval = 
nullptr; + value_type* d2basis_yz_eval = nullptr; + value_type* d2basis_zz_eval = nullptr; + value_type* lbasis_eval = nullptr; + value_type* dden_x_eval = nullptr; + value_type* dden_y_eval = nullptr; + value_type* dden_z_eval = nullptr; + value_type* tdden_x_eval = nullptr; + value_type* tdden_y_eval = nullptr; + value_type* tdden_z_eval = nullptr; + value_type* mmat_x = nullptr; + value_type* mmat_y = nullptr; + value_type* mmat_z = nullptr; + value_type* mmat_x_z = nullptr; + value_type* mmat_y_z = nullptr; + value_type* mmat_z_z = nullptr; + + if( func.is_gga() || func.is_mgga() ) { + dbasis_x_eval = basis_eval + npts * nbe; + dbasis_y_eval = dbasis_x_eval + npts * nbe; + dbasis_z_eval = dbasis_y_eval + npts * nbe; + dden_x_eval = den_eval + spin_dim_scal * npts; + dden_y_eval = dden_x_eval + spin_dim_scal * npts; + dden_z_eval = dden_y_eval + spin_dim_scal * npts; + tdden_x_eval = tden_eval + spin_dim_scal * npts; + tdden_y_eval = tdden_x_eval+ spin_dim_scal * npts; + tdden_z_eval = tdden_y_eval+ spin_dim_scal * npts; + } + + if ( func.is_mgga() ) { + mmat_x = zmat + npts * nbe; + mmat_y = mmat_x + npts * nbe; + mmat_z = mmat_y + npts * nbe; + if ( needs_laplacian ) { + d2basis_xx_eval = dbasis_z_eval + npts * nbe; + d2basis_xy_eval = d2basis_xx_eval + npts * nbe; + d2basis_xz_eval = d2basis_xy_eval + npts * nbe; + d2basis_yy_eval = d2basis_xz_eval + npts * nbe; + d2basis_yz_eval = d2basis_yy_eval + npts * nbe; + d2basis_zz_eval = d2basis_yz_eval + npts * nbe; + lbasis_eval = d2basis_zz_eval + npts * nbe; + } + if(is_uks) { + mmat_x_z = zmat_z + npts * nbe; + mmat_y_z = mmat_x_z + npts * nbe; + mmat_z_z = mmat_y_z + npts * nbe; + } + } + + + // Get the submatrix map for batch + std::vector< std::array > submat_map; + std::tie(submat_map, std::ignore) = + gen_compressed_submat_map(basis_map, task.bfn_screening.shell_list, nbf, nbf); + + // Evaluate Collocation (+ Grad and Hessian) + if( func.is_mgga() ) { + if ( needs_laplacian ) { + // TODO: Modify gau2grid to compute Laplacian instead of full hessian + lwd->eval_collocation_hessian( npts, nshells, nbe, points, basis, shell_list, + basis_eval, dbasis_x_eval, dbasis_y_eval, dbasis_z_eval, d2basis_xx_eval, + d2basis_xy_eval, d2basis_xz_eval, d2basis_yy_eval, d2basis_yz_eval, + d2basis_zz_eval); + blas::lacpy( 'A', nbe, npts, d2basis_xx_eval, nbe, lbasis_eval, nbe ); + blas::axpy( nbe * npts, 1., d2basis_yy_eval, 1, lbasis_eval, 1); + blas::axpy( nbe * npts, 1., d2basis_zz_eval, 1, lbasis_eval, 1); + } else { + lwd->eval_collocation_gradient( npts, nshells, nbe, points, basis, shell_list, + basis_eval, dbasis_x_eval, dbasis_y_eval, dbasis_z_eval ); + } + } + // Evaluate Collocation (+ Grad) + else if( func.is_gga() ) + lwd->eval_collocation_gradient( npts, nshells, nbe, points, basis, shell_list, + basis_eval, dbasis_x_eval, dbasis_y_eval, dbasis_z_eval ); + else + lwd->eval_collocation( npts, nshells, nbe, points, basis, shell_list, + basis_eval ); + + + // Evaluate X matrix (fac * P * B) -> store in Z + const auto xmat_fac = is_rks ? 
2.0 : 1.0; // TODO Fix for spinor RKS input + lwd->eval_xmat( mgga_dim_scal * npts, nbf, nbe, submat_map, xmat_fac, Ps, ldps, basis_eval, nbe, + zmat, nbe, nbe_scr ); + // X matrix for Pz + if(not is_rks) { + lwd->eval_xmat( mgga_dim_scal * npts, nbf, nbe, submat_map, 1.0, Pz, ldpz, basis_eval, nbe, + zmat_z, nbe, nbe_scr); + } + + // Evaluate U and V variables + if( func.is_mgga() ) { + if (is_rks) { + lwd->eval_uvvar_mgga_rks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, + dbasis_z_eval, lbasis_eval, zmat, nbe, mmat_x, mmat_y, mmat_z, + nbe, den_eval, dden_x_eval, dden_y_eval, dden_z_eval, gamma, tau, lapl); + } else if (is_uks) { + lwd->eval_uvvar_mgga_uks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, + dbasis_z_eval, lbasis_eval, zmat, nbe, zmat_z, nbe, + mmat_x, mmat_y, mmat_z, nbe, mmat_x_z, mmat_y_z, mmat_z_z, nbe, + den_eval, dden_x_eval, dden_y_eval, dden_z_eval, gamma, tau, lapl); + } + } else if ( func.is_gga() ) { + if(is_rks) { + lwd->eval_uvvar_gga_rks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, + dbasis_z_eval, zmat, nbe, den_eval, dden_x_eval, dden_y_eval, dden_z_eval, + gamma ); + } else if(is_uks) { + lwd->eval_uvvar_gga_uks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, + dbasis_z_eval, zmat, nbe, zmat_z, nbe, den_eval, dden_x_eval, + dden_y_eval, dden_z_eval, gamma ); + } + + } else { + if(is_rks) { + lwd->eval_uvvar_lda_rks( npts, nbe, basis_eval, zmat, nbe, den_eval ); + } else if(is_uks) { + lwd->eval_uvvar_lda_uks( npts, nbe, basis_eval, zmat, nbe, zmat_z, nbe, + den_eval ); + } + } + + // Evaluate XC functional + if( func.is_mgga() ) + func.eval_vxc_fxc( npts, den_eval, gamma, lapl, tau, vrho, vgamma, vlapl, vtau, + v2rho2, v2rhogamma, v2rholapl, v2rhotau, v2gamma2, + v2gammalapl, v2gammatau, v2lapl2, v2lapltau, v2tau2); + else if( func.is_gga() ) + func.eval_vxc_fxc( npts, den_eval, gamma, vrho, vgamma, v2rho2, v2rhogamma, v2gamma2 ); + else + func.eval_vxc_fxc( npts, den_eval, vrho, v2rho2 ); + + //calculate the trial density variables + // Evaluate X matrix (fac * tP * B) -> store in Z + lwd->eval_xmat( mgga_dim_scal * npts, nbf, nbe, submat_map, xmat_fac, tPs, ldps, basis_eval, nbe, + zmat, nbe, nbe_scr ); + // X matrix for tPz + if(not is_rks) { + lwd->eval_xmat( mgga_dim_scal * npts, nbf, nbe, submat_map, 1.0, tPz, ldpz, basis_eval, nbe, + zmat_z, nbe, nbe_scr); + } + // Evaluate U and V trial variables + if( func.is_mgga() ) { + if (is_rks) { + lwd->eval_uvvar_mgga_rks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, + dbasis_z_eval, lbasis_eval, zmat, nbe, mmat_x, mmat_y, mmat_z, + nbe, tden_eval, tdden_x_eval, tdden_y_eval, tdden_z_eval, gamma, ttau, tlapl); + lwd->eval_tmat_mgga_vxc_rks( npts, vgamma, v2rho2, v2rhogamma, v2rholapl, v2rhotau, v2gamma2, + v2gammalapl, v2gammatau, v2lapl2, v2lapltau, v2tau2, tden_eval, tdden_x_eval, + tdden_y_eval, tdden_z_eval, ttau, dden_x_eval, dden_y_eval, dden_z_eval, FXC_A, FXC_B, FXC_C ); + } else if (is_uks) { + // tgamma is not needed since it has different definitions than gamma + // gamma = nabla rho * nabla rho, but tgamma = nabla trho * nabla rho, not both trho + lwd->eval_uvvar_mgga_uks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, + dbasis_z_eval, lbasis_eval, zmat, nbe, zmat_z, nbe, + mmat_x, mmat_y, mmat_z, nbe, mmat_x_z, mmat_y_z, mmat_z_z, nbe, + tden_eval, tdden_x_eval, tdden_y_eval, tdden_z_eval, gamma, ttau, tlapl); + lwd->eval_tmat_mgga_vxc_uks( npts, vgamma, v2rho2, v2rhogamma, v2rholapl, v2rhotau, v2gamma2, + v2gammalapl, v2gammatau, v2lapl2, v2lapltau, 
v2tau2, tden_eval, tdden_x_eval, + tdden_y_eval, tdden_z_eval, ttau, dden_x_eval, dden_y_eval, dden_z_eval, FXC_A, FXC_B, FXC_C ); + } + } else if ( func.is_gga() ) { + if(is_rks) { + lwd->eval_uvvar_gga_rks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, + dbasis_z_eval, zmat, nbe, tden_eval, tdden_x_eval, tdden_y_eval, tdden_z_eval, + gamma ); + lwd->eval_tmat_gga_vxc_rks( npts, vgamma, v2rho2, v2rhogamma, v2gamma2, tden_eval, tdden_x_eval, + tdden_y_eval, tdden_z_eval, dden_x_eval, dden_y_eval, dden_z_eval, FXC_A, FXC_B ); + } else if(is_uks) { + // tgamma is not needed since it has quite different definitions than gamma + lwd->eval_uvvar_gga_uks( npts, nbe, basis_eval, dbasis_x_eval, dbasis_y_eval, + dbasis_z_eval, zmat, nbe, zmat_z, nbe, tden_eval, tdden_x_eval, + tdden_y_eval, tdden_z_eval, gamma ); + lwd->eval_tmat_gga_vxc_uks( npts, vgamma, v2rho2, v2rhogamma, v2gamma2, tden_eval, tdden_x_eval, + tdden_y_eval, tdden_z_eval, dden_x_eval, dden_y_eval, dden_z_eval, FXC_A, FXC_B ); + } + } else { + // LDA + if(is_rks) { + lwd->eval_uvvar_lda_rks( npts, nbe, basis_eval, zmat, nbe, tden_eval ); + lwd->eval_tmat_lda_vxc_rks( npts, v2rho2, tden_eval, FXC_A); + } else if(is_uks) { + lwd->eval_uvvar_lda_uks( npts, nbe, basis_eval, zmat, nbe, zmat_z, nbe, + tden_eval ); + lwd->eval_tmat_lda_vxc_uks( npts, v2rho2, tden_eval, FXC_A); + } + } + + // Factor weights into XC results + for( int32_t i = 0; i < npts; ++i ) { + FXC_A[sds*i] *= weights[i]; + if(not is_rks) FXC_A[sds*i+1] *= weights[i]; + } + if( func.is_gga() || func.is_mgga()){ + for( int32_t i = 0; i < npts; ++i ) { + FXC_B[3*sds*i] *= weights[i]; + FXC_B[3*sds*i+1] *= weights[i]; + FXC_B[3*sds*i+2] *= weights[i]; + if(not is_rks) { + FXC_B[3*sds*i+3] *= weights[i]; + FXC_B[3*sds*i+4] *= weights[i]; + FXC_B[3*sds*i+5] *= weights[i]; + } + } + } + if( func.is_mgga() ){ + for( int32_t i = 0; i < npts; ++i) { + FXC_C[sds*i] *= weights[i]; + if(not is_rks) FXC_C[sds*i+1] *= weights[i]; + } + } + + // Scalar integrations + double NEL_local = 0.0; + for( int32_t i = 0; i < npts; ++i ) { + const auto den = is_rks ? 
den_eval[i] : (den_eval[2*i] + den_eval[2*i+1]);
+      NEL_local += weights[i] * den;
+    }
+
+
+    // Atomic updates
+    #pragma omp atomic
+    NEL_WORK += NEL_local;
+    // Evaluate Z matrix for VXC
+    if( func.is_mgga() ) {
+      if(is_rks) {
+        // Since the Laplacian is not supported, mGGA uses the same operation as GGA
+        lwd->eval_zmat_gga_vxc_rks_ts( npts, nbe, FXC_A, FXC_B, basis_eval, dbasis_x_eval,
+          dbasis_y_eval, dbasis_z_eval, zmat, nbe);
+        lwd->eval_mmat_mgga_vxc_rks( npts, nbe, FXC_C, vlapl, dbasis_x_eval, dbasis_y_eval, dbasis_z_eval,
+          mmat_x, mmat_y, mmat_z, nbe);
+      } else if (is_uks) {
+        // Since the Laplacian is not supported, mGGA uses the same operation as GGA
+        lwd->eval_zmat_gga_vxc_uks_ts( npts, nbe, FXC_A, FXC_B, basis_eval, dbasis_x_eval,
+          dbasis_y_eval, dbasis_z_eval, zmat, nbe, zmat_z, nbe);
+        lwd->eval_mmat_mgga_vxc_uks_ts( npts, nbe, FXC_C, vlapl, dbasis_x_eval, dbasis_y_eval, dbasis_z_eval,
+          mmat_x, mmat_y, mmat_z, nbe, mmat_x_z, mmat_y_z, mmat_z_z, nbe);
+      }
+    }
+    else if( func.is_gga() ) {
+      if(is_rks) {
+        lwd->eval_zmat_gga_vxc_rks_ts( npts, nbe, FXC_A, FXC_B, basis_eval, dbasis_x_eval,
+          dbasis_y_eval, dbasis_z_eval, zmat, nbe);
+      } else if(is_uks) {
+        lwd->eval_zmat_gga_vxc_uks_ts( npts, nbe, FXC_A, FXC_B, basis_eval, dbasis_x_eval,
+          dbasis_y_eval, dbasis_z_eval, zmat, nbe, zmat_z, nbe);
+      }
+
+    } else {
+      if(is_rks) {
+        lwd->eval_zmat_lda_vxc_rks( npts, nbe, FXC_A, basis_eval, zmat, nbe );
+      } else if(is_uks) {
+        lwd->eval_zmat_lda_vxc_uks_ts( npts, nbe, FXC_A, basis_eval, zmat, nbe, zmat_z, nbe );
+      }
+    }
+
+    // Increment LT of VXC
+    {
+
+      // Increment VXC
+      lwd->inc_vxc( mgga_dim_scal * npts, nbf, nbe, basis_eval, submat_map, zmat, nbe, FXCa, ldfxca, nbe_scr );
+      if( not is_rks )
+        lwd->inc_vxc( mgga_dim_scal * npts, nbf, nbe, basis_eval, submat_map, zmat_z, nbe, FXCb, ldfxcb, nbe_scr);
+    }
+
+  } // Loop over tasks
+
+  } // End OpenMP region
+
+
+  // Set scalar return values
+  *N_EL = NEL_WORK;
+
+  // Symmetrize VXC
+  for( int32_t j = 0; j < nbf; ++j )
+    for( int32_t i = j+1; i < nbf; ++i )
+      FXCa[ j + i*ldfxca ] = FXCa[ i + j*ldfxca ];
+
+  if ( FXCz )
+    for( int32_t j = 0; j < nbf; ++j )
+      for( int32_t i = j+1; i < nbf; ++i )
+        FXCb[ j + i*ldfxcb ] = FXCb[ i + j*ldfxcb ];
+
+  if( FXCz )
+    // now convert to the final form of FXCs and FXCz
+    for ( int32_t j = 0; j < nbf; ++j )
+      for( int32_t i = 0; i < nbf; ++i ) {
+        value_type tmp_a = FXCa[ i + j*ldfxca ];
+        value_type tmp_b = FXCb[ i + j*ldfxcb ];
+        FXCs[ i + j*ldfxcs ] = 0.5 * ( tmp_a + tmp_b );
+        FXCz[ i + j*ldfxcz ] = 0.5 * ( tmp_a - tmp_b );
+      }
+
+}
+
+
+  /// RKS FXC contraction
+template 
+void ReferenceReplicatedXCHostIntegrator::
+eval_fxc_contraction_( int64_t m, int64_t n,
+                       const value_type* P, int64_t ldp,
+                       const value_type* tP, int64_t ldtp,
+                       value_type* FXC, int64_t ldfxc,
+                       const IntegratorSettingsXC& ks_settings ){
+
+  eval_fxc_contraction_( m, n, P, ldp, nullptr, 0, tP, ldtp, nullptr, 0,
+                         FXC, ldfxc, nullptr, 0, ks_settings );
+}
+
+
+
+} // namespace GauXC::detail
diff --git a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_integrate_den.hpp b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_integrate_den.hpp
index d327a4ea..e0ad145f 100644
--- a/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_integrate_den.hpp
+++ b/src/xc_integrator/replicated/host/reference_replicated_xc_host_integrator_integrate_den.hpp
@@ -1,7 +1,11 @@
 /**
  * GauXC Copyright (c) 2020-2024, The Regents of the University of California,
  * 
through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/replicated/host/replicated_xc_host_integrator.cxx b/src/xc_integrator/replicated/host/replicated_xc_host_integrator.cxx index 4fd53aef..72ef87b8 100644 --- a/src/xc_integrator/replicated/host/replicated_xc_host_integrator.cxx +++ b/src/xc_integrator/replicated/host/replicated_xc_host_integrator.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/replicated/host/shell_batched_replicated_xc_host_integrator.cxx b/src/xc_integrator/replicated/host/shell_batched_replicated_xc_host_integrator.cxx index 4bdd2c66..c972d30a 100644 --- a/src/xc_integrator/replicated/host/shell_batched_replicated_xc_host_integrator.cxx +++ b/src/xc_integrator/replicated/host/shell_batched_replicated_xc_host_integrator.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -11,6 +15,9 @@ #include "shell_batched_replicated_xc_integrator_exc_vxc.hpp" #include "shell_batched_replicated_xc_integrator_exc_grad.hpp" #include "shell_batched_replicated_xc_integrator_exx.hpp" +#include "shell_batched_replicated_xc_integrator_fxc_contraction.hpp" +#include "shell_batched_replicated_xc_integrator_dd_psi.hpp" +#include "shell_batched_replicated_xc_integrator_dd_psi_potential.hpp" namespace GauXC { namespace detail { diff --git a/src/xc_integrator/replicated/host/shell_batched_replicated_xc_host_integrator.hpp b/src/xc_integrator/replicated/host/shell_batched_replicated_xc_host_integrator.hpp index 3c3db085..a8f1f488 100644 --- a/src/xc_integrator/replicated/host/shell_batched_replicated_xc_host_integrator.hpp +++ b/src/xc_integrator/replicated/host/shell_batched_replicated_xc_host_integrator.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/replicated/host/xc_host_data.hpp b/src/xc_integrator/replicated/host/xc_host_data.hpp index 5649d523..1c7fc9a2 100644 --- a/src/xc_integrator/replicated/host/xc_host_data.hpp +++ b/src/xc_integrator/replicated/host/xc_host_data.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -30,6 +34,27 @@ struct XCHostData { std::vector nbe_scr; std::vector den_scr; std::vector basis_eval; + + // Second order derivatives + std::vector v2rho2; + std::vector v2rhogamma; + std::vector v2rholapl; + std::vector v2rhotau; + std::vector v2gamma2; + std::vector v2gammalapl; + std::vector v2gammatau; + std::vector v2lapl2; + std::vector v2lapltau; + std::vector v2tau2; + + // For Fxc contraction + std::vector FXC_A; + std::vector FXC_B; + std::vector FXC_C; + std::vector tden_scr; + std::vector ttau; + std::vector tlapl; + inline XCHostData() {} diff --git a/src/xc_integrator/replicated/replicated_xc_integrator_impl.cxx b/src/xc_integrator/replicated/replicated_xc_integrator_impl.cxx index b1d50523..071afe31 100644 --- a/src/xc_integrator/replicated/replicated_xc_integrator_impl.cxx +++ b/src/xc_integrator/replicated/replicated_xc_integrator_impl.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -120,9 +124,19 @@ void ReplicatedXCIntegratorImpl:: template void ReplicatedXCIntegratorImpl:: eval_exc_grad( int64_t m, int64_t n, const value_type* P, - int64_t ldp, value_type* EXC_GRAD ) { + int64_t ldp, value_type* EXC_GRAD, const IntegratorSettingsXC& ks_settings ) { - eval_exc_grad_(m,n,P,ldp,EXC_GRAD); + eval_exc_grad_(m,n,P,ldp,EXC_GRAD, ks_settings); + +} + + +template +void ReplicatedXCIntegratorImpl:: + eval_exc_grad( int64_t m, int64_t n, const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, value_type* EXC_GRAD, const IntegratorSettingsXC& ks_settings ) { + + eval_exc_grad_(m,n,Ps,ldps,Pz,ldpz,EXC_GRAD, ks_settings); } @@ -136,6 +150,66 @@ void ReplicatedXCIntegratorImpl:: } +template +void ReplicatedXCIntegratorImpl:: +eval_fxc_contraction( int64_t m, int64_t n, const value_type* P, + int64_t ldp, + const value_type* tP, int64_t ldtp, + value_type* FXC, int64_t ldfxc, + const IntegratorSettingsXC& ks_settings ) { + + // For RKS, we can reuse the UKS implementation with Pz=0, tPz=0 + // Create temporary buffers to store the z-component results + std::vector temp_fxcz(m * n, 0.0); + value_type* FXCz = temp_fxcz.data(); + int64_t ldfxcz = m; + + eval_fxc_contraction_(m, n, P, ldp, + tP, ldtp, + FXC, ldfxc, + ks_settings); + +} + +template +void ReplicatedXCIntegratorImpl:: +eval_fxc_contraction( int64_t m, int64_t n, const value_type* Ps, + int64_t ldps, + const value_type* Pz, int64_t ldpz, + const value_type* tPs, int64_t ldtps, + const value_type* tPz, int64_t ldtpz, + value_type* FXCs, int64_t ldfxcs, + value_type* FXCz, int64_t ldfxcz, + const IntegratorSettingsXC& ks_settings ) { + + eval_fxc_contraction_(m,n,Ps,ldps, + Pz,ldpz, + tPs,ldtps, + tPz,ldtpz, + FXCs,ldfxcs, + FXCz,ldfxcz, + ks_settings); + +} + +template +void ReplicatedXCIntegratorImpl:: + eval_dd_psi( int64_t m, int64_t n, const value_type* P, + int64_t ldp, unsigned max_Ylm, value_type* ddPsi, int64_t ldPsi ) { + + eval_dd_psi_(m, n, P, ldp, max_Ylm, ddPsi, ldPsi); + +} + +template +void ReplicatedXCIntegratorImpl:: + eval_dd_psi_potential( int64_t m, int64_t n, const value_type* X, unsigned max_Ylm, value_type* Vddx) { + + eval_dd_psi_potential_(m, n, X, max_Ylm, Vddx); + +} + + template class ReplicatedXCIntegratorImpl; } diff --git a/src/xc_integrator/shell_batched/CMakeLists.txt b/src/xc_integrator/shell_batched/CMakeLists.txt index 636666c4..771124a4 100644 --- a/src/xc_integrator/shell_batched/CMakeLists.txt +++ b/src/xc_integrator/shell_batched/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator.hpp b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator.hpp index c6201a73..5c1d4a94 100644 --- a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator.hpp +++ b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). 
All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -83,15 +87,40 @@ class ShellBatchedReplicatedXCIntegrator : /// RKS EXC Gradient - void eval_exc_grad_( int64_t m, int64_t n, const value_type* P, - int64_t ldp, value_type* EXC_GRAD ) override; + void eval_exc_grad_( int64_t m, int64_t n, const value_type* P, int64_t ldp, + value_type* EXC_GRAD, const IntegratorSettingsXC& settings ) override; + /// UKS EXC Gradient + void eval_exc_grad_( int64_t m, int64_t n, const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, value_type* EXC_GRAD, const IntegratorSettingsXC& settings ) override; /// sn-LinK void eval_exx_( int64_t m, int64_t n, const value_type* P, int64_t ldp, value_type* K, int64_t ldk, const IntegratorSettingsEXX& settings ) override; - + // RKS FXC contraction + void eval_fxc_contraction_( int64_t m, int64_t n, + const value_type* P, int64_t ldp, + const value_type* tP, int64_t ldtp, + value_type* FXC, int64_t ldfxc, + const IntegratorSettingsXC& ks_settings ) override; + + // UKS FXC contraction + void eval_fxc_contraction_( int64_t m, int64_t n, + const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, + const value_type* tPs, int64_t ldtps, + const value_type* tPz, int64_t ldtpz, + value_type* FXCs, int64_t ldfxcs, + value_type* FXCz, int64_t ldfxcz, + const IntegratorSettingsXC& ks_settings ) override; + + /// ddX PSi + void eval_dd_psi_( int64_t m, int64_t n, const value_type* P, + int64_t ldp, unsigned max_Ylm, value_type* ddPsi, int64_t ldPsi ) override; + + /// ddX PhiX + void eval_dd_psi_potential_( int64_t m, int64_t n, const value_type* X, unsigned max_Ylm, value_type* Vddx ) override; // Implementation details of exc_vxc (for RKS/UKS/GKS deduced from input character) diff --git a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_dd_psi.hpp b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_dd_psi.hpp new file mode 100644 index 00000000..689e16a7 --- /dev/null +++ b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_dd_psi.hpp @@ -0,0 +1,30 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
+ * + * See LICENSE.txt for details + */ +#pragma once +#include "shell_batched_replicated_xc_integrator.hpp" +#include +#include + +namespace GauXC { +namespace detail { + +template +void ShellBatchedReplicatedXCIntegrator:: + eval_dd_psi_( int64_t m, int64_t n, const value_type* P, + int64_t ldp, unsigned max_Ylm, + value_type* ddPsi, int64_t ldPsi ) { + GAUXC_GENERIC_EXCEPTION("ShellBatched DD-PSI NYI"); + util::unused(m,n,P,ldp, max_Ylm, ddPsi,ldPsi); +} + +} +} diff --git a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_dd_psi_potential.hpp b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_dd_psi_potential.hpp new file mode 100644 index 00000000..639508b2 --- /dev/null +++ b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_dd_psi_potential.hpp @@ -0,0 +1,28 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. + * + * See LICENSE.txt for details + */ +#pragma once +#include "shell_batched_replicated_xc_integrator.hpp" +#include +#include + +namespace GauXC { +namespace detail { + +template +void ShellBatchedReplicatedXCIntegrator:: + eval_dd_psi_potential_( int64_t m, int64_t n, const value_type* X, unsigned max_Ylm, value_type* Vddx ) { + GAUXC_GENERIC_EXCEPTION("ShellBatched DD-PSI-DERIV NYI"); + util::unused(m,n,X,max_Ylm, Vddx); +} + +} +} diff --git a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc.hpp b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc.hpp index 4635336a..2a5565c9 100644 --- a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc.hpp +++ b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc_grad.hpp b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc_grad.hpp index dde98bdd..f329bc02 100644 --- a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc_grad.hpp +++ b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc_grad.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -15,12 +19,20 @@ namespace detail { template void ShellBatchedReplicatedXCIntegrator:: - eval_exc_grad_( int64_t m, int64_t n, const value_type* P, - int64_t ldp, value_type* EXC_GRAD ) { + eval_exc_grad_( int64_t m, int64_t n, const value_type* P, int64_t ldp, value_type* EXC_GRAD, const IntegratorSettingsXC& settings ) { GAUXC_GENERIC_EXCEPTION("ShellBatched exc_grad NYI" ); util::unused(m,n,P,ldp,EXC_GRAD); } +template +void ShellBatchedReplicatedXCIntegrator:: + eval_exc_grad_( int64_t m, int64_t n, const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t lpdz, value_type* EXC_GRAD, const IntegratorSettingsXC& settings ) { + + GAUXC_GENERIC_EXCEPTION("ShellBatched exc_grad NYI" ); + util::unused(m,n,Ps,ldps,Pz,lpdz,EXC_GRAD); +} + } } diff --git a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc_vxc.hpp b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc_vxc.hpp index 5a65be8e..3dd43f4d 100644 --- a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc_vxc.hpp +++ b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exc_vxc.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exx.hpp b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exx.hpp index 0db24197..e6e90f8d 100644 --- a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exx.hpp +++ b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_exx.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_fxc_contraction.hpp b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_fxc_contraction.hpp new file mode 100644 index 00000000..289de960 --- /dev/null +++ b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_fxc_contraction.hpp @@ -0,0 +1,50 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
+ * + * See LICENSE.txt for details + */ +#pragma once +#include "shell_batched_replicated_xc_integrator.hpp" +#include +#include + +namespace GauXC { +namespace detail { + +template +void ShellBatchedReplicatedXCIntegrator:: + eval_fxc_contraction_( int64_t m, int64_t n, + const value_type* P, int64_t ldp, + const value_type* tP, int64_t ldtp, + value_type* FXC, int64_t ldfxc, + const IntegratorSettingsXC& ks_settings ) { + GAUXC_GENERIC_EXCEPTION("ShellBatched FXC contraction NYI"); + util::unused(m,n,P,ldp,tP,ldtp,FXC,ldfxc,ks_settings); + +} + +template +void ShellBatchedReplicatedXCIntegrator:: + eval_fxc_contraction_( int64_t m, int64_t n, + const value_type* Ps, int64_t ldps, + const value_type* Pz, int64_t ldpz, + const value_type* tPs, int64_t ldtps, + const value_type* tPz, int64_t ldtpz, + value_type* FXCs, int64_t ldfxcs, + value_type* FXCz, int64_t ldfxcz, + const IntegratorSettingsXC& ks_settings ) { + GAUXC_GENERIC_EXCEPTION("ShellBatched FXC contraction NYI"); + util::unused(m,n,Ps,ldps,Pz,ldpz,tPs,ldtps,tPz,ldtpz, + FXCs,ldfxcs,FXCz,ldfxcz); + +} + + +} +} diff --git a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_integrate_den.hpp b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_integrate_den.hpp index ce9194d1..e0a24504 100644 --- a/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_integrate_den.hpp +++ b/src/xc_integrator/shell_batched/shell_batched_replicated_xc_integrator_integrate_den.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/shell_batched/shell_batched_xc_integrator.cxx b/src/xc_integrator/shell_batched/shell_batched_xc_integrator.cxx index 314f0027..4d5a3156 100644 --- a/src/xc_integrator/shell_batched/shell_batched_xc_integrator.cxx +++ b/src/xc_integrator/shell_batched/shell_batched_xc_integrator.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/shell_batched/shell_batched_xc_integrator.hpp b/src/xc_integrator/shell_batched/shell_batched_xc_integrator.hpp index 1d04169d..c528e067 100644 --- a/src/xc_integrator/shell_batched/shell_batched_xc_integrator.hpp +++ b/src/xc_integrator/shell_batched/shell_batched_xc_integrator.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/xc_data/CMakeLists.txt b/src/xc_integrator/xc_data/CMakeLists.txt index 711dac33..f06826e1 100644 --- a/src/xc_integrator/xc_data/CMakeLists.txt +++ b/src/xc_integrator/xc_data/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/xc_integrator/xc_data/buffer_adaptor.hpp b/src/xc_integrator/xc_data/buffer_adaptor.hpp index 49179886..741aaaec 100644 --- a/src/xc_integrator/xc_data/buffer_adaptor.hpp +++ b/src/xc_integrator/xc_data/buffer_adaptor.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/src/xc_integrator/xc_data/device/CMakeLists.txt b/src/xc_integrator/xc_data/device/CMakeLists.txt index d2c79570..571a7cf6 100644 --- a/src/xc_integrator/xc_data/device/CMakeLists.txt +++ b/src/xc_integrator/xc_data/device/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/src/xc_integrator/xc_data/device/xc_device_aos_data.cxx b/src/xc_integrator/xc_data/device/xc_device_aos_data.cxx index af985115..2e043842 100644 --- a/src/xc_integrator/xc_data/device/xc_device_aos_data.cxx +++ b/src/xc_integrator/xc_data/device/xc_device_aos_data.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -51,10 +55,11 @@ size_t XCDeviceAoSData::get_mem_req( integrator_term_tracker terms, return base_size + // Collocation + Derivatives - reqt.task_bfn_size ( nbe_bfn, npts ) * sizeof(double) + - reqt.task_bfn_grad_size( nbe_bfn, npts ) * sizeof(double) + - reqt.task_bfn_hess_size( nbe_bfn, npts ) * sizeof(double) + - reqt.task_bfn_lapl_size( nbe_bfn, npts ) * sizeof(double) + + reqt.task_bfn_size ( nbe_bfn, npts ) * sizeof(double) + + reqt.task_bfn_grad_size( nbe_bfn, npts ) * sizeof(double) + + reqt.task_bfn_hess_size( nbe_bfn, npts ) * sizeof(double) + + reqt.task_bfn_lapl_size( nbe_bfn, npts ) * sizeof(double) + + reqt.task_bfn_lapgrad_size( nbe_bfn, npts ) * sizeof(double) + // LDA/GGA Z Matrix reqt.task_zmat_size( nbe_bfn, npts ) * sizeof(double) + @@ -62,6 +67,9 @@ size_t XCDeviceAoSData::get_mem_req( integrator_term_tracker terms, // X Matrix Gradient reqt.task_xmat_grad_size( nbe_bfn, npts ) * sizeof(double) + + // Persistent X Mat + reqt.task_xmat_persist_size( nbe_bfn, npts ) * sizeof(double) + + // EXX Intermediates reqt.task_fmat_size( nbe_cou, npts ) * sizeof(double) + reqt.task_gmat_size( nbe_cou, npts ) * sizeof(double) + @@ -191,6 +199,12 @@ XCDeviceAoSData::device_buffer_t XCDeviceAoSData::allocate_dynamic_stack( aos_stack.d2bf_lapl_eval_device = mem.aligned_alloc( bfn_msz, csl ); } + if(reqt.task_bfn_lapgrad) { + aos_stack.d3bf_lapgrad_x_eval_device = mem.aligned_alloc( bfn_msz, csl ); + aos_stack.d3bf_lapgrad_y_eval_device = mem.aligned_alloc( bfn_msz, csl ); + aos_stack.d3bf_lapgrad_z_eval_device = mem.aligned_alloc( bfn_msz, csl ); + } + // VXC Z Matrix if(reqt.task_zmat) { aos_stack.zmat_vxc_device = @@ -203,6 +217,20 @@ XCDeviceAoSData::device_buffer_t XCDeviceAoSData::allocate_dynamic_stack( aos_stack.xmat_dz_device = mem.aligned_alloc( bfn_msz, csl); } + // Persistent X Matrix Gradient + if(reqt.task_xmat_persist) { + aos_stack.xmatS_device = mem.aligned_alloc( bfn_msz, csl); + aos_stack.xmatZ_device = mem.aligned_alloc( bfn_msz, csl); + if(reqt.task_xmat_grad) { + aos_stack.xmatS_dx_device = mem.aligned_alloc( bfn_msz, csl); + aos_stack.xmatS_dy_device = mem.aligned_alloc( bfn_msz, csl); + aos_stack.xmatS_dz_device = mem.aligned_alloc( bfn_msz, csl); + aos_stack.xmatZ_dx_device = mem.aligned_alloc( bfn_msz, csl); + aos_stack.xmatZ_dy_device = mem.aligned_alloc( bfn_msz, csl); + aos_stack.xmatZ_dz_device = mem.aligned_alloc( bfn_msz, csl); + } + } + // EXX Intermediates if(reqt.task_fmat) { aos_stack.fmat_exx_device = @@ -466,9 +494,26 @@ void XCDeviceAoSData::pack_and_send( buffer_adaptor d2bf_lapl_mem( aos_stack.d2bf_lapl_eval_device, total_nbe_bfn_npts ); + buffer_adaptor d3bf_lapgrad_x_mem( aos_stack.d3bf_lapgrad_x_eval_device, + total_nbe_bfn_npts ); + buffer_adaptor d3bf_lapgrad_y_mem( aos_stack.d3bf_lapgrad_y_eval_device, + total_nbe_bfn_npts ); + buffer_adaptor d3bf_lapgrad_z_mem( aos_stack.d3bf_lapgrad_z_eval_device, + total_nbe_bfn_npts ); + buffer_adaptor xmat_dx_mem( aos_stack.xmat_dx_device, total_nbe_bfn_npts ); buffer_adaptor xmat_dy_mem( aos_stack.xmat_dy_device, total_nbe_bfn_npts ); buffer_adaptor xmat_dz_mem( aos_stack.xmat_dz_device, total_nbe_bfn_npts ); + + buffer_adaptor xmatS_mem( aos_stack.xmatS_device, total_nbe_bfn_npts ); + buffer_adaptor xmatS_dx_mem( aos_stack.xmatS_dx_device, total_nbe_bfn_npts ); + buffer_adaptor xmatS_dy_mem( aos_stack.xmatS_dy_device, total_nbe_bfn_npts ); + buffer_adaptor xmatS_dz_mem( aos_stack.xmatS_dz_device, total_nbe_bfn_npts ); + + buffer_adaptor xmatZ_mem( 
aos_stack.xmatZ_device, total_nbe_bfn_npts ); + buffer_adaptor xmatZ_dx_mem( aos_stack.xmatZ_dx_device, total_nbe_bfn_npts ); + buffer_adaptor xmatZ_dy_mem( aos_stack.xmatZ_dy_device, total_nbe_bfn_npts ); + buffer_adaptor xmatZ_dz_mem( aos_stack.xmatZ_dz_device, total_nbe_bfn_npts ); const bool is_rks = terms.ks_scheme == RKS; const bool is_uks = terms.ks_scheme == UKS; @@ -477,38 +522,52 @@ void XCDeviceAoSData::pack_and_send( const bool is_gga = terms.xc_approx == GGA; const int den_fac = is_pol ? 2 : 1; const int gamma_fac = is_pol ? 3 : 1; - + // second derivative + const int rhorho_fac = is_pol ? 3 : 1; + const int rhogamma_fac = is_pol ? 6 : 1; + const int rhotau_fac = is_pol ? 4 : 1; buffer_adaptor eps_mem ( base_stack.eps_eval_device, total_npts ); // RKS - buffer_adaptor den_s_mem ( base_stack.den_s_eval_device, total_npts ); - buffer_adaptor gamma_mem ( base_stack.gamma_eval_device, total_npts * gamma_fac ); - buffer_adaptor vrho_mem ( base_stack.vrho_eval_device, total_npts * den_fac ); - buffer_adaptor vgamma_mem ( base_stack.vgamma_eval_device, total_npts * gamma_fac ); - - buffer_adaptor den_mem ( base_stack.den_eval_device, total_npts * den_fac ); - + buffer_adaptor den_s_mem ( base_stack.den_s_eval_device, total_npts ); + buffer_adaptor tau_s_mem ( base_stack.tau_s_eval_device, total_npts ); + buffer_adaptor lapl_s_mem ( base_stack.lapl_s_eval_device, total_npts ); + buffer_adaptor gamma_mem ( base_stack.gamma_eval_device, total_npts * gamma_fac ); + buffer_adaptor vrho_mem ( base_stack.vrho_eval_device, total_npts * den_fac ); + buffer_adaptor vgamma_mem ( base_stack.vgamma_eval_device, total_npts * gamma_fac ); + buffer_adaptor vtau_mem ( base_stack.vtau_eval_device, total_npts * den_fac ); + buffer_adaptor vlapl_mem ( base_stack.vlapl_eval_device, total_npts * den_fac ); // Polarized KS - buffer_adaptor den_z_mem ( base_stack.den_z_eval_device, total_npts ); - buffer_adaptor den_y_mem ( base_stack.den_y_eval_device, total_npts ); - buffer_adaptor den_x_mem ( base_stack.den_x_eval_device, total_npts ); + buffer_adaptor den_interleaved_mem ( base_stack.den_interleaved_device, total_npts * den_fac ); + buffer_adaptor tau_interleaved_mem ( base_stack.tau_interleaved_device, total_npts * den_fac ); + buffer_adaptor lapl_interleaved_mem ( base_stack.lapl_interleaved_device, total_npts * den_fac ); + buffer_adaptor den_z_mem ( base_stack.den_z_eval_device, total_npts ); + buffer_adaptor den_y_mem ( base_stack.den_y_eval_device, total_npts ); + buffer_adaptor den_x_mem ( base_stack.den_x_eval_device, total_npts ); + buffer_adaptor tau_z_mem ( base_stack.tau_z_eval_device, total_npts ); + buffer_adaptor lapl_z_mem ( base_stack.lapl_z_eval_device, total_npts ); + buffer_adaptor vrho_pos_mem( base_stack.vrho_pos_eval_device, total_npts ); buffer_adaptor vrho_neg_mem( base_stack.vrho_neg_eval_device, total_npts ); - buffer_adaptor K_z_mem ( base_stack.K_z_eval_device, total_npts ); - buffer_adaptor K_y_mem ( base_stack.K_y_eval_device, total_npts ); - buffer_adaptor K_x_mem ( base_stack.K_x_eval_device, total_npts ); - buffer_adaptor H_z_mem ( base_stack.H_z_eval_device, total_npts ); - buffer_adaptor H_y_mem ( base_stack.H_y_eval_device, total_npts ); - buffer_adaptor H_x_mem ( base_stack.H_x_eval_device, total_npts ); + buffer_adaptor vtau_pos_mem( base_stack.vtau_pos_eval_device, total_npts ); + buffer_adaptor vtau_neg_mem( base_stack.vtau_neg_eval_device, total_npts ); + buffer_adaptor vlapl_pos_mem( base_stack.vlapl_pos_eval_device, total_npts ); + buffer_adaptor 
vlapl_neg_mem( base_stack.vlapl_neg_eval_device, total_npts ); buffer_adaptor gamma_pp_mem( base_stack.gamma_pp_eval_device, total_npts ); buffer_adaptor gamma_pm_mem( base_stack.gamma_pm_eval_device, total_npts ); buffer_adaptor gamma_mm_mem( base_stack.gamma_mm_eval_device, total_npts ); buffer_adaptor vgamma_pp_mem( base_stack.vgamma_pp_eval_device, total_npts ); buffer_adaptor vgamma_pm_mem( base_stack.vgamma_pm_eval_device, total_npts ); buffer_adaptor vgamma_mm_mem( base_stack.vgamma_mm_eval_device, total_npts ); + buffer_adaptor K_z_mem ( base_stack.K_z_eval_device, total_npts ); + buffer_adaptor K_y_mem ( base_stack.K_y_eval_device, total_npts ); + buffer_adaptor K_x_mem ( base_stack.K_x_eval_device, total_npts ); + buffer_adaptor H_z_mem ( base_stack.H_z_eval_device, total_npts ); + buffer_adaptor H_y_mem ( base_stack.H_y_eval_device, total_npts ); + buffer_adaptor H_x_mem ( base_stack.H_x_eval_device, total_npts ); // Gradients buffer_adaptor dden_sx_mem( base_stack.dden_sx_eval_device, total_npts ); @@ -523,12 +582,101 @@ void XCDeviceAoSData::pack_and_send( buffer_adaptor dden_xx_mem( base_stack.dden_xx_eval_device, total_npts ); buffer_adaptor dden_xy_mem( base_stack.dden_xy_eval_device, total_npts ); buffer_adaptor dden_xz_mem( base_stack.dden_xz_eval_device, total_npts ); - - // MGGA - buffer_adaptor dden_lapl_mem( base_stack.den_lapl_eval_device, total_npts ); - buffer_adaptor vlapl_mem( base_stack.vlapl_eval_device, total_npts ); - buffer_adaptor tau_mem( base_stack.tau_eval_device, total_npts ); - buffer_adaptor vtau_mem( base_stack.vtau_eval_device, total_npts ); + + // second derivative + // RKS + buffer_adaptor tden_s_mem( base_stack.tden_s_eval_device, total_npts ); + buffer_adaptor ttau_s_mem( base_stack.ttau_s_eval_device, total_npts ); + buffer_adaptor tlapl_s_mem( base_stack.tlapl_s_eval_device, total_npts ); + buffer_adaptor v2rho2_mem( base_stack.v2rho2_eval_device, total_npts * rhorho_fac ); + buffer_adaptor v2rhogamma_mem( base_stack.v2rhogamma_eval_device, total_npts * rhogamma_fac ); + buffer_adaptor v2rholapl_mem( base_stack.v2rholapl_eval_device, total_npts * rhotau_fac ); + buffer_adaptor v2rhotau_mem( base_stack.v2rhotau_eval_device, total_npts * rhotau_fac ); + buffer_adaptor v2gamma2_mem( base_stack.v2gamma2_eval_device, total_npts * rhogamma_fac ); + buffer_adaptor v2gammalapl_mem( base_stack.v2gammalapl_eval_device, total_npts * rhogamma_fac ); + buffer_adaptor v2gammatau_mem( base_stack.v2gammatau_eval_device, total_npts * rhogamma_fac ); + buffer_adaptor v2lapl2_mem( base_stack.v2lapl2_eval_device, total_npts * rhorho_fac ); + buffer_adaptor v2lapltau_mem( base_stack.v2lapltau_eval_device, total_npts * rhotau_fac ); + buffer_adaptor v2tau2_mem( base_stack.v2tau2_eval_device, total_npts * rhorho_fac ); + + // Polarized KS + buffer_adaptor tden_z_mem( base_stack.tden_z_eval_device, total_npts ); + buffer_adaptor tden_y_mem( base_stack.tden_y_eval_device, total_npts ); + buffer_adaptor tden_x_mem( base_stack.tden_x_eval_device, total_npts ); + buffer_adaptor ttau_z_mem( base_stack.ttau_z_eval_device, total_npts ); + buffer_adaptor tlapl_z_mem( base_stack.tlapl_z_eval_device, total_npts ); + + buffer_adaptor v2rho2_a_a_mem( base_stack.v2rho2_a_a_eval_device, total_npts ); + buffer_adaptor v2rho2_a_b_mem( base_stack.v2rho2_a_b_eval_device, total_npts ); + buffer_adaptor v2rho2_b_b_mem( base_stack.v2rho2_b_b_eval_device, total_npts ); + buffer_adaptor v2rhogamma_a_aa_mem( base_stack.v2rhogamma_a_aa_eval_device, total_npts ); + buffer_adaptor 
v2rhogamma_a_ab_mem( base_stack.v2rhogamma_a_ab_eval_device, total_npts ); + buffer_adaptor v2rhogamma_a_bb_mem( base_stack.v2rhogamma_a_bb_eval_device, total_npts ); + buffer_adaptor v2rhogamma_b_aa_mem( base_stack.v2rhogamma_b_aa_eval_device, total_npts ); + buffer_adaptor v2rhogamma_b_ab_mem( base_stack.v2rhogamma_b_ab_eval_device, total_npts ); + buffer_adaptor v2rhogamma_b_bb_mem( base_stack.v2rhogamma_b_bb_eval_device, total_npts ); + buffer_adaptor v2rholapl_a_a_mem( base_stack.v2rholapl_a_a_eval_device, total_npts ); + buffer_adaptor v2rholapl_a_b_mem( base_stack.v2rholapl_a_b_eval_device, total_npts ); + buffer_adaptor v2rholapl_b_a_mem( base_stack.v2rholapl_b_a_eval_device, total_npts ); + buffer_adaptor v2rholapl_b_b_mem( base_stack.v2rholapl_b_b_eval_device, total_npts ); + buffer_adaptor v2rhotau_a_a_mem( base_stack.v2rhotau_a_a_eval_device, total_npts ); + buffer_adaptor v2rhotau_a_b_mem( base_stack.v2rhotau_a_b_eval_device, total_npts ); + buffer_adaptor v2rhotau_b_a_mem( base_stack.v2rhotau_b_a_eval_device, total_npts ); + buffer_adaptor v2rhotau_b_b_mem( base_stack.v2rhotau_b_b_eval_device, total_npts ); + buffer_adaptor v2gamma2_aa_aa_mem( base_stack.v2gamma2_aa_aa_eval_device, total_npts ); + buffer_adaptor v2gamma2_aa_ab_mem( base_stack.v2gamma2_aa_ab_eval_device, total_npts ); + buffer_adaptor v2gamma2_aa_bb_mem( base_stack.v2gamma2_aa_bb_eval_device, total_npts ); + buffer_adaptor v2gamma2_ab_ab_mem( base_stack.v2gamma2_ab_ab_eval_device, total_npts ); + buffer_adaptor v2gamma2_ab_bb_mem( base_stack.v2gamma2_ab_bb_eval_device, total_npts ); + buffer_adaptor v2gamma2_bb_bb_mem( base_stack.v2gamma2_bb_bb_eval_device, total_npts ); + buffer_adaptor v2gammalapl_aa_a_mem( base_stack.v2gammalapl_aa_a_eval_device, total_npts ); + buffer_adaptor v2gammalapl_aa_b_mem( base_stack.v2gammalapl_aa_b_eval_device, total_npts ); + buffer_adaptor v2gammalapl_ab_a_mem( base_stack.v2gammalapl_ab_a_eval_device, total_npts ); + buffer_adaptor v2gammalapl_ab_b_mem( base_stack.v2gammalapl_ab_b_eval_device, total_npts ); + buffer_adaptor v2gammalapl_bb_a_mem( base_stack.v2gammalapl_bb_a_eval_device, total_npts ); + buffer_adaptor v2gammalapl_bb_b_mem( base_stack.v2gammalapl_bb_b_eval_device, total_npts ); + buffer_adaptor v2gammatau_aa_a_mem( base_stack.v2gammatau_aa_a_eval_device, total_npts ); + buffer_adaptor v2gammatau_aa_b_mem( base_stack.v2gammatau_aa_b_eval_device, total_npts ); + buffer_adaptor v2gammatau_ab_a_mem( base_stack.v2gammatau_ab_a_eval_device, total_npts ); + buffer_adaptor v2gammatau_ab_b_mem( base_stack.v2gammatau_ab_b_eval_device, total_npts ); + buffer_adaptor v2gammatau_bb_a_mem( base_stack.v2gammatau_bb_a_eval_device, total_npts ); + buffer_adaptor v2gammatau_bb_b_mem( base_stack.v2gammatau_bb_b_eval_device, total_npts ); + buffer_adaptor v2lapl2_a_a_mem( base_stack.v2lapl2_a_a_eval_device, total_npts ); + buffer_adaptor v2lapl2_a_b_mem( base_stack.v2lapl2_a_b_eval_device, total_npts ); + buffer_adaptor v2lapl2_b_b_mem( base_stack.v2lapl2_b_b_eval_device, total_npts ); + buffer_adaptor v2lapltau_a_a_mem( base_stack.v2lapltau_a_a_eval_device, total_npts ); + buffer_adaptor v2lapltau_a_b_mem( base_stack.v2lapltau_a_b_eval_device, total_npts ); + buffer_adaptor v2lapltau_b_a_mem( base_stack.v2lapltau_b_a_eval_device, total_npts ); + buffer_adaptor v2lapltau_b_b_mem( base_stack.v2lapltau_b_b_eval_device, total_npts ); + buffer_adaptor v2tau2_a_a_mem( base_stack.v2tau2_a_a_eval_device, total_npts ); + buffer_adaptor v2tau2_a_b_mem( base_stack.v2tau2_a_b_eval_device, 
total_npts ); + buffer_adaptor v2tau2_b_b_mem( base_stack.v2tau2_b_b_eval_device, total_npts ); + + // Trial density gradient + buffer_adaptor tdden_sx_mem( base_stack.tdden_sx_eval_device, total_npts ); + buffer_adaptor tdden_sy_mem( base_stack.tdden_sy_eval_device, total_npts ); + buffer_adaptor tdden_sz_mem( base_stack.tdden_sz_eval_device, total_npts ); + buffer_adaptor tdden_zx_mem( base_stack.tdden_zx_eval_device, total_npts ); + buffer_adaptor tdden_zy_mem( base_stack.tdden_zy_eval_device, total_npts ); + buffer_adaptor tdden_zz_mem( base_stack.tdden_zz_eval_device, total_npts ); + buffer_adaptor tdden_yx_mem( base_stack.tdden_yx_eval_device, total_npts ); + buffer_adaptor tdden_yy_mem( base_stack.tdden_yy_eval_device, total_npts ); + buffer_adaptor tdden_yz_mem( base_stack.tdden_yz_eval_device, total_npts ); + buffer_adaptor tdden_xx_mem( base_stack.tdden_xx_eval_device, total_npts ); + buffer_adaptor tdden_xy_mem( base_stack.tdden_xy_eval_device, total_npts ); + buffer_adaptor tdden_xz_mem( base_stack.tdden_xz_eval_device, total_npts ); + + // Intermediate matrices for contraction + buffer_adaptor FXC_A_s_mem( base_stack.FXC_A_s_eval_device, total_npts); + buffer_adaptor FXC_Bx_s_mem( base_stack.FXC_Bx_s_eval_device, total_npts); + buffer_adaptor FXC_By_s_mem( base_stack.FXC_By_s_eval_device, total_npts); + buffer_adaptor FXC_Bz_s_mem( base_stack.FXC_Bz_s_eval_device, total_npts); + buffer_adaptor FXC_C_s_mem( base_stack.FXC_C_s_eval_device, total_npts); + buffer_adaptor FXC_A_z_mem( base_stack.FXC_A_z_eval_device, total_npts); + buffer_adaptor FXC_Bx_z_mem( base_stack.FXC_Bx_z_eval_device, total_npts); + buffer_adaptor FXC_By_z_mem( base_stack.FXC_By_z_eval_device, total_npts); + buffer_adaptor FXC_Bz_z_mem( base_stack.FXC_Bz_z_eval_device, total_npts); + buffer_adaptor FXC_C_z_mem( base_stack.FXC_C_z_eval_device, total_npts); for( auto& task : host_device_tasks ) { const auto npts = task.npts; @@ -594,6 +742,11 @@ void XCDeviceAoSData::pack_and_send( if( reqt.task_bfn_lapl ) { task.d2bflapl = d2bf_lapl_mem.aligned_alloc( nbe_bfn * npts, csl); } + if( reqt.task_bfn_lapgrad ) { + task.d3bflapl_x = d3bf_lapgrad_x_mem.aligned_alloc( nbe_bfn * npts, csl); + task.d3bflapl_y = d3bf_lapgrad_y_mem.aligned_alloc( nbe_bfn * npts, csl); + task.d3bflapl_z = d3bf_lapgrad_z_mem.aligned_alloc( nbe_bfn * npts, csl); + } // X Matrix gradient if( reqt.task_xmat_grad ) { @@ -602,12 +755,27 @@ void XCDeviceAoSData::pack_and_send( task.xmat_z = xmat_dz_mem.aligned_alloc( nbe_bfn * npts, csl); } + // Persistent X matrix + if( reqt.task_xmat_persist ) { + task.xmatS = xmatS_mem.aligned_alloc( nbe_bfn * npts, csl); + task.xmatZ = xmatZ_mem.aligned_alloc( nbe_bfn * npts, csl); + + if( reqt.task_xmat_grad ) { + task.xmatS_x = xmatS_dx_mem.aligned_alloc( nbe_bfn * npts, csl); + task.xmatS_y = xmatS_dy_mem.aligned_alloc( nbe_bfn * npts, csl); + task.xmatS_z = xmatS_dz_mem.aligned_alloc( nbe_bfn * npts, csl); + task.xmatZ_x = xmatZ_dx_mem.aligned_alloc( nbe_bfn * npts, csl); + task.xmatZ_y = xmatZ_dy_mem.aligned_alloc( nbe_bfn * npts, csl); + task.xmatZ_z = xmatZ_dz_mem.aligned_alloc( nbe_bfn * npts, csl); + } + } + // Grid function evaluations if (reqt.grid_den) { task.den_s = den_s_mem.aligned_alloc( npts, csl ); if(is_pol) { - task.den = den_mem.aligned_alloc(npts*2, csl); //Interleaved memory + task.den = den_interleaved_mem.aligned_alloc(npts*2, csl); //Interleaved memory task.den_z = den_z_mem.aligned_alloc( npts, csl); if ( is_gks ) { task.den_y = den_y_mem.aligned_alloc( npts, csl); @@ -616,6 +784,55 
@@ void XCDeviceAoSData::pack_and_send( } } + if(reqt.grid_den_grad) { + task.dden_sx = dden_sx_mem.aligned_alloc(npts, csl); + task.dden_sy = dden_sy_mem.aligned_alloc(npts, csl); + task.dden_sz = dden_sz_mem.aligned_alloc(npts, csl); + if( is_pol ) { + task.dden_zx = dden_zx_mem.aligned_alloc( npts, csl ); + task.dden_zy = dden_zy_mem.aligned_alloc( npts, csl ); + task.dden_zz = dden_zz_mem.aligned_alloc( npts, csl ); + if( is_gks ) { + task.dden_yx = dden_yx_mem.aligned_alloc( npts, csl ); + task.dden_yy = dden_yy_mem.aligned_alloc( npts, csl ); + task.dden_yz = dden_yz_mem.aligned_alloc( npts, csl ); + task.dden_xx = dden_xx_mem.aligned_alloc( npts, csl ); + task.dden_xy = dden_xy_mem.aligned_alloc( npts, csl ); + task.dden_xz = dden_xz_mem.aligned_alloc( npts, csl ); + } + } + } + + if( reqt.grid_gamma ) { + task.gamma = gamma_mem.aligned_alloc( npts*gamma_fac, csl); + if( is_pol ) { + task.gamma_pp = gamma_pp_mem.aligned_alloc( npts, csl); + task.gamma_pm = gamma_pm_mem.aligned_alloc( npts, csl); + task.gamma_mm = gamma_mm_mem.aligned_alloc( npts, csl); + } + } + + if (reqt.grid_tau) { + task.tau_s = tau_s_mem.aligned_alloc( npts, csl ); + if(is_pol) { + task.tau = tau_interleaved_mem.aligned_alloc(npts*2, csl); //Interleaved memory + task.tau_z = tau_z_mem.aligned_alloc( npts, csl); + } + } + + if (reqt.grid_lapl) { + task.lapl_s = lapl_s_mem.aligned_alloc( npts, csl ); + if(is_pol) { + task.lapl = lapl_interleaved_mem.aligned_alloc(npts*2, csl); //Interleaved memory + task.lapl_z = lapl_z_mem.aligned_alloc( npts, csl); + } + } + + + + if(reqt.grid_eps) + task.eps = eps_mem.aligned_alloc( reqt.grid_eps_size(npts), csl); + if( reqt.grid_vrho ) { task.vrho = vrho_mem.aligned_alloc( npts*den_fac, csl); if( is_pol ) { @@ -632,33 +849,23 @@ void XCDeviceAoSData::pack_and_send( task.vgamma_mm = vgamma_mm_mem.aligned_alloc( npts, csl); } } - if( reqt.grid_gamma ) { - task.gamma = gamma_mem.aligned_alloc( npts*gamma_fac, csl); + + if( reqt.grid_vtau ) { + task.vtau = vtau_mem.aligned_alloc( npts*den_fac, csl); if( is_pol ) { - task.gamma_pp = gamma_pp_mem.aligned_alloc( npts, csl); - task.gamma_pm = gamma_pm_mem.aligned_alloc( npts, csl); - task.gamma_mm = gamma_mm_mem.aligned_alloc( npts, csl); + task.vtau_pos = vtau_pos_mem.aligned_alloc( npts, csl); + task.vtau_neg = vtau_neg_mem.aligned_alloc( npts, csl); } } - if(reqt.grid_den_grad) { - task.dden_sx = dden_sx_mem.aligned_alloc(npts, csl); - task.dden_sy = dden_sy_mem.aligned_alloc(npts, csl); - task.dden_sz = dden_sz_mem.aligned_alloc(npts, csl); + if( reqt.grid_vlapl ) { + task.vlapl = vlapl_mem.aligned_alloc( npts*den_fac, csl); if( is_pol ) { - task.dden_zx = dden_zx_mem.aligned_alloc( npts, csl ); - task.dden_zy = dden_zy_mem.aligned_alloc( npts, csl ); - task.dden_zz = dden_zz_mem.aligned_alloc( npts, csl ); - if( is_gks ) { - task.dden_yx = dden_yx_mem.aligned_alloc( npts, csl ); - task.dden_yy = dden_yy_mem.aligned_alloc( npts, csl ); - task.dden_yz = dden_yz_mem.aligned_alloc( npts, csl ); - task.dden_xx = dden_xx_mem.aligned_alloc( npts, csl ); - task.dden_xy = dden_xy_mem.aligned_alloc( npts, csl ); - task.dden_xz = dden_xz_mem.aligned_alloc( npts, csl ); - } + task.vlapl_pos = vlapl_pos_mem.aligned_alloc( npts, csl); + task.vlapl_neg = vlapl_neg_mem.aligned_alloc( npts, csl); } } + // H, K terms (GKS) if( is_gks ) { @@ -671,21 +878,6 @@ void XCDeviceAoSData::pack_and_send( task.H_z = H_z_mem.aligned_alloc( npts, csl ); } } - - task.eps = eps_mem.aligned_alloc( reqt.grid_eps_size(npts), csl); - - - if(reqt.grid_den_lapl) { - 
task.denlapl = dden_lapl_mem.aligned_alloc(npts, csl); - } - - task.tau = - tau_mem.aligned_alloc( reqt.grid_tau_size(npts), csl); - - task.vtau = - vtau_mem.aligned_alloc( reqt.grid_vtau_size(npts), csl); - task.vlapl = - vlapl_mem.aligned_alloc( reqt.grid_vlapl_size(npts), csl); // EXX Specific task.fmat = fmat_mem.aligned_alloc( @@ -699,6 +891,185 @@ void XCDeviceAoSData::pack_and_send( reqt.task_bfn_shell_indirection_size(nbe_bfn), csl ); + // Second derivative + if( terms.fxc_contraction ) { + // Trial density + if(reqt.grid_tden) { + task.tden_s = tden_s_mem.aligned_alloc( npts, csl ); + if(is_pol) { + task.tden_z = tden_z_mem.aligned_alloc( npts, csl ); + if(is_gks) { + task.tden_y = tden_y_mem.aligned_alloc( npts, csl ); + task.tden_x = tden_x_mem.aligned_alloc( npts, csl ); + } + } + } + + if(reqt.grid_tden_grad) { + task.tdden_sx = tdden_sx_mem.aligned_alloc( npts, csl ); + task.tdden_sy = tdden_sy_mem.aligned_alloc( npts, csl ); + task.tdden_sz = tdden_sz_mem.aligned_alloc( npts, csl ); + if(is_pol) { + task.tdden_zx = tdden_zx_mem.aligned_alloc( npts, csl ); + task.tdden_zy = tdden_zy_mem.aligned_alloc( npts, csl ); + task.tdden_zz = tdden_zz_mem.aligned_alloc( npts, csl ); + if(is_gks) { + task.tdden_yx = tdden_yx_mem.aligned_alloc( npts, csl ); + task.tdden_yy = tdden_yy_mem.aligned_alloc( npts, csl ); + task.tdden_yz = tdden_yz_mem.aligned_alloc( npts, csl ); + task.tdden_xx = tdden_xx_mem.aligned_alloc( npts, csl ); + task.tdden_xy = tdden_xy_mem.aligned_alloc( npts, csl ); + task.tdden_xz = tdden_xz_mem.aligned_alloc( npts, csl ); + } + } + } + + + if(reqt.grid_ttau) { + task.ttau_s = ttau_s_mem.aligned_alloc( npts, csl ); + if(is_pol) { + task.ttau_z = ttau_z_mem.aligned_alloc( npts, csl ); + } + } + + if(reqt.grid_tlapl) { + task.tlapl_s = tlapl_s_mem.aligned_alloc( npts, csl ); + if(is_pol) { + task.tlapl_z = tlapl_z_mem.aligned_alloc( npts, csl ); + } + } + + // Second derivatives of XC functional + if(reqt.grid_v2rho2) { + task.v2rho2 = v2rho2_mem.aligned_alloc( npts*rhorho_fac, csl ); + if(is_pol) { + task.v2rho2_a_a = v2rho2_a_a_mem.aligned_alloc( npts, csl ); + task.v2rho2_a_b = v2rho2_a_b_mem.aligned_alloc( npts, csl ); + task.v2rho2_b_b = v2rho2_b_b_mem.aligned_alloc( npts, csl ); + } + } + + if(reqt.grid_v2rhogamma) { + task.v2rhogamma = v2rhogamma_mem.aligned_alloc( npts*rhogamma_fac, csl ); + if(is_pol) { + task.v2rhogamma_a_aa = v2rhogamma_a_aa_mem.aligned_alloc( npts, csl ); + task.v2rhogamma_a_ab = v2rhogamma_a_ab_mem.aligned_alloc( npts, csl ); + task.v2rhogamma_a_bb = v2rhogamma_a_bb_mem.aligned_alloc( npts, csl ); + task.v2rhogamma_b_aa = v2rhogamma_b_aa_mem.aligned_alloc( npts, csl ); + task.v2rhogamma_b_ab = v2rhogamma_b_ab_mem.aligned_alloc( npts, csl ); + task.v2rhogamma_b_bb = v2rhogamma_b_bb_mem.aligned_alloc( npts, csl ); + } + } + + if(reqt.grid_v2rholapl) { + task.v2rholapl = v2rholapl_mem.aligned_alloc( npts*rhotau_fac, csl ); + if(is_pol) { + task.v2rholapl_a_a = v2rholapl_a_a_mem.aligned_alloc( npts, csl ); + task.v2rholapl_a_b = v2rholapl_a_b_mem.aligned_alloc( npts, csl ); + task.v2rholapl_b_a = v2rholapl_b_a_mem.aligned_alloc( npts, csl ); + task.v2rholapl_b_b = v2rholapl_b_b_mem.aligned_alloc( npts, csl ); + } + } + + if(reqt.grid_v2rhotau) { + task.v2rhotau = v2rhotau_mem.aligned_alloc( npts*rhotau_fac, csl ); + if(is_pol) { + task.v2rhotau_a_a = v2rhotau_a_a_mem.aligned_alloc( npts, csl ); + task.v2rhotau_a_b = v2rhotau_a_b_mem.aligned_alloc( npts, csl ); + task.v2rhotau_b_a = v2rhotau_b_a_mem.aligned_alloc( npts, csl ); + 
task.v2rhotau_b_b = v2rhotau_b_b_mem.aligned_alloc( npts, csl ); + } + } + + if(reqt.grid_v2gamma2) { + task.v2gamma2 = v2gamma2_mem.aligned_alloc( npts*rhogamma_fac, csl ); + if(is_pol) { + task.v2gamma2_aa_aa = v2gamma2_aa_aa_mem.aligned_alloc( npts, csl ); + task.v2gamma2_aa_ab = v2gamma2_aa_ab_mem.aligned_alloc( npts, csl ); + task.v2gamma2_aa_bb = v2gamma2_aa_bb_mem.aligned_alloc( npts, csl ); + task.v2gamma2_ab_ab = v2gamma2_ab_ab_mem.aligned_alloc( npts, csl ); + task.v2gamma2_ab_bb = v2gamma2_ab_bb_mem.aligned_alloc( npts, csl ); + task.v2gamma2_bb_bb = v2gamma2_bb_bb_mem.aligned_alloc( npts, csl ); + } + } + + if(reqt.grid_v2gammalapl) { + task.v2gammalapl = v2gammalapl_mem.aligned_alloc( npts*rhogamma_fac, csl ); + if(is_pol) { + task.v2gammalapl_aa_a = v2gammalapl_aa_a_mem.aligned_alloc( npts, csl ); + task.v2gammalapl_aa_b = v2gammalapl_aa_b_mem.aligned_alloc( npts, csl ); + task.v2gammalapl_ab_a = v2gammalapl_ab_a_mem.aligned_alloc( npts, csl ); + task.v2gammalapl_ab_b = v2gammalapl_ab_b_mem.aligned_alloc( npts, csl ); + task.v2gammalapl_bb_a = v2gammalapl_bb_a_mem.aligned_alloc( npts, csl ); + task.v2gammalapl_bb_b = v2gammalapl_bb_b_mem.aligned_alloc( npts, csl ); + } + } + + if(reqt.grid_v2gammatau) { + task.v2gammatau = v2gammatau_mem.aligned_alloc( npts*rhogamma_fac, csl ); + if(is_pol) { + task.v2gammatau_aa_a = v2gammatau_aa_a_mem.aligned_alloc( npts, csl ); + task.v2gammatau_aa_b = v2gammatau_aa_b_mem.aligned_alloc( npts, csl ); + task.v2gammatau_ab_a = v2gammatau_ab_a_mem.aligned_alloc( npts, csl ); + task.v2gammatau_ab_b = v2gammatau_ab_b_mem.aligned_alloc( npts, csl ); + task.v2gammatau_bb_a = v2gammatau_bb_a_mem.aligned_alloc( npts, csl ); + task.v2gammatau_bb_b = v2gammatau_bb_b_mem.aligned_alloc( npts, csl ); + } + } + + if(reqt.grid_v2lapl2) { + task.v2lapl2 = v2lapl2_mem.aligned_alloc( npts*rhorho_fac, csl ); + if(is_pol) { + task.v2lapl2_a_a = v2lapl2_a_a_mem.aligned_alloc( npts, csl ); + task.v2lapl2_a_b = v2lapl2_a_b_mem.aligned_alloc( npts, csl ); + task.v2lapl2_b_b = v2lapl2_b_b_mem.aligned_alloc( npts, csl ); + } + } + + if(reqt.grid_v2lapltau) { + task.v2lapltau = v2lapltau_mem.aligned_alloc( npts*rhotau_fac, csl ); + if(is_pol) { + task.v2lapltau_a_a = v2lapltau_a_a_mem.aligned_alloc( npts, csl ); + task.v2lapltau_a_b = v2lapltau_a_b_mem.aligned_alloc( npts, csl ); + task.v2lapltau_b_a = v2lapltau_b_a_mem.aligned_alloc( npts, csl ); + task.v2lapltau_b_b = v2lapltau_b_b_mem.aligned_alloc( npts, csl ); + } + } + + if(reqt.grid_v2tau2) { + task.v2tau2 = v2tau2_mem.aligned_alloc( npts*rhorho_fac, csl ); + if(is_pol) { + task.v2tau2_a_a = v2tau2_a_a_mem.aligned_alloc( npts, csl ); + task.v2tau2_a_b = v2tau2_a_b_mem.aligned_alloc( npts, csl ); + task.v2tau2_b_b = v2tau2_b_b_mem.aligned_alloc( npts, csl ); + } + } + + // Intermediate matrices for contraction + if(reqt.grid_FXC_A) { + task.FXC_A_s = FXC_A_s_mem.aligned_alloc( npts, csl ); + if (is_pol) + task.FXC_A_z = FXC_A_z_mem.aligned_alloc( npts, csl ); + } + + if(reqt.grid_FXC_B) { + task.FXC_Bx_s = FXC_Bx_s_mem.aligned_alloc( npts, csl ); + task.FXC_By_s = FXC_By_s_mem.aligned_alloc( npts, csl ); + task.FXC_Bz_s = FXC_Bz_s_mem.aligned_alloc( npts, csl ); + if (is_pol) { + task.FXC_Bx_z = FXC_Bx_z_mem.aligned_alloc( npts, csl ); + task.FXC_By_z = FXC_By_z_mem.aligned_alloc( npts, csl ); + task.FXC_Bz_z = FXC_Bz_z_mem.aligned_alloc( npts, csl ); + } + } + + if(reqt.grid_FXC_C) { + task.FXC_C_s = FXC_C_s_mem.aligned_alloc( npts, csl ); + if (is_pol) + task.FXC_C_z = FXC_C_z_mem.aligned_alloc( npts, csl ); + 
} + } + } // Loop over device tasks } // Setup indirection diff --git a/src/xc_integrator/xc_data/device/xc_device_aos_data.hpp b/src/xc_integrator/xc_data/device/xc_device_aos_data.hpp index db399d07..d1c3b782 100644 --- a/src/xc_integrator/xc_data/device/xc_device_aos_data.hpp +++ b/src/xc_integrator/xc_data/device/xc_device_aos_data.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -41,6 +45,9 @@ struct XCDeviceAoSData : public XCDeviceStackData { double* d2bf_zz_eval_device = nullptr; ///< 2nd Derivative of `bf_eval_device` wrt z+z double* d2bf_lapl_eval_device = nullptr; ///< Laplacian of `bf_eval_device` + double* d3bf_lapgrad_x_eval_device = nullptr; ///< Laplacian derivative of bf_eval_device wrt x + double* d3bf_lapgrad_y_eval_device = nullptr; ///< Laplacian derivative of bf_eval_device wrt y + double* d3bf_lapgrad_z_eval_device = nullptr; ///< Laplacian derivative of bf_eval_device wrt z // VXC Z Matrix double* zmat_vxc_device = nullptr; @@ -51,6 +58,16 @@ struct XCDeviceAoSData : public XCDeviceStackData { double* xmat_dy_device = nullptr; double* xmat_dz_device = nullptr; + // Persistent X mat + double* xmatS_device = nullptr; + double* xmatS_dx_device = nullptr; + double* xmatS_dy_device = nullptr; + double* xmatS_dz_device = nullptr; + double* xmatZ_device = nullptr; + double* xmatZ_dx_device = nullptr; + double* xmatZ_dy_device = nullptr; + double* xmatZ_dz_device = nullptr; + // EXX Intermediates double* fmat_exx_device = nullptr; double* gmat_exx_device = nullptr; diff --git a/src/xc_integrator/xc_data/device/xc_device_data.hpp b/src/xc_integrator/xc_data/device/xc_device_data.hpp index fde8158b..781e2372 100644 --- a/src/xc_integrator/xc_data/device/xc_device_data.hpp +++ b/src/xc_integrator/xc_data/device/xc_device_data.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
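The per-task FXC allocations above all reuse one pattern: each grid quantity gets a single device region sized for the whole batch (total_npts, or nbe*npts for matrix quantities), wrapped in a buffer_adaptor, and the task loop then carves per-task windows out of it. A simplified host-side mock of that carving, assuming only the (pointer, size) construction and sequential allocation behavior visible in this file (the real buffer_adaptor also handles alignment and the csl argument):

// Simplified illustration of the buffer_adaptor carving pattern in pack_and_send.
#include <cstddef>
#include <stdexcept>
#include <vector>
#include <iostream>

struct mock_buffer_adaptor {
  double* ptr; std::size_t left;
  mock_buffer_adaptor( double* p, std::size_t n ) : ptr(p), left(n) {}
  double* aligned_alloc( std::size_t count ) {
    if( count > left ) throw std::runtime_error("buffer exhausted");
    double* window = ptr; ptr += count; left -= count;
    return window;
  }
};

int main() {
  const std::size_t total_npts = 1000;
  std::vector<double> fxc_a_batch( total_npts );              // stands in for FXC_A_s_eval_device
  mock_buffer_adaptor FXC_A_s_mem( fxc_a_batch.data(), total_npts );

  for( std::size_t npts : { 300u, 450u, 250u } ) {            // per-task point counts
    double* task_FXC_A_s = FXC_A_s_mem.aligned_alloc( npts ); // per-task window
    std::cout << "task window offset: " << (task_FXC_A_s - fxc_a_batch.data()) << "\n";
  }
}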
* * See LICENSE.txt for details */ @@ -49,6 +53,7 @@ struct integrator_term_tracker { bool exc_grad = false; bool exx = false; bool exx_ek_screening = false; + bool fxc_contraction = false; integrator_xc_approx xc_approx = _UNDEF_APPROX; integrator_ks_scheme ks_scheme = _UNDEF_SCHEME; inline void reset() { @@ -72,7 +77,7 @@ struct required_term_storage { // Evaluation of functions on the grid (linear storage) bool grid_den = false; bool grid_den_grad = false; - bool grid_den_lapl = false; + bool grid_lapl = false; bool grid_gamma = false; bool grid_tau = false; bool grid_eps = false; @@ -80,6 +85,25 @@ struct required_term_storage { bool grid_vgamma = false; bool grid_vtau = false; bool grid_vlapl = false; + + // Second derivative variables + bool grid_tden = false; + bool grid_tden_grad = false; + bool grid_ttau = false; + bool grid_tlapl = false; + bool grid_v2rho2 = false; + bool grid_v2rhogamma = false; + bool grid_v2rholapl = false; + bool grid_v2rhotau = false; + bool grid_v2gamma2 = false; + bool grid_v2gammalapl = false; + bool grid_v2gammatau = false; + bool grid_v2lapl2 = false; + bool grid_v2lapltau = false; + bool grid_v2tau2 = false; + bool grid_FXC_A = false; + bool grid_FXC_B = false; + bool grid_FXC_C = false; // Reference flags for memory management use @@ -114,11 +138,29 @@ struct required_term_storage { } return 0ul; } - inline size_t grid_den_lapl_size(size_t npts){ - return PRDVL(grid_den_lapl, npts); + inline size_t grid_lapl_size(size_t npts){ + if(grid_lapl) { + switch(ref_tracker.ks_scheme) { + case UKS: + case GKS: + return 4 * npts; + default: + return npts; + } + } + return 0ul; } inline size_t grid_tau_size(size_t npts){ - return PRDVL(grid_tau, npts); + if(grid_tau) { + switch(ref_tracker.ks_scheme) { + case UKS: + case GKS: + return 4 * npts; + default: + return npts; + } + } + return 0ul; } inline size_t grid_eps_size(size_t npts){ return PRDVL(grid_eps, npts); @@ -147,10 +189,175 @@ struct required_term_storage { return 0ul; } inline size_t grid_vtau_size(size_t npts){ - return PRDVL(grid_vtau, npts); + if(grid_vtau) { + switch(ref_tracker.ks_scheme) { + case UKS: + case GKS: + return 4 * npts; + default: + return npts; + } + } + return 0ul; } inline size_t grid_vlapl_size(size_t npts){ - return PRDVL(grid_vlapl, npts); + if(grid_vlapl) { + switch(ref_tracker.ks_scheme) { + case UKS: + case GKS: + return 4 * npts; + default: + return npts; + } + } + return 0ul; + } + + // Size calculators for second derivative variables + inline size_t grid_tden_size(size_t npts){ + if( grid_tden ) { + if( ref_tracker.ks_scheme == RKS ) return npts; + // 2*npts for S,Z densities, 2*npts for interleaved density + if( ref_tracker.ks_scheme == UKS ) return 2*npts; + // Same as above, but also X,Y densities + if( ref_tracker.ks_scheme == GKS ) return 4*npts; + } + return 0ul; + } + + inline size_t grid_tden_grad_size(size_t npts){ + if( grid_tden_grad ) { + // 3*npts for each density in play + if( ref_tracker.ks_scheme == RKS ) return 3*npts; + if( ref_tracker.ks_scheme == UKS ) return 6*npts; + if( ref_tracker.ks_scheme == GKS ) return 12*npts; + } + return 0ul; + } + + inline size_t grid_tlapl_size(size_t npts){ + if(grid_tlapl) { + switch(ref_tracker.ks_scheme) { + case UKS: + case GKS: + return 2 * npts; + default: + return npts; + } + } + return 0ul; + } + + inline size_t grid_ttau_size(size_t npts){ + if(grid_ttau) { + switch(ref_tracker.ks_scheme) { + case UKS: + case GKS: + return 2 * npts; + default: + return npts; + } + } + return 0ul; + } + + inline size_t 
grid_v2rho2_size(size_t npts){ + if(grid_v2rho2) { + if( ref_tracker.ks_scheme == RKS ) return npts; + if( ref_tracker.ks_scheme == UKS or ref_tracker.ks_scheme == GKS ) return 6*npts; + } + return 0ul; + } + + inline size_t grid_v2rhogamma_size(size_t npts){ + if(grid_v2rhogamma) { + if( ref_tracker.ks_scheme == RKS ) return npts; + if( ref_tracker.ks_scheme == UKS or ref_tracker.ks_scheme == GKS ) return 12*npts; + } + return 0ul; + } + + inline size_t grid_v2rholapl_size(size_t npts){ + if(grid_v2rholapl) { + if( ref_tracker.ks_scheme == RKS ) return npts; + if( ref_tracker.ks_scheme == UKS or ref_tracker.ks_scheme == GKS ) return 8*npts; + } + return 0ul; + } + + inline size_t grid_v2rhotau_size(size_t npts){ + if(grid_v2rhotau) { + if( ref_tracker.ks_scheme == RKS ) return npts; + if( ref_tracker.ks_scheme == UKS or ref_tracker.ks_scheme == GKS ) return 8*npts; + } + return 0ul; + } + + inline size_t grid_v2gamma2_size(size_t npts){ + if(grid_v2gamma2) { + if( ref_tracker.ks_scheme == RKS ) return npts; + if( ref_tracker.ks_scheme == UKS or ref_tracker.ks_scheme == GKS ) return 12*npts; + } + return 0ul; + } + + inline size_t grid_v2gammalapl_size(size_t npts){ + if(grid_v2gammalapl) { + if( ref_tracker.ks_scheme == RKS ) return npts; + if( ref_tracker.ks_scheme == UKS or ref_tracker.ks_scheme == GKS ) return 12*npts; + } + return 0ul; + } + + inline size_t grid_v2gammatau_size(size_t npts){ + if(grid_v2gammatau) { + if( ref_tracker.ks_scheme == RKS ) return npts; + if( ref_tracker.ks_scheme == UKS or ref_tracker.ks_scheme == GKS ) return 12*npts; + } + return 0ul; + } + + inline size_t grid_v2lapl2_size(size_t npts){ + if(grid_v2lapl2) { + if( ref_tracker.ks_scheme == RKS ) return npts; + if( ref_tracker.ks_scheme == UKS or ref_tracker.ks_scheme == GKS ) return 6*npts; + } + return 0ul; + } + + inline size_t grid_v2lapltau_size(size_t npts){ + if(grid_v2lapltau) { + if( ref_tracker.ks_scheme == RKS ) return npts; + if( ref_tracker.ks_scheme == UKS or ref_tracker.ks_scheme == GKS ) return 8*npts; + } + return 0ul; + } + + inline size_t grid_v2tau2_size(size_t npts){ + if(grid_v2tau2) { + if( ref_tracker.ks_scheme == RKS ) return npts; + if( ref_tracker.ks_scheme == UKS or ref_tracker.ks_scheme == GKS ) return 6*npts; + } + return 0ul; + } + + inline size_t grid_FXC_A_size(size_t npts){ + if( grid_FXC_A ) { + if( ref_tracker.ks_scheme == RKS ) return npts; + if( ref_tracker.ks_scheme == UKS or ref_tracker.ks_scheme == GKS ) return 2*npts; + } + } + inline size_t grid_FXC_B_size(size_t npts){ + if( grid_FXC_B ) { + if( ref_tracker.ks_scheme == RKS ) return 3*npts; + if( ref_tracker.ks_scheme == UKS or ref_tracker.ks_scheme == GKS ) return 6*npts; + } + } + inline size_t grid_FXC_C_size(size_t npts){ + if( grid_FXC_C ) { + if( ref_tracker.ks_scheme == RKS ) return npts; + if( ref_tracker.ks_scheme == UKS or ref_tracker.ks_scheme == GKS ) return 2*npts; + } } @@ -160,9 +367,11 @@ struct required_term_storage { bool task_bfn_grad = false; bool task_bfn_hess = false; bool task_bfn_lapl = false; + bool task_bfn_lapgrad = false; bool task_zmat = false; bool task_xmat = false; bool task_xmat_grad = false; + bool task_xmat_persist = false; bool task_fmat = false; bool task_gmat = false; bool task_nbe_scr = false; @@ -181,12 +390,19 @@ struct required_term_storage { inline size_t task_bfn_lapl_size(size_t nbe, size_t npts) { return PRDVL(task_bfn_lapl, nbe * npts); } + inline size_t task_bfn_lapgrad_size(size_t nbe, size_t npts) { + return PRDVL(task_bfn_lapgrad, 3 * nbe * npts); + } inline 
size_t task_zmat_size(size_t nbe, size_t npts) { return PRDVL(task_zmat, nbe * npts); } inline size_t task_xmat_grad_size(size_t nbe, size_t npts) { return PRDVL(task_xmat_grad, 3 * nbe * npts); } + inline size_t task_xmat_persist_size(size_t nbe, size_t npts) { + // TODO Make this more robust + return PRDVL(task_xmat_persist, 2 * (task_xmat_grad ? 4 : 1) * nbe * npts); + } inline size_t task_fmat_size(size_t nbe, size_t npts) { return PRDVL(task_fmat, nbe * npts); } @@ -305,7 +521,8 @@ struct required_term_storage { } // Allocated terms for XC calculations - const bool is_xc = tracker.exc_vxc or tracker.exc_grad; + const bool is_xc = tracker.exc_vxc or tracker.exc_grad or tracker.fxc_contraction; + const bool is_2nd_deriv = tracker.fxc_contraction; ref_tracker = tracker; @@ -320,10 +537,11 @@ struct required_term_storage { const bool need_lapl = tracker.xc_approx == MGGA_LAPL; const bool is_mgga = is_xc and (need_tau or need_lapl); const bool is_grad = tracker.exc_grad; + const bool is_rks = tracker.ks_scheme == RKS; grid_den = true; grid_den_grad = is_gga or is_mgga or is_grad; - grid_den_lapl = need_lapl; + grid_lapl = need_lapl; grid_gamma = is_gga or is_mgga; grid_tau = is_mgga; grid_eps = true; @@ -334,11 +552,13 @@ struct required_term_storage { task_bfn = true; task_bfn_grad = is_gga or is_mgga or is_grad; - task_bfn_hess = is_gga and is_grad; + task_bfn_hess = (is_gga or is_mgga) and is_grad; task_bfn_lapl = need_lapl; + task_bfn_lapgrad = need_lapl and is_grad; task_zmat = true; task_xmat = true; task_xmat_grad = is_mgga or (is_gga and is_grad); + task_xmat_persist = is_grad and not is_rks; task_nbe_scr = true; task_submat_cut_bfn = true; @@ -350,6 +570,31 @@ struct required_term_storage { shell_to_task_bfn = true; } + if(is_2nd_deriv) { + grid_eps = false; + + grid_tden = true; + grid_tden_grad = true; + grid_tlapl = true; + grid_ttau = true; + grid_v2rho2 = true; + grid_v2rhogamma= true; + grid_v2rholapl = true; + grid_v2rhotau = true; + grid_v2gamma2 = true; + grid_v2gammalapl= true; + grid_v2gammatau= true; + grid_v2lapl2 = true; + grid_v2lapltau = true; + grid_v2tau2 = true; + grid_FXC_A = true; + grid_FXC_B = true; + grid_FXC_C = true; + + // task_bfn_hess = is_gga or is_mgga or is_grad; // TODO: Check this + // task_bfn_lapgrad = need_lapl and is_grad; // TODO: Check this + } + // Density integration if(tracker.den) { grid_den = true; @@ -409,6 +654,7 @@ std::ostream& operator<<( std::ostream& out, const integrator_term_tracker& t ) out << " WEIGHTS " << t.weights << std::endl; out << " DEN " << t.den << std::endl; out << " EXC_VXC " << t.exc_vxc << std::endl; + out << " FXC_CONTRACTION " << t.fxc_contraction << std::endl; out << " EXC_GRAD " << t.exc_grad << std::endl; out << " EXX " << t.exx << std::endl; return out; @@ -432,13 +678,19 @@ struct XCDeviceData { virtual void allocate_static_data_weights( int32_t natoms ) = 0; virtual void allocate_static_data_exc_vxc( int32_t nbf, int32_t nshells, integrator_term_tracker enabled_terms, bool do_vxc ) = 0; virtual void allocate_static_data_den( int32_t nbf, int32_t nshells ) = 0; - virtual void allocate_static_data_exc_grad( int32_t nbf, int32_t nshells, int32_t natoms ) = 0; + virtual void allocate_static_data_exc_grad( int32_t nbf, int32_t nshells, int32_t natoms, integrator_term_tracker enabled_terms ) = 0; virtual void allocate_static_data_exx( int32_t nbf, int32_t nshells, size_t nshell_pairs, size_t nprim_pair_total, int32_t max_l ) = 0; virtual void allocate_static_data_exx_ek_screening( size_t ntasks, int32_t nbf, 
int32_t nshells, int nshell_pairs, int32_t max_l ) = 0; + virtual void allocate_static_data_fxc_contraction( int32_t nbf, int32_t nshells, integrator_term_tracker enabled_terms) = 0; // Send persistent data from host to device virtual void send_static_data_weights( const Molecule& mol, const MolMeta& meta ) = 0; - virtual void send_static_data_density_basis( const double* Ps, int32_t ldps, const double* Pz, int32_t ldpz, const double* Py, int32_t ldpy, const double* Px, int32_t ldpx, const BasisSet& basis ) = 0; + virtual void send_static_data_density_basis( const double* Ps, int32_t ldps, + const double* Pz, int32_t ldpz, const double* Py, int32_t ldpy, + const double* Px, int32_t ldpx, const BasisSet& basis ) = 0; + virtual void send_static_data_trial_density( + const double* tPs, int32_t ldtps, const double* tPz, int32_t ldtpz, + const double* tPy, int32_t ldtpy, const double* tPx, int32_t ldtpx ) = 0; virtual void send_static_data_shell_pairs( const BasisSet&, const ShellPairCollection& ) = 0; virtual void send_static_data_exx_ek_screening( const double* V_max, int32_t ldv, const BasisSetMap&, const ShellPairCollection& ) = 0; @@ -457,6 +709,9 @@ struct XCDeviceData { /// Zero out intermediates for EXX EK screening virtual void zero_exx_ek_screening_intermediates() = 0; + /// Zero out the FXC contraction integrands in device memory + virtual void zero_fxc_contraction_integrands() = 0; + /** Generate task batch to execute on device * * Generate a batch of XC tasks to execute on the device and @@ -487,6 +742,10 @@ struct XCDeviceData { double* VXCs, int32_t ldvxcs, double* VXCz, int32_t ldvxcz, double* VXCy, int32_t ldvxcy, double* VXCx, int32_t ldvxcx ) = 0; + virtual void retrieve_fxc_contraction_integrands( double* N_EL, + double* FXCs, int32_t ldfxcs, double* FXCz, int32_t ldfxcz, + double* FXCy, int32_t ldfxcy, double* FXCx, int32_t ldfxcx ) = 0; + /** Retreive EXC Gradient integrands from device memory * * @param[out] EXC_GRAD Integrated XC Gradient (host) for XC task @@ -516,6 +775,10 @@ struct XCDeviceData { virtual double* exc_device_data() = 0; virtual double* nel_device_data() = 0; virtual double* exx_k_device_data() = 0; + virtual double* fxc_z_device_data() = 0; + virtual double* fxc_s_device_data() = 0; + virtual double* fxc_y_device_data() = 0; + virtual double* fxc_x_device_data() = 0; virtual device_queue queue() = 0; diff --git a/src/xc_integrator/xc_data/device/xc_device_shell_pair_soa.hpp b/src/xc_integrator/xc_data/device/xc_device_shell_pair_soa.hpp index 76119933..3b979c8f 100644 --- a/src/xc_integrator/xc_data/device/xc_device_shell_pair_soa.hpp +++ b/src/xc_integrator/xc_data/device/xc_device_shell_pair_soa.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
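The new XCDeviceData virtuals sketch out the FXC-contraction data flow: allocate static buffers, ship the ground-state and trial densities, zero the accumulators, run the task batches, and retrieve the contracted matrices. A plausible UKS driver sequence inferred from those signatures is shown below; the real orchestration lives in the device integrator implementations and may differ:

// Plausible FXC-contraction driver sequence inferred from the new XCDeviceData
// virtuals (sketch only). UKS case: scalar (s) and z components, Py/Px omitted.
#include <cstdint>

template <typename DeviceData, typename TermTracker, typename Basis>
void drive_fxc_contraction( DeviceData& dev, TermTracker terms, const Basis& basis,
                            int32_t nbf, int32_t nshells,
                            const double* Ps, const double* Pz,
                            const double* tPs, const double* tPz,
                            double* FXCs, double* FXCz, double* N_EL ) {
  // 1. Static allocations for the FXC term (densities, trial densities, outputs)
  dev.allocate_static_data_fxc_contraction( nbf, nshells, terms );

  // 2. Ship ground-state and trial densities (UKS: Py/Px passed as nullptr)
  dev.send_static_data_density_basis( Ps, nbf, Pz, nbf, nullptr, 0, nullptr, 0, basis );
  dev.send_static_data_trial_density( tPs, nbf, tPz, nbf, nullptr, 0, nullptr, 0 );

  // 3. Zero the FXC accumulators before the task batches run
  dev.zero_fxc_contraction_integrands();

  // ... generate and execute device task batches here ...

  // 4. Pull the contracted FXC matrices and the electron count back to host
  dev.retrieve_fxc_contraction_integrands( N_EL, FXCs, nbf, FXCz, nbf,
                                           nullptr, 0, nullptr, 0 );
}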
* * See LICENSE.txt for details */ diff --git a/src/xc_integrator/xc_data/device/xc_device_stack_data.cxx b/src/xc_integrator/xc_data/device/xc_device_stack_data.cxx index 1aadd1ab..96ffb888 100644 --- a/src/xc_integrator/xc_data/device/xc_device_stack_data.cxx +++ b/src/xc_integrator/xc_data/device/xc_device_stack_data.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -41,6 +45,10 @@ double* XCDeviceStackData::vxc_x_device_data() { return static_stack.vxc_x_devic double* XCDeviceStackData::exc_device_data() { return static_stack.exc_device; } double* XCDeviceStackData::nel_device_data() { return static_stack.nel_device; } double* XCDeviceStackData::exx_k_device_data() { return static_stack.exx_k_device; } +double* XCDeviceStackData::fxc_s_device_data() { return static_stack.fxc_s_device; } +double* XCDeviceStackData::fxc_z_device_data() { return static_stack.fxc_z_device; } +double* XCDeviceStackData::fxc_y_device_data() { return static_stack.fxc_y_device; } +double* XCDeviceStackData::fxc_x_device_data() { return static_stack.fxc_x_device; } device_queue XCDeviceStackData::queue() { if( not device_backend_ ) GAUXC_GENERIC_EXCEPTION("Invalid Device Backend"); @@ -130,6 +138,51 @@ void XCDeviceStackData::allocate_static_data_exc_vxc( int32_t nbf, int32_t nshel allocated_terms.exc_vxc = true; } + +void XCDeviceStackData::allocate_static_data_fxc_contraction( int32_t nbf, int32_t nshells, integrator_term_tracker enabled_terms ) { + + if( allocated_terms.fxc_contraction ) + GAUXC_GENERIC_EXCEPTION("Attempting to reallocate Stack FXC Contraction"); + if( enabled_terms.ks_scheme == _UNDEF_SCHEME ) + GAUXC_GENERIC_EXCEPTION("Must have a KS Scheme set to allocate Stack EXC VXC"); + + // Save state + global_dims.nshells = nshells; + global_dims.nbf = nbf; + + // Allocate static memory with proper alignment + buffer_adaptor mem( dynmem_ptr, dynmem_sz ); + + static_stack.shells_device = mem.aligned_alloc>( nshells , csl); + static_stack.nel_device = mem.aligned_alloc( 1 , csl); + static_stack.acc_scr_device = mem.aligned_alloc( 1 , csl); + static_stack.dmat_s_device = mem.aligned_alloc( nbf * nbf , csl ); + static_stack.tdmat_s_device = mem.aligned_alloc( nbf * nbf , csl ); + static_stack.fxc_s_device = mem.aligned_alloc( nbf * nbf , csl ); + + allocated_terms.ks_scheme = enabled_terms.ks_scheme; + if( not (allocated_terms.ks_scheme == RKS) ) { + static_stack.dmat_z_device = mem.aligned_alloc( nbf * nbf , csl ); + static_stack.tdmat_z_device = mem.aligned_alloc( nbf * nbf , csl ); + static_stack.fxc_z_device = mem.aligned_alloc( nbf * nbf , csl ); + if( allocated_terms.ks_scheme == GKS ) { + static_stack.dmat_y_device = mem.aligned_alloc( nbf * nbf , csl ); + static_stack.dmat_x_device = mem.aligned_alloc( nbf * nbf , csl ); + static_stack.tdmat_y_device = mem.aligned_alloc( nbf * nbf , csl ); + static_stack.tdmat_x_device = mem.aligned_alloc( nbf * nbf , csl ); + static_stack.fxc_y_device = mem.aligned_alloc( nbf * nbf , csl ); + static_stack.fxc_x_device = mem.aligned_alloc( nbf * nbf , csl ); + } + } + + // Get current stack location + dynmem_ptr = mem.stack(); + dynmem_sz = mem.nleft(); + + + allocated_terms.fxc_contraction = 
true; +} + void XCDeviceStackData::allocate_static_data_den( int32_t nbf, int32_t nshells ) { if( allocated_terms.den ) @@ -155,7 +208,7 @@ void XCDeviceStackData::allocate_static_data_den( int32_t nbf, int32_t nshells ) allocated_terms.den = true; } -void XCDeviceStackData::allocate_static_data_exc_grad( int32_t nbf, int32_t nshells, int32_t natoms ) { +void XCDeviceStackData::allocate_static_data_exc_grad( int32_t nbf, int32_t nshells, int32_t natoms, integrator_term_tracker enabled_terms ) { if( allocated_terms.exc_grad ) GAUXC_GENERIC_EXCEPTION("Attempting to reallocate Stack EXC GRAD"); @@ -173,7 +226,15 @@ void XCDeviceStackData::allocate_static_data_exc_grad( int32_t nbf, int32_t nshe static_stack.nel_device = mem.aligned_alloc( 1 , csl); static_stack.acc_scr_device = mem.aligned_alloc( 1 , csl); - static_stack.dmat_s_device = mem.aligned_alloc( nbf * nbf , csl); + allocated_terms.ks_scheme = enabled_terms.ks_scheme; + static_stack.dmat_s_device = mem.aligned_alloc( nbf * nbf , csl ); + if( not (allocated_terms.ks_scheme == RKS) ) { + static_stack.dmat_z_device = mem.aligned_alloc( nbf * nbf , csl ); + if( allocated_terms.ks_scheme == GKS ) { + static_stack.dmat_y_device = mem.aligned_alloc( nbf * nbf , csl ); + static_stack.dmat_x_device = mem.aligned_alloc( nbf * nbf , csl ); + } + } // Get current stack location dynmem_ptr = mem.stack(); @@ -294,7 +355,7 @@ void XCDeviceStackData::send_static_data_density_basis( const double* Ps, int32_ if( not is_rks and not is_uks and not is_gks ) GAUXC_GENERIC_EXCEPTION("Densities do not match RKS, UKS, or GKS schemes"); - if( not (allocated_terms.exx or allocated_terms.exc_vxc or allocated_terms.exc_grad or allocated_terms.den or allocated_terms.exx_ek_screening) ) + if( not (allocated_terms.exx or allocated_terms.exc_vxc or allocated_terms.exc_grad or allocated_terms.den or allocated_terms.exx_ek_screening or allocated_terms.fxc_contraction ) ) GAUXC_GENERIC_EXCEPTION("Density/Basis Not Stack Allocated"); if( not device_backend_ ) GAUXC_GENERIC_EXCEPTION("Invalid Device Backend"); @@ -323,6 +384,40 @@ void XCDeviceStackData::send_static_data_density_basis( const double* Ps, int32_ } +void XCDeviceStackData::send_static_data_trial_density( + const double* tPs, int32_t ldtps, const double* tPz, int32_t ldtpz, + const double* tPy, int32_t ldtpy, const double* tPx, int32_t ldtpx ) { + + const bool is_gks = (tPz != nullptr) && (tPy != nullptr) && (tPx != nullptr); + const bool is_uks = (tPz != nullptr) && (tPy == nullptr) && (tPx == nullptr); + const bool is_rks = (tPs != nullptr) && (not is_uks and not is_gks); + if( not is_rks and not is_uks and not is_gks ) + GAUXC_GENERIC_EXCEPTION("Trial densities do not match RKS, UKS, or GKS schemes"); + + if( not allocated_terms.fxc_contraction ) + GAUXC_GENERIC_EXCEPTION("Trial Density Not Stack Allocated"); + + if( not device_backend_ ) GAUXC_GENERIC_EXCEPTION("Invalid Device Backend"); + + const auto nbf = global_dims.nbf; + // Check dimensions and copy density + if( ldtps != (int)nbf ) GAUXC_GENERIC_EXCEPTION("LDTps must bf NBF"); + device_backend_->copy_async( nbf*nbf, tPs, static_stack.tdmat_s_device, "tP_scalar H2D" ); + if( not is_rks ) { + if( ldtpz != (int)nbf ) GAUXC_GENERIC_EXCEPTION("LDTpz must bf NBF"); + device_backend_->copy_async( nbf*nbf, tPz, static_stack.tdmat_z_device, "tP_z H2D" ); + if( is_gks ) { + if( ldtpy != (int)nbf ) GAUXC_GENERIC_EXCEPTION("LDTpy must bf NBF"); + if( ldtpx != (int)nbf ) GAUXC_GENERIC_EXCEPTION("LDTpx must bf NBF"); + device_backend_->copy_async( nbf*nbf, tPy, 
static_stack.tdmat_y_device, "tP_y H2D" ); + device_backend_->copy_async( nbf*nbf, tPx, static_stack.tdmat_x_device, "tP_x H2D" ); + } + } + + device_backend_->master_queue_synchronize(); +} + + void XCDeviceStackData::send_static_data_shell_pairs( const BasisSet& basis, const ShellPairCollection& shell_pairs ) { @@ -475,6 +570,19 @@ void XCDeviceStackData::zero_exc_vxc_integrands(integrator_term_tracker enabled_ } +void XCDeviceStackData::zero_fxc_contraction_integrands() { + + if( not device_backend_ ) GAUXC_GENERIC_EXCEPTION("Invalid Device Backend"); + + const auto nbf = global_dims.nbf; + if(static_stack.fxc_s_device) device_backend_->set_zero( nbf*nbf, static_stack.fxc_s_device, "FXCs Zero" ); + if(static_stack.fxc_z_device) device_backend_->set_zero( nbf*nbf, static_stack.fxc_z_device, "FXCz Zero" ); + if(static_stack.fxc_y_device) device_backend_->set_zero( nbf*nbf, static_stack.fxc_y_device, "FXCy Zero" ); + if(static_stack.fxc_x_device) device_backend_->set_zero( nbf*nbf, static_stack.fxc_x_device, "FXCx Zero" ); + device_backend_->set_zero( 1, static_stack.nel_device, "NEL Zero" ); + +} + void XCDeviceStackData::zero_exc_grad_integrands() { if( not device_backend_ ) GAUXC_GENERIC_EXCEPTION("Invalid Device Backend"); @@ -533,6 +641,31 @@ void XCDeviceStackData::retrieve_exc_vxc_integrands( double* EXC, double* N_EL, } +void XCDeviceStackData::retrieve_fxc_contraction_integrands( double* N_EL, + double* FXCs, int32_t ldfxcs, double* FXCz, int32_t ldfxcz, + double* FXCy, int32_t ldfxcy, double* FXCx, int32_t ldfxcx ) { + + const auto nbf = global_dims.nbf; + device_backend_->copy_async( 1, static_stack.nel_device, N_EL, "NEL D2H" ); + + if( ldfxcs and (ldfxcs != (int)nbf) ) GAUXC_GENERIC_EXCEPTION("LDFXCs must be NBF"); + if( FXCs ) + device_backend_->copy_async( nbf*nbf, static_stack.fxc_s_device, FXCs, "FXCs D2H" ); + + if( ldfxcz and (ldfxcz != (int)nbf) ) GAUXC_GENERIC_EXCEPTION("LDFXCz must be NBF"); + if( FXCz ) + device_backend_->copy_async( nbf*nbf, static_stack.fxc_z_device, FXCz, "FXCz D2H" ); + + if( ldfxcy and (ldfxcy != (int)nbf) ) GAUXC_GENERIC_EXCEPTION("LDFXCy must be NBF"); + if( FXCy ) + device_backend_->copy_async( nbf*nbf, static_stack.fxc_y_device, FXCy, "FXCy D2H" ); + + if( ldfxcx and (ldfxcx != (int)nbf) ) GAUXC_GENERIC_EXCEPTION("LDFXCx must be NBF"); + if( FXCx ) + device_backend_->copy_async( nbf*nbf, static_stack.fxc_x_device, FXCx, "FXCx D2H" ); + +} + void XCDeviceStackData::retrieve_den_integrands( double* N_EL ) { if( not device_backend_ ) GAUXC_GENERIC_EXCEPTION("Invalid Device Backend"); @@ -639,7 +772,7 @@ size_t XCDeviceStackData::get_mem_req( // U Variables reqt.grid_den_size(npts) * sizeof(double) + reqt.grid_den_grad_size(npts) * sizeof(double) + - reqt.grid_den_lapl_size(npts) * sizeof(double) + + reqt.grid_lapl_size(npts) * sizeof(double) + // H/K Matrices (GKS) reqt.grid_HK_size(npts) * sizeof(double) + @@ -655,6 +788,29 @@ size_t XCDeviceStackData::get_mem_req( reqt.grid_vtau_size(npts) * sizeof(double) + reqt.grid_vlapl_size(npts) * sizeof(double) ; + // second derivatives + mem_req += + // U variables + reqt.grid_tden_size(npts) * sizeof(double) + + reqt.grid_tden_grad_size(npts) * sizeof(double) + + reqt.grid_tlapl_size(npts) * sizeof(double) + + reqt.grid_ttau_size(npts) * sizeof(double) + + // XC output + reqt.grid_v2rho2_size(npts) * sizeof(double) + + reqt.grid_v2rhogamma_size(npts) * sizeof(double) + + reqt.grid_v2rholapl_size(npts) * sizeof(double) + + reqt.grid_v2rhotau_size(npts) * sizeof(double) + + 
reqt.grid_v2gamma2_size(npts) * sizeof(double) + + reqt.grid_v2gammalapl_size(npts) * sizeof(double) + + reqt.grid_v2gammatau_size(npts) * sizeof(double) + + reqt.grid_v2lapl2_size(npts) * sizeof(double) + + reqt.grid_v2lapltau_size(npts) * sizeof(double) + + reqt.grid_v2tau2_size(npts) * sizeof(double) + + // intermediate output + reqt.grid_FXC_A_size(npts) * sizeof(double) + + reqt.grid_FXC_B_size(npts) * sizeof(double) + + reqt.grid_FXC_C_size(npts) * sizeof(double); + return mem_req; } @@ -708,60 +864,85 @@ XCDeviceStackData::device_buffer_t XCDeviceStackData::allocate_dynamic_stack( // Grid function evaluations if( reqt.grid_den ) { // Density - base_stack.den_s_eval_device = mem.aligned_alloc(msz, aln, csl); - if( is_pol ) { base_stack.den_eval_device = mem.aligned_alloc(2*msz, aln, csl); - base_stack.den_z_eval_device = mem.aligned_alloc(msz, aln, csl); - if( is_gks ){ base_stack.den_y_eval_device = mem.aligned_alloc(msz, aln, csl); - base_stack.den_x_eval_device = mem.aligned_alloc(msz, aln, csl); } + base_stack.den_s_eval_device = mem.aligned_alloc(msz, aln, csl); + + if(is_pol) { + base_stack.den_interleaved_device = mem.aligned_alloc(2*msz, aln, csl); + base_stack.den_z_eval_device = mem.aligned_alloc(msz, aln, csl); + } + + if(is_gks){ + base_stack.den_y_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.den_x_eval_device = mem.aligned_alloc(msz, aln, csl); } } if( reqt.grid_den_grad ) { // Density gradient - base_stack.dden_sx_eval_device = mem.aligned_alloc(msz, aln, csl); - base_stack.dden_sy_eval_device = mem.aligned_alloc(msz, aln, csl); - base_stack.dden_sz_eval_device = mem.aligned_alloc(msz, aln, csl); - - if( is_pol ) { base_stack.dden_zx_eval_device = mem.aligned_alloc(msz, aln, csl); - base_stack.dden_zy_eval_device = mem.aligned_alloc(msz, aln, csl); - base_stack.dden_zz_eval_device = mem.aligned_alloc(msz, aln, csl); - if( is_gks ) { base_stack.dden_yx_eval_device = mem.aligned_alloc(msz, aln, csl); - base_stack.dden_yy_eval_device = mem.aligned_alloc(msz, aln, csl); - base_stack.dden_yz_eval_device = mem.aligned_alloc(msz, aln, csl); - base_stack.dden_xx_eval_device = mem.aligned_alloc(msz, aln, csl); - base_stack.dden_xy_eval_device = mem.aligned_alloc(msz, aln, csl); - base_stack.dden_xz_eval_device = mem.aligned_alloc(msz, aln, csl); } + base_stack.dden_sx_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.dden_sy_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.dden_sz_eval_device = mem.aligned_alloc(msz, aln, csl); + + if(is_pol) { + base_stack.dden_zx_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.dden_zy_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.dden_zz_eval_device = mem.aligned_alloc(msz, aln, csl); + } + if( is_gks ) { + base_stack.dden_yx_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.dden_yy_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.dden_yz_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.dden_xx_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.dden_xy_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.dden_xz_eval_device = mem.aligned_alloc(msz, aln, csl); } } - if( reqt.grid_den_lapl ) { // Density Laplacian - base_stack.den_lapl_eval_device = mem.aligned_alloc(msz, aln, csl); + if( reqt.grid_tau ) { // Tau + base_stack.tau_s_eval_device = mem.aligned_alloc(msz, aln, csl); + if(is_pol) { + base_stack.tau_interleaved_device = mem.aligned_alloc(2*msz, aln, csl); + base_stack.tau_z_eval_device = mem.aligned_alloc(msz, aln, 
csl); + } + } + + if( reqt.grid_lapl ) { // Density Laplacian + base_stack.lapl_s_eval_device = mem.aligned_alloc(msz, aln, csl); + if(is_pol) { + base_stack.lapl_interleaved_device = mem.aligned_alloc(2*msz, aln, csl); + base_stack.lapl_z_eval_device = mem.aligned_alloc(msz, aln, csl); + } } if( reqt.grid_gamma ) { // Gamma - if( is_pol ) { base_stack.gamma_eval_device = mem.aligned_alloc(3 * msz, aln, csl); - base_stack.gamma_pp_eval_device = mem.aligned_alloc(msz, aln, csl); - base_stack.gamma_pm_eval_device = mem.aligned_alloc(msz, aln, csl); - base_stack.gamma_mm_eval_device = mem.aligned_alloc(msz, aln, csl); } - else base_stack.gamma_eval_device = mem.aligned_alloc(msz, aln, csl); + if( is_pol ) { + base_stack.gamma_eval_device = mem.aligned_alloc(3 * msz, aln, csl); + base_stack.gamma_pp_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.gamma_pm_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.gamma_mm_eval_device = mem.aligned_alloc(msz, aln, csl); + } else { + base_stack.gamma_eval_device = mem.aligned_alloc(msz, aln, csl); + } } if( reqt.grid_vrho ) { // Vrho - if( is_pol ) { base_stack.vrho_eval_device = mem.aligned_alloc(2 * msz, aln, csl); - base_stack.vrho_pos_eval_device = mem.aligned_alloc(msz, aln, csl); - base_stack.vrho_neg_eval_device = mem.aligned_alloc(msz, aln, csl); } - else base_stack.vrho_eval_device = mem.aligned_alloc(msz, aln, csl); + if( is_pol ) { + base_stack.vrho_eval_device = mem.aligned_alloc(2 * msz, aln, csl); + base_stack.vrho_pos_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.vrho_neg_eval_device = mem.aligned_alloc(msz, aln, csl); + } else { + base_stack.vrho_eval_device = mem.aligned_alloc(msz, aln, csl); + } } if( reqt.grid_vgamma ) { // Vgamma - if( is_pol ) { base_stack.vgamma_eval_device = mem.aligned_alloc(3*msz, aln, csl); - base_stack.vgamma_pp_eval_device = mem.aligned_alloc(msz, aln, csl); - base_stack.vgamma_pm_eval_device = mem.aligned_alloc(msz, aln, csl); - base_stack.vgamma_mm_eval_device = mem.aligned_alloc(msz, aln, csl); } - else base_stack.vgamma_eval_device = mem.aligned_alloc(msz, aln, csl); - } - - if( reqt.grid_tau ) { // Tau - base_stack.tau_eval_device = mem.aligned_alloc(msz, aln, csl); + if( is_pol ) { + base_stack.vgamma_eval_device = mem.aligned_alloc(3*msz, aln, csl); + base_stack.vgamma_pp_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.vgamma_pm_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.vgamma_mm_eval_device = mem.aligned_alloc(msz, aln, csl); + } else { + base_stack.vgamma_eval_device = mem.aligned_alloc(msz, aln, csl); + } } if( is_gks ) { // H, K matrices @@ -780,11 +961,224 @@ XCDeviceStackData::device_buffer_t XCDeviceStackData::allocate_dynamic_stack( } if( reqt.grid_vtau ) { // Vtau - base_stack.vtau_eval_device = mem.aligned_alloc(msz, aln, csl); + if( is_pol ) { + base_stack.vtau_eval_device = mem.aligned_alloc(2 * msz, aln, csl); + base_stack.vtau_pos_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.vtau_neg_eval_device = mem.aligned_alloc(msz, aln, csl); + } else { + base_stack.vtau_eval_device = mem.aligned_alloc(msz, aln, csl); + } } if( reqt.grid_vlapl ) { // Vlapl - base_stack.vlapl_eval_device = mem.aligned_alloc(msz, aln, csl); + if( is_pol ) { + base_stack.vlapl_eval_device = mem.aligned_alloc(2 * msz, aln, csl); + base_stack.vlapl_pos_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.vlapl_neg_eval_device = mem.aligned_alloc(msz, aln, csl); + } else { + base_stack.vlapl_eval_device = mem.aligned_alloc(msz, 
aln, csl); + } + } + + if( terms.fxc_contraction ) { + // Trial density evaluation + if( reqt.grid_tden ) { + base_stack.tden_s_eval_device = mem.aligned_alloc(msz, aln, csl); + if(is_pol) { + base_stack.tden_z_eval_device = mem.aligned_alloc(msz, aln, csl); + } + if(is_gks){ + base_stack.tden_y_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.tden_x_eval_device = mem.aligned_alloc(msz, aln, csl); + } + } + + // Trial density gradient + if( reqt.grid_tden_grad ) { + base_stack.tdden_sx_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.tdden_sy_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.tdden_sz_eval_device = mem.aligned_alloc(msz, aln, csl); + + if(is_pol) { + base_stack.tdden_zx_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.tdden_zy_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.tdden_zz_eval_device = mem.aligned_alloc(msz, aln, csl); + } + if( is_gks ) { + base_stack.tdden_yx_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.tdden_yy_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.tdden_yz_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.tdden_xx_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.tdden_xy_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.tdden_xz_eval_device = mem.aligned_alloc(msz, aln, csl); + } + } + + // Trial tau + if( reqt.grid_ttau ) { + base_stack.ttau_s_eval_device = mem.aligned_alloc(msz, aln, csl); + if(is_pol) { + base_stack.ttau_z_eval_device = mem.aligned_alloc(msz, aln, csl); + } + } + + // Trial laplacian + if( reqt.grid_tlapl ) { + base_stack.tlapl_s_eval_device = mem.aligned_alloc(msz, aln, csl); + if(is_pol) { + base_stack.tlapl_z_eval_device = mem.aligned_alloc(msz, aln, csl); + } + } + + // Second derivatives of XC functional + if( reqt.grid_v2rho2 ) { + if( is_pol ) { + base_stack.v2rho2_eval_device = mem.aligned_alloc(3 * msz, aln, csl); + base_stack.v2rho2_a_a_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2rho2_a_b_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2rho2_b_b_eval_device = mem.aligned_alloc(msz, aln, csl); + } else { + base_stack.v2rho2_eval_device = mem.aligned_alloc(msz, aln, csl); + } + } + + if( reqt.grid_v2rhogamma ) { + if( is_pol ) { + base_stack.v2rhogamma_eval_device = mem.aligned_alloc(6 * msz, aln, csl); + base_stack.v2rhogamma_a_aa_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2rhogamma_a_ab_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2rhogamma_a_bb_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2rhogamma_b_aa_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2rhogamma_b_ab_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2rhogamma_b_bb_eval_device = mem.aligned_alloc(msz, aln, csl); + } else { + base_stack.v2rhogamma_eval_device = mem.aligned_alloc(msz, aln, csl); + } + } + + if( reqt.grid_v2rholapl ) { + if( is_pol ) { + base_stack.v2rholapl_eval_device = mem.aligned_alloc(4 * msz, aln, csl); + base_stack.v2rholapl_a_a_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2rholapl_a_b_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2rholapl_b_a_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2rholapl_b_b_eval_device = mem.aligned_alloc(msz, aln, csl); + } else { + base_stack.v2rholapl_eval_device = mem.aligned_alloc(msz, aln, csl); + } + } + + if( reqt.grid_v2rhotau ) { + if( is_pol ) { + base_stack.v2rhotau_eval_device = mem.aligned_alloc(4 * msz, aln, csl); 
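// [Editorial aside -- illustrative sketch, not part of the patch] The polarized
// branches in this section size each interleaved buffer by the number of
// second-derivative components per grid point (3 for v2rho2, 6 for v2rhogamma and
// v2gamma2, 4 for the rho-lapl/rho-tau/lapl-tau cross terms, 6 for the
// gamma-lapl/gamma-tau cross terms, 3 for v2lapl2 and v2tau2), matching the
// libxc/ExchCXX component ordering. Assuming every term were requested, the
// per-point scratch this implies is roughly:
//
//   constexpr std::size_t n_pol_v2_components =
//     3 /*v2rho2*/ + 6 /*v2rhogamma*/ + 4 /*v2rholapl*/ + 4 /*v2rhotau*/ +
//     6 /*v2gamma2*/ + 6 /*v2gammalapl*/ + 6 /*v2gammatau*/ +
//     3 /*v2lapl2*/ + 4 /*v2lapltau*/ + 3 /*v2tau2*/;              // = 45
//   // one interleaved buffer plus one scratch array per component,
//   // each sized like the per-task point buffers (msz above)
//   inline std::size_t pol_v2_bytes( std::size_t npts ) {
//     return 2 * n_pol_v2_components * npts * sizeof(double);
//   }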
+ base_stack.v2rhotau_a_a_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2rhotau_a_b_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2rhotau_b_a_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2rhotau_b_b_eval_device = mem.aligned_alloc(msz, aln, csl); + } else { + base_stack.v2rhotau_eval_device = mem.aligned_alloc(msz, aln, csl); + } + } + + if( reqt.grid_v2gamma2 ) { + if( is_pol ) { + base_stack.v2gamma2_eval_device = mem.aligned_alloc(6 * msz, aln, csl); + base_stack.v2gamma2_aa_aa_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2gamma2_aa_ab_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2gamma2_aa_bb_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2gamma2_ab_ab_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2gamma2_ab_bb_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2gamma2_bb_bb_eval_device = mem.aligned_alloc(msz, aln, csl); + } else { + base_stack.v2gamma2_eval_device = mem.aligned_alloc(msz, aln, csl); + } + } + + if( reqt.grid_v2gammalapl ) { + if( is_pol ) { + base_stack.v2gammalapl_eval_device = mem.aligned_alloc(6 * msz, aln, csl); + base_stack.v2gammalapl_aa_a_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2gammalapl_aa_b_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2gammalapl_ab_a_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2gammalapl_ab_b_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2gammalapl_bb_a_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2gammalapl_bb_b_eval_device = mem.aligned_alloc(msz, aln, csl); + } else { + base_stack.v2gammalapl_eval_device = mem.aligned_alloc(msz, aln, csl); + } + } + + if( reqt.grid_v2gammatau ) { + if( is_pol ) { + base_stack.v2gammatau_eval_device = mem.aligned_alloc(6 * msz, aln, csl); + base_stack.v2gammatau_aa_a_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2gammatau_aa_b_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2gammatau_ab_a_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2gammatau_ab_b_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2gammatau_bb_a_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2gammatau_bb_b_eval_device = mem.aligned_alloc(msz, aln, csl); + } else { + base_stack.v2gammatau_eval_device = mem.aligned_alloc(msz, aln, csl); + } + } + + if( reqt.grid_v2lapl2 ) { + if( is_pol ) { + base_stack.v2lapl2_eval_device = mem.aligned_alloc(3 * msz, aln, csl); + base_stack.v2lapl2_a_a_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2lapl2_a_b_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2lapl2_b_b_eval_device = mem.aligned_alloc(msz, aln, csl); + } else { + base_stack.v2lapl2_eval_device = mem.aligned_alloc(msz, aln, csl); + } + } + + if( reqt.grid_v2lapltau ) { + if( is_pol ) { + base_stack.v2lapltau_eval_device = mem.aligned_alloc(4 * msz, aln, csl); + base_stack.v2lapltau_a_a_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2lapltau_a_b_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2lapltau_b_a_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2lapltau_b_b_eval_device = mem.aligned_alloc(msz, aln, csl); + } else { + base_stack.v2lapltau_eval_device = mem.aligned_alloc(msz, aln, csl); + } + } + + if( reqt.grid_v2tau2 ) { + if( is_pol ) { + base_stack.v2tau2_eval_device = mem.aligned_alloc(3 * msz, aln, csl); + base_stack.v2tau2_a_a_eval_device = mem.aligned_alloc(msz, aln, csl); + 
base_stack.v2tau2_a_b_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.v2tau2_b_b_eval_device = mem.aligned_alloc(msz, aln, csl); + } else { + base_stack.v2tau2_eval_device = mem.aligned_alloc(msz, aln, csl); + } + } + + // Intermediate matrices for contraction + if( reqt.grid_FXC_A ) { + base_stack.FXC_A_s_eval_device = mem.aligned_alloc(msz, aln, csl); + if( is_pol ) + base_stack.FXC_A_z_eval_device = mem.aligned_alloc(msz, aln, csl); + } + + if( reqt.grid_FXC_B ) { + base_stack.FXC_Bx_s_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.FXC_By_s_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.FXC_Bz_s_eval_device = mem.aligned_alloc(msz, aln, csl); + if( is_pol ) { + base_stack.FXC_Bx_z_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.FXC_By_z_eval_device = mem.aligned_alloc(msz, aln, csl); + base_stack.FXC_Bz_z_eval_device = mem.aligned_alloc(msz, aln, csl); + } + } + + if( reqt.grid_FXC_C ) { + base_stack.FXC_C_s_eval_device = mem.aligned_alloc(msz, aln, csl); + if( is_pol ) + base_stack.FXC_C_z_eval_device = mem.aligned_alloc(msz, aln, csl); + } } diff --git a/src/xc_integrator/xc_data/device/xc_device_stack_data.hpp b/src/xc_integrator/xc_data/device/xc_device_stack_data.hpp index e1a72ec4..cf5399a8 100644 --- a/src/xc_integrator/xc_data/device/xc_device_stack_data.hpp +++ b/src/xc_integrator/xc_data/device/xc_device_stack_data.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -55,6 +59,7 @@ struct XCDeviceStackData : public XCDeviceData { double* exx_k_device = nullptr; ///< EXX K storage (nbf,nbf) double* acc_scr_device = nullptr; ///< Accumulaion scratch (1) double* exc_grad_device = nullptr; ///< EXC Gradient storage (3*natoms) + double* fxc_device = nullptr; ///< FXC contraction storage (nbf,nbf) double* vshell_max_sparse_device = nullptr; size_t* shpair_row_ind_device = nullptr; @@ -72,8 +77,63 @@ struct XCDeviceStackData : public XCDeviceData { double* vxc_z_device = nullptr; /// Ditto for Z,Y,X densities double* vxc_y_device = nullptr; double* vxc_x_device = nullptr; + + // Second derivatives + double* tdmat_s_device = nullptr; ///< Static trial density matrix storage (nbf,nbf) + double* tdmat_z_device = nullptr; /// Ditto for Z,Y,X trial densities + double* tdmat_y_device = nullptr; + double* tdmat_x_device = nullptr; + double* fxc_s_device = nullptr; ///< FXC storage (nbf, nbf) + double* fxc_z_device = nullptr; /// Ditto for Z,Y,X densities + double* fxc_y_device = nullptr; + double* fxc_x_device = nullptr; inline void reset() { std::memset( this, 0, sizeof(static_data) ); } + + inline double* den_selector(density_id den) { + switch(den) { + case DEN_S: return dmat_s_device; + case DEN_Z: return dmat_z_device; + case DEN_Y: return dmat_y_device; + case DEN_X: return dmat_x_device; + default: GAUXC_GENERIC_EXCEPTION("den_selector: density_id not recognized"); + } + return nullptr; + } + + inline double* vxc_selector(density_id den) { + switch(den) { + case DEN_S: return vxc_s_device; + case DEN_Z: return vxc_z_device; + case DEN_Y: return vxc_y_device; + case DEN_X: return vxc_x_device; + default: GAUXC_GENERIC_EXCEPTION("vxc_selector: density_id not recognized"); + } + return nullptr; + } + + inline double* tden_selector(density_id den) { + switch(den) { + case DEN_S: return tdmat_s_device; + case DEN_Z: return tdmat_z_device; + case DEN_Y: return tdmat_y_device; + case DEN_X: return tdmat_x_device; + default: GAUXC_GENERIC_EXCEPTION("tden_selector: density_id not recognized"); + } + return nullptr; + } + + inline double* fxc_selector(density_id den) { + switch(den) { + case DEN_S: return fxc_s_device; + case DEN_Z: return fxc_z_device; + case DEN_Y: return fxc_y_device; + case DEN_X: return fxc_x_device; + default: GAUXC_GENERIC_EXCEPTION("fxc_selector: density_id not recognized"); + } + return nullptr; + } + }; XCDeviceShellPairSoA shell_pair_soa; @@ -92,15 +152,19 @@ struct XCDeviceStackData : public XCDeviceData { double* weights_device = nullptr; ///< Grid weights for task batch // U variables - double* den_s_eval_device = nullptr; ///< scalar density for task batch - double* dden_sx_eval_device = nullptr; ///< d/dx scalar density for task batch - double* dden_sy_eval_device = nullptr; ///< d/dy scalar density for task batch - double* dden_sz_eval_device = nullptr; ///< d/dz scalar density for task batch + double* den_s_eval_device = nullptr; ///< scalar density for task batch + double* dden_sx_eval_device = nullptr; ///< d/dx scalar density for task batch + double* dden_sy_eval_device = nullptr; ///< d/dy scalar density for task batch + double* dden_sz_eval_device = nullptr; ///< d/dz scalar density for task batch + double* tau_s_eval_device = nullptr; ///< scalar tau for task batch + double* lapl_s_eval_device = nullptr; ///< scalar density laplacian for task batch - double* den_z_eval_device = nullptr; ///< z density for task batch - double* dden_zx_eval_device = nullptr; ///< d/dx z density for task batch 
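// [Editorial aside -- illustrative sketch, not part of the patch] The
// den/vxc/tden/fxc *_selector helpers defined above map a density_id onto the
// matching device matrix so call sites do not have to repeat the switch. A hedged
// usage sketch (static_stack is the static_data instance used elsewhere in this
// class; tP_host is a hypothetical host pointer):
//
//   double* tP_dev  = static_stack.tden_selector(DEN_S); // trial density, scalar part
//   double* fxc_dev = static_stack.fxc_selector(DEN_Z);  // FXC accumulator, z part
//   device_backend_->copy_async( nbf*nbf, tP_host, tP_dev, "tP_s H2D" );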
- double* dden_zy_eval_device = nullptr; ///< d/dy z density for task batch - double* dden_zz_eval_device = nullptr; ///< d/dz z density for task batch + double* den_z_eval_device = nullptr; ///< z density for task batch + double* dden_zx_eval_device = nullptr; ///< d/dx z density for task batch + double* dden_zy_eval_device = nullptr; ///< d/dy z density for task batch + double* dden_zz_eval_device = nullptr; ///< d/dz z density for task batch + double* tau_z_eval_device = nullptr; ///< z tau for task batch + double* lapl_z_eval_device = nullptr; ///< z density laplacian for task batch double* den_y_eval_device = nullptr; ///< y density for task batch double* dden_yx_eval_device = nullptr; ///< d/dx y density for task batch @@ -112,21 +176,25 @@ struct XCDeviceStackData : public XCDeviceData { double* dden_xy_eval_device = nullptr; ///< d/dy x density for task batch double* dden_xz_eval_device = nullptr; ///< d/dz x density for task batch - double* den_eval_device = nullptr; /// Storage for interleaved density (non-RKS only) - - double* den_lapl_eval_device = nullptr; ///< density Laplacian for task batch + double* den_interleaved_device = nullptr; /// Storage for interleaved density (non-RKS only) + double* tau_interleaved_device = nullptr; /// Storage for interleaved tau (non-RKS only) + double* lapl_interleaved_device = nullptr; /// Storage for interleaved lapl (non-RKS only) // V variables / XC output double* gamma_eval_device = nullptr; ///< gamma for task batch - double* tau_eval_device = nullptr; ///< tau for task batch double* eps_eval_device = nullptr; ///< XC energy density for task batch double* vrho_eval_device = nullptr; ///< Rho XC derivative for task batch double* vgamma_eval_device = nullptr; ///< Gamma XC derivative for task batch double* vtau_eval_device = nullptr; ///< Tau XC derivative for task batch double* vlapl_eval_device = nullptr; ///< Lapl XC derivative for task batch - double* vrho_pos_eval_device = nullptr; ///< Polarized Rho+ XC derivative for task batch - double* vrho_neg_eval_device = nullptr; ///< Polarized Rho+ XC derivative for task batch + double* vrho_pos_eval_device = nullptr; ///< Polarized Rho+ XC derivative for task batch + double* vrho_neg_eval_device = nullptr; ///< Polarized Rho+ XC derivative for task batch + double* vtau_pos_eval_device = nullptr; + double* vtau_neg_eval_device = nullptr; + double* vlapl_pos_eval_device = nullptr; + double* vlapl_neg_eval_device = nullptr; + double* gamma_pp_eval_device = nullptr; ///< Polarized Gamma++ for task batch double* gamma_pm_eval_device = nullptr; ///< Polarized Gamma+- for task batch @@ -142,6 +210,101 @@ struct XCDeviceStackData : public XCDeviceData { double* K_y_eval_device = nullptr; ///< norm(m) dependent LDA Y transformation factor for task batch double* K_z_eval_device = nullptr; ///< norm(m) dependent LDA Z transformation factor for task batch + // Second derivative intermediates - Trial variables (T) + double* tden_s_eval_device = nullptr; ///< scalar trial density for task batch + double* tdden_sx_eval_device = nullptr; ///< d/dx scalar trial density for task batch + double* tdden_sy_eval_device = nullptr; ///< d/dy scalar trial density for task batch + double* tdden_sz_eval_device = nullptr; ///< d/dz scalar trial density for task batch + double* ttau_s_eval_device = nullptr; ///< scalar trial tau for task batch + double* tlapl_s_eval_device = nullptr; ///< scalar trial density laplacian for task batch + + double* tden_z_eval_device = nullptr; ///< z trial density for task batch + double* 
tdden_zx_eval_device = nullptr; ///< d/dx z trial density for task batch + double* tdden_zy_eval_device = nullptr; ///< d/dy z trial density for task batch + double* tdden_zz_eval_device = nullptr; ///< d/dz z trial density for task batch + double* ttau_z_eval_device = nullptr; ///< z trial tau for task batch + double* tlapl_z_eval_device = nullptr; ///< z trial density laplacian for task batch + + double* tden_y_eval_device = nullptr; ///< y trial density for task batch + double* tdden_yx_eval_device = nullptr; ///< d/dx y trial density for task batch + double* tdden_yy_eval_device = nullptr; ///< d/dy y trial density for task batch + double* tdden_yz_eval_device = nullptr; ///< d/dz y trial density for task batch + + double* tden_x_eval_device = nullptr; ///< x trial density for task batch + double* tdden_xx_eval_device = nullptr; ///< d/dx x trial density for task batch + double* tdden_xy_eval_device = nullptr; ///< d/dy x trial density for task batch + double* tdden_xz_eval_device = nullptr; ///< d/dz x trial density for task batch + + // Second derivative kernel outputs (V2 variables) + double* v2rho2_eval_device = nullptr; ///< 2nd derivative of XC wrt rho^2 + double* v2rhogamma_eval_device = nullptr; ///< 2nd derivative of XC wrt rho-gamma + double* v2rholapl_eval_device = nullptr; ///< 2nd derivative of XC wrt rho-lapl + double* v2rhotau_eval_device = nullptr; ///< 2nd derivative of XC wrt rho-tau + double* v2gamma2_eval_device = nullptr; ///< 2nd derivative of XC wrt gamma^2 + double* v2gammalapl_eval_device = nullptr; ///< 2nd derivative of XC wrt gamma-lapl + double* v2gammatau_eval_device = nullptr; ///< 2nd derivative of XC wrt gamma-tau + double* v2lapl2_eval_device = nullptr; ///< 2nd derivative of XC wrt lapl^2 + double* v2lapltau_eval_device = nullptr; ///< 2nd derivative of XC wrt lapl-tau + double* v2tau2_eval_device = nullptr; ///< 2nd derivative of XC wrt tau^2 + // in unrestricted case, these are 2nd derivatives of XC with alpha (+) and beta (-) densities + double* v2rho2_a_a_eval_device = nullptr; + double* v2rho2_a_b_eval_device = nullptr; + double* v2rho2_b_b_eval_device = nullptr; + double* v2rhogamma_a_aa_eval_device = nullptr; + double* v2rhogamma_a_ab_eval_device = nullptr; + double* v2rhogamma_a_bb_eval_device = nullptr; + double* v2rhogamma_b_aa_eval_device = nullptr; + double* v2rhogamma_b_ab_eval_device = nullptr; + double* v2rhogamma_b_bb_eval_device = nullptr; + double* v2rholapl_a_a_eval_device = nullptr; + double* v2rholapl_a_b_eval_device = nullptr; + double* v2rholapl_b_a_eval_device = nullptr; + double* v2rholapl_b_b_eval_device = nullptr; + double* v2rhotau_a_a_eval_device = nullptr; + double* v2rhotau_a_b_eval_device = nullptr; + double* v2rhotau_b_a_eval_device = nullptr; + double* v2rhotau_b_b_eval_device = nullptr; + double* v2gamma2_aa_aa_eval_device = nullptr; + double* v2gamma2_aa_ab_eval_device = nullptr; + double* v2gamma2_aa_bb_eval_device = nullptr; + double* v2gamma2_ab_ab_eval_device = nullptr; + double* v2gamma2_ab_bb_eval_device = nullptr; + double* v2gamma2_bb_bb_eval_device = nullptr; + double* v2gammalapl_aa_a_eval_device = nullptr; + double* v2gammalapl_aa_b_eval_device = nullptr; + double* v2gammalapl_ab_a_eval_device = nullptr; + double* v2gammalapl_ab_b_eval_device = nullptr; + double* v2gammalapl_bb_a_eval_device = nullptr; + double* v2gammalapl_bb_b_eval_device = nullptr; + double* v2gammatau_aa_a_eval_device = nullptr; + double* v2gammatau_aa_b_eval_device = nullptr; + double* v2gammatau_ab_a_eval_device = nullptr; + double* 
v2gammatau_ab_b_eval_device = nullptr; + double* v2gammatau_bb_a_eval_device = nullptr; + double* v2gammatau_bb_b_eval_device = nullptr; + double* v2lapl2_a_a_eval_device = nullptr; + double* v2lapl2_a_b_eval_device = nullptr; + double* v2lapl2_b_b_eval_device = nullptr; + double* v2lapltau_a_a_eval_device = nullptr; + double* v2lapltau_a_b_eval_device = nullptr; + double* v2lapltau_b_a_eval_device = nullptr; + double* v2lapltau_b_b_eval_device = nullptr; + double* v2tau2_a_a_eval_device = nullptr; + double* v2tau2_a_b_eval_device = nullptr; + double* v2tau2_b_b_eval_device = nullptr; + + // Second derivative kernel outputs (A,B,C variables) + double* FXC_A_s_eval_device = nullptr; + double* FXC_Bx_s_eval_device = nullptr; + double* FXC_By_s_eval_device = nullptr; + double* FXC_Bz_s_eval_device = nullptr; + double* FXC_C_s_eval_device = nullptr; + double* FXC_A_z_eval_device = nullptr; + double* FXC_Bx_z_eval_device = nullptr; + double* FXC_By_z_eval_device = nullptr; + double* FXC_Bz_z_eval_device = nullptr; + double* FXC_C_z_eval_device = nullptr; + inline void reset() { std::memset( this, 0, sizeof(base_stack_data) ); } }; @@ -161,25 +324,33 @@ struct XCDeviceStackData : public XCDeviceData { host_task_iterator, host_task_iterator) override final; void allocate_static_data_weights( int32_t natoms ) override final; void allocate_static_data_exc_vxc( int32_t nbf, int32_t nshells, integrator_term_tracker enabled_terms, bool do_vxc ) override final; + void allocate_static_data_fxc_contraction( int32_t nbf, int32_t nshells, integrator_term_tracker enabled_terms ) override final; void allocate_static_data_den( int32_t nbf, int32_t nshells ) override final; - void allocate_static_data_exc_grad( int32_t nbf, int32_t nshells, int32_t natoms ) override final; + void allocate_static_data_exc_grad( int32_t nbf, int32_t nshells, int32_t natoms, integrator_term_tracker enabled_terms ) override final; void allocate_static_data_exx( int32_t nbf, int32_t nshells, size_t nshell_pairs, size_t nprim_pair_total, int32_t max_l ) override final; void allocate_static_data_exx_ek_screening( size_t ntasks, int32_t nbf, int32_t nshells, int nshell_pairs, int32_t max_l ) override final; void send_static_data_weights( const Molecule& mol, const MolMeta& meta ) override final; void send_static_data_density_basis( const double* Ps, int32_t ldps, const double* Pz, int32_t ldpz, const double* Py, int32_t ldpy, const double* Px, int32_t ldpx, const BasisSet& basis ) override final; + void send_static_data_trial_density( + const double* tPs, int32_t ldtps, const double* tPz, int32_t ldtpz, + const double* tPy, int32_t ldtpy, const double* tPx, int32_t ldtpx ) override final; void send_static_data_shell_pairs( const BasisSet&, const ShellPairCollection& ) override final; void send_static_data_exx_ek_screening( const double* V_max, int32_t ldv, const BasisSetMap&, const ShellPairCollection& ) override final; void zero_den_integrands() override final; void zero_exc_vxc_integrands(integrator_term_tracker t) override final; + void zero_fxc_contraction_integrands() override final; void zero_exc_grad_integrands() override final; void zero_exx_integrands() override final; void zero_exx_ek_screening_intermediates() override final; void retrieve_exc_vxc_integrands( double* EXC, double* N_EL, double* VXCscalar, int32_t ldvxcscalar, double* VXCz, int32_t ldvxcz, double* VXCy , int32_t ldvxcy , double* VXCx, int32_t ldvxcx ) override final; + void retrieve_fxc_contraction_integrands( double* N_EL, + double* FXCs, int32_t ldfxcs, 
double* FXCz, int32_t ldfxcz, + double* FXCy, int32_t ldfxcy, double* FXCx, int32_t ldfxcx ) override final; void retrieve_exc_grad_integrands( double* EXC_GRAD, double* N_EL ) override final; void retrieve_den_integrands( double* N_EL ) override final; void retrieve_exx_integrands( double* K, int32_t ldk ) override final; @@ -193,6 +364,10 @@ struct XCDeviceStackData : public XCDeviceData { double* exc_device_data() override; double* nel_device_data() override; double* exx_k_device_data() override; + double* fxc_s_device_data() override; + double* fxc_z_device_data() override; + double* fxc_y_device_data() override; + double* fxc_x_device_data() override; device_queue queue() override; diff --git a/src/xc_integrator/xc_data/device/xc_device_task.hpp b/src/xc_integrator/xc_data/device/xc_device_task.hpp index 696ef185..58ab323c 100644 --- a/src/xc_integrator/xc_data/device/xc_device_task.hpp +++ b/src/xc_integrator/xc_data/device/xc_device_task.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -52,16 +56,29 @@ struct XCDeviceTask { double* d2bfzz = nullptr; double* eps = nullptr; - double* den = nullptr; - double* gamma = nullptr; - double* vrho = nullptr; - double* vgamma = nullptr; + double* den = nullptr; + double* gamma = nullptr; + double* tau = nullptr; + double* lapl = nullptr; + double* vrho = nullptr; + double* vgamma = nullptr; + double* vtau = nullptr; + double* vlapl = nullptr; // (S,Z,Y,X) densities double* den_s = nullptr; double* den_z = nullptr; double* den_y = nullptr; double* den_x = nullptr; + double* tau_s = nullptr; + double* tau_z = nullptr; + double* tau_y = nullptr; + double* tau_x = nullptr; + double* lapl_s = nullptr; + double* lapl_z = nullptr; + double* lapl_y = nullptr; + double* lapl_x = nullptr; + // Del(S,Z,Y,X) Gradients double* dden_sx = nullptr; double* dden_sy = nullptr; @@ -85,6 +102,10 @@ struct XCDeviceTask { double* vgamma_pp = nullptr; double* vgamma_pm = nullptr; double* vgamma_mm = nullptr; + double* vtau_pos = nullptr; + double* vtau_neg = nullptr; + double* vlapl_pos = nullptr; + double* vlapl_neg = nullptr; // GKS K,H matrices double* K_z = nullptr; @@ -96,10 +117,121 @@ struct XCDeviceTask { // MGGA double* d2bflapl = nullptr; - double* denlapl = nullptr; - double* tau = nullptr; - double* vtau = nullptr; - double* vlapl = nullptr; + double* d3bflapl_x = nullptr; + double* d3bflapl_y = nullptr; + double* d3bflapl_z = nullptr; + + // Persistent X matrices for EXC gradients + double* xmatS = nullptr; + double* xmatS_x = nullptr; + double* xmatS_y = nullptr; + double* xmatS_z = nullptr; + double* xmatZ = nullptr; + double* xmatZ_x = nullptr; + double* xmatZ_y = nullptr; + double* xmatZ_z = nullptr; + + // Second derivatives - Trial density and derivatives + double* tden = nullptr; + double* ttau = nullptr; + double* tlapl = nullptr; + double* v2rho2 = nullptr; + double* v2rhogamma = nullptr; + double* v2rholapl = nullptr; + double* v2rhotau = nullptr; + double* v2gamma2 = nullptr; + double* v2gammalapl = nullptr; + double* v2gammatau = nullptr; + double* v2lapl2 = nullptr; + double* v2lapltau = nullptr; + double* v2tau2 = nullptr; + + // (S,Z,Y,X) trial densities + double* 
tden_s = nullptr; + double* tden_z = nullptr; + double* tden_y = nullptr; + double* tden_x = nullptr; + double* ttau_s = nullptr; + double* ttau_z = nullptr; + double* ttau_y = nullptr; + double* ttau_x = nullptr; + double* tlapl_s = nullptr; + double* tlapl_z = nullptr; + double* tlapl_y = nullptr; + double* tlapl_x = nullptr; + + // Del(S,Z,Y,X) trial density gradients + double* tdden_sx = nullptr; + double* tdden_sy = nullptr; + double* tdden_sz = nullptr; + double* tdden_zx = nullptr; + double* tdden_zy = nullptr; + double* tdden_zz = nullptr; + double* tdden_yx = nullptr; + double* tdden_yy = nullptr; + double* tdden_yz = nullptr; + double* tdden_xx = nullptr; + double* tdden_xy = nullptr; + double* tdden_xz = nullptr; + + //2C U variables for second derivatives + double* v2rho2_a_a = nullptr; + double* v2rho2_a_b = nullptr; + double* v2rho2_b_b = nullptr; + double* v2rhogamma_a_aa = nullptr; + double* v2rhogamma_a_ab = nullptr; + double* v2rhogamma_a_bb = nullptr; + double* v2rhogamma_b_aa = nullptr; + double* v2rhogamma_b_ab = nullptr; + double* v2rhogamma_b_bb = nullptr; + double* v2rholapl_a_a = nullptr; + double* v2rholapl_a_b = nullptr; + double* v2rholapl_b_a = nullptr; + double* v2rholapl_b_b = nullptr; + double* v2rhotau_a_a = nullptr; + double* v2rhotau_a_b = nullptr; + double* v2rhotau_b_a = nullptr; + double* v2rhotau_b_b = nullptr; + double* v2gamma2_aa_aa = nullptr; + double* v2gamma2_aa_ab = nullptr; + double* v2gamma2_aa_bb = nullptr; + double* v2gamma2_ab_ab = nullptr; + double* v2gamma2_ab_bb = nullptr; + double* v2gamma2_bb_bb = nullptr; + double* v2gammalapl_aa_a = nullptr; + double* v2gammalapl_aa_b = nullptr; + double* v2gammalapl_ab_a = nullptr; + double* v2gammalapl_ab_b = nullptr; + double* v2gammalapl_bb_a = nullptr; + double* v2gammalapl_bb_b = nullptr; + double* v2gammatau_aa_a = nullptr; + double* v2gammatau_aa_b = nullptr; + double* v2gammatau_ab_a = nullptr; + double* v2gammatau_ab_b = nullptr; + double* v2gammatau_bb_a = nullptr; + double* v2gammatau_bb_b = nullptr; + double* v2lapl2_a_a = nullptr; + double* v2lapl2_a_b = nullptr; + double* v2lapl2_b_b = nullptr; + double* v2lapltau_a_a = nullptr; + double* v2lapltau_a_b = nullptr; + double* v2lapltau_b_a = nullptr; + double* v2lapltau_b_b = nullptr; + double* v2tau2_a_a = nullptr; + double* v2tau2_a_b = nullptr; + double* v2tau2_b_b = nullptr; + + // Second derivatives intermediate output + double* FXC_A_s = nullptr; + double* FXC_Bx_s = nullptr; + double* FXC_By_s = nullptr; + double* FXC_Bz_s = nullptr; + double* FXC_C_s = nullptr; + double* FXC_A_z = nullptr; + double* FXC_Bx_z = nullptr; + double* FXC_By_z = nullptr; + double* FXC_Bz_z = nullptr; + double* FXC_C_z = nullptr; int32_t iParent = -1; double dist_nearest = 0.; diff --git a/tests/2nd_derivative_test.cxx b/tests/2nd_derivative_test.cxx new file mode 100644 index 00000000..eedcb27f --- /dev/null +++ b/tests/2nd_derivative_test.cxx @@ -0,0 +1,243 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
+ * + * See LICENSE.txt for details + */ +#include "ut_common.hpp" +#include +#include +#include +#include + +#include + +#include +#include +#include + +using namespace GauXC; + + +void test_fxc_contraction(ExecutionSpace ex, const RuntimeEnvironment& rt, + std::string reference_file, + functional_type& func, + PruningScheme pruning_scheme, + std::string integrator_kernel = "Default", + std::string reduction_kernel = "Default", + std::string lwd_kernel = "Default") { + + // Read the reference file + using matrix_type = Eigen::MatrixXd; + Molecule mol; + BasisSet<double> basis; + matrix_type P, Pz, tP, tPz, FXC_ref, FXCz_ref; + bool rks = true, uks = false; + + { + read_hdf5_record( mol, reference_file, "/MOLECULE" ); + read_hdf5_record( basis, reference_file, "/BASIS" ); + + HighFive::File file( reference_file, HighFive::File::ReadOnly ); + + std::string den = "/DENSITY"; + std::string tden_str = "/TRIAL_DENSITY"; + std::string fxc_str = "/FXC"; + std::string den2 = "/DENSITY_Z"; + + if (file.exist("/DENSITY_Z")) { + rks = false; + uks = true; + if (file.exist("/DENSITY_Y") && file.exist("/DENSITY_X")) { + std::cout << "FXC contraction for GKS is not supported yet. Skipping test." << std::endl; + return; + } + } + + if (uks) { + tden_str = "/TRIAL_DENSITY_SCALAR"; + den = "/DENSITY_SCALAR"; + fxc_str = "/FXC_SCALAR"; + } + + auto dset = file.getDataSet(den); + auto dims = dset.getDimensions(); + + P = matrix_type(dims[0], dims[1]); + dset.read(P.data()); + + if (not rks) { + Pz = matrix_type(dims[0], dims[1]); + dset = file.getDataSet(den2); + dset.read(Pz.data()); + } + + tP = matrix_type(dims[0], dims[1]); + dset = file.getDataSet(tden_str); + dset.read(tP.data()); + FXC_ref = matrix_type(dims[0], dims[1]); + dset = file.getDataSet(fxc_str); + dset.read(FXC_ref.data()); + + if (not rks) { + FXCz_ref = matrix_type(dims[0], dims[1]); + dset = file.getDataSet("/FXC_Z"); + dset.read(FXCz_ref.data()); + tPz = matrix_type(dims[0], dims[1]); + dset = file.getDataSet("/TRIAL_DENSITY_Z"); + dset.read(tPz.data()); + } + } + + // Set shell tolerance + for (auto& sh : basis) + sh.set_shell_tolerance(std::numeric_limits<double>::epsilon()); + + // Create molecular grid + auto mg = MolGridFactory::create_default_molgrid(mol, pruning_scheme, + BatchSize(512), RadialQuad::MuraKnowles, AtomicGridSizeDefault::UltraFineGrid); + + // Construct Load Balancer + LoadBalancerFactory lb_factory(ExecutionSpace::Host, "Default"); + auto lb = lb_factory.get_instance(rt, mol, mg, basis); + + // Construct Weights Module + MolecularWeightsFactory mw_factory(ex, "Default", MolecularWeightsSettings{}); + auto mw = mw_factory.get_instance(); + + // Apply partition weights + mw.modify_weights(lb); + + // Construct XCIntegrator + XCIntegratorFactory<matrix_type> integrator_factory(ex, "Replicated", + integrator_kernel, lwd_kernel, reduction_kernel); + auto integrator = integrator_factory.get_instance(func, lb); + + // Test FXC contraction + if (rks) { + // Call FXC contraction + auto FXC = integrator.eval_fxc_contraction(P, tP); + auto FXC_diff_nrm = (FXC - FXC_ref).norm(); + CHECK(FXC_diff_nrm / basis.nbf() < 1e-10); + } else if (uks) { + // Call FXC contraction + auto [FXCs, FXCz] = integrator.eval_fxc_contraction(P, Pz, tP, tPz); + + auto FXCs_diff_nrm = (FXCs - FXC_ref).norm(); + auto FXCz_diff_nrm = (FXCz - FXCz_ref).norm(); + CHECK(FXCs_diff_nrm / basis.nbf() < 1e-10); + CHECK(FXCz_diff_nrm / basis.nbf() < 1e-10); + + } +} + +void test_integrator_2nd(std::string reference_file, functional_type& func, PruningScheme pruning_scheme) { + +#ifdef 
GAUXC_HAS_DEVICE + auto rt = DeviceRuntimeEnvironment(GAUXC_MPI_CODE(MPI_COMM_WORLD,) 0.9); +#else + auto rt = RuntimeEnvironment(GAUXC_MPI_CODE(MPI_COMM_WORLD)); +#endif + +#ifdef GAUXC_HAS_HOST + SECTION( "Host" ) { + SECTION("Reference") { + test_fxc_contraction( ExecutionSpace::Host, rt, reference_file, func, + pruning_scheme, "Default", "Default", "Default" ); + } + } +#endif + +#ifdef GAUXC_HAS_DEVICE + SECTION( "Device" ) { + SECTION( "Incore - MPI Reduction" ) { + test_fxc_contraction( ExecutionSpace::Device, rt, + reference_file, func, pruning_scheme, + "Default", "Default", "Default" ); + } + #ifdef GAUXC_HAS_CUTLASS + SECTION( "Incore - MPI Reduction - CUTLASS" ) { + test_fxc_contraction( ExecutionSpace::Device, rt, + reference_file, func, pruning_scheme, + "Default", "Default", "Scheme1-CUTLASS" ); + } + #endif + + } +#endif + +} + +functional_type make_functional_2nd(ExchCXX::Functional func_key, ExchCXX::Spin spin) { + return functional_type(ExchCXX::Backend::builtin, func_key, spin); +} + + +TEST_CASE( "XC Integrator FXC", "[xc-integrator]" ) { + + auto pol = ExchCXX::Spin::Polarized; + auto unpol = ExchCXX::Spin::Unpolarized; + auto svwn5 = ExchCXX::Functional::SVWN5; + auto pbe0 = ExchCXX::Functional::PBE0; + auto blyp = ExchCXX::Functional::BLYP; + auto scan = ExchCXX::Functional::SCAN; + auto r2scanl = ExchCXX::Functional::R2SCANL; + auto m062x = ExchCXX::Functional::M062X; + + // LDA Test + SECTION( "Benzene / SVWN5 / cc-pVDZ" ) { + auto func = make_functional_2nd(svwn5, unpol); + test_integrator_2nd(GAUXC_REF_DATA_PATH "/benzene_svwn5_cc-pvdz_ufg_ssf.hdf5", + func, PruningScheme::Unpruned ); + } + SECTION( "Benzene / SVWN5 / cc-pVDZ (Treutler)" ) { + auto func = make_functional_2nd(svwn5, unpol); + test_integrator_2nd(GAUXC_REF_DATA_PATH "/benzene_svwn5_cc-pvdz_ufg_ssf_treutler_prune.hdf5", + func, PruningScheme::Treutler ); + } + SECTION( "Benzene / SVWN5 / cc-pVDZ (Robust)" ) { + auto func = make_functional_2nd(svwn5, unpol); + test_integrator_2nd(GAUXC_REF_DATA_PATH "/benzene_svwn5_cc-pvdz_ufg_ssf_robust_prune.hdf5", + func, PruningScheme::Robust ); + } + + // GGA Test + SECTION( "Benzene / PBE0 / cc-pVDZ" ) { + auto func = make_functional_2nd(pbe0, unpol); + test_integrator_2nd(GAUXC_REF_DATA_PATH "/benzene_pbe0_cc-pvdz_ufg_ssf.hdf5", + func, PruningScheme::Unpruned ); + } + + // MGGA Test (TAU Only) + SECTION( "Cytosine / SCAN / cc-pVDZ") { + auto func = make_functional_2nd(scan, unpol); + test_integrator_2nd(GAUXC_REF_DATA_PATH "/cytosine_scan_cc-pvdz_ufg_ssf_robust.hdf5", + func, PruningScheme::Robust ); + } + + //UKS LDA Test + SECTION( "Li / SVWN5 / sto-3g" ) { + auto func = make_functional_2nd(svwn5, pol); + test_integrator_2nd(GAUXC_REF_DATA_PATH "/li_svwn5_sto3g_uks.bin", + func, PruningScheme::Unpruned ); + } + + //UKS GGA Test + SECTION( "Cytosine (doublet) / BLYP / cc-pVDZ") { + auto func = make_functional_2nd(blyp, pol); + test_integrator_2nd(GAUXC_REF_DATA_PATH "/cytosine_blyp_cc-pvdz_ufg_ssf_robust_uks.hdf5", + func, PruningScheme::Robust ); + } + + // UKS MGGA Test (TAU Only) + SECTION( "Cytosine (doublet) / SCAN / cc-pVDZ") { + auto func = make_functional_2nd(scan, pol); + test_integrator_2nd(GAUXC_REF_DATA_PATH "/cytosine_scan_cc-pvdz_ufg_ssf_robust_uks.hdf5", + func, PruningScheme::Robust ); + } +} diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index d3881e91..5f00d7db 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through 
Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # @@ -57,9 +61,12 @@ add_executable( gauxc_test environment.cxx collocation.cxx weights.cxx + weight_derivative_test.cxx standards.cxx runtime.cxx basis/parse_basis.cxx + dd_psi_potential_test.cxx + 2nd_derivative_test.cxx ) target_link_libraries( gauxc_test PUBLIC gauxc gauxc_catch2 Eigen3::Eigen cereal ) if(GAUXC_ENABLE_CUTLASS) diff --git a/tests/basis/new/6-31g*.g94 b/tests/basis/new/6-31g-star.g94 similarity index 100% rename from tests/basis/new/6-31g*.g94 rename to tests/basis/new/6-31g-star.g94 diff --git a/tests/basis/old/6-31g*.g94 b/tests/basis/old/6-31g-star.g94 similarity index 100% rename from tests/basis/old/6-31g*.g94 rename to tests/basis/old/6-31g-star.g94 diff --git a/tests/basis/parse_basis.cxx b/tests/basis/parse_basis.cxx index aef5d612..0bf4cd8e 100644 --- a/tests/basis/parse_basis.cxx +++ b/tests/basis/parse_basis.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/basis/parse_basis.hpp b/tests/basis/parse_basis.hpp index 5815584c..1530aebf 100644 --- a/tests/basis/parse_basis.hpp +++ b/tests/basis/parse_basis.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/basisset_test.cxx b/tests/basisset_test.cxx index 5e556d10..29565336 100644 --- a/tests/basisset_test.cxx +++ b/tests/basisset_test.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/cmake/discovery/CMakeLists.txt b/tests/cmake/discovery/CMakeLists.txt index 3a03749f..e97fd4de 100644 --- a/tests/cmake/discovery/CMakeLists.txt +++ b/tests/cmake/discovery/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. 
# # See LICENSE.txt for details # diff --git a/tests/cmake/discovery/gauxc_link_tester.cxx b/tests/cmake/discovery/gauxc_link_tester.cxx index 2ba40e22..70313c7f 100644 --- a/tests/cmake/discovery/gauxc_link_tester.cxx +++ b/tests/cmake/discovery/gauxc_link_tester.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/cmake/subproject/CMakeLists.txt b/tests/cmake/subproject/CMakeLists.txt index 0bbf72b9..7bf08709 100644 --- a/tests/cmake/subproject/CMakeLists.txt +++ b/tests/cmake/subproject/CMakeLists.txt @@ -1,7 +1,11 @@ # # GauXC Copyright (c) 2020-2024, The Regents of the University of California, # through Lawrence Berkeley National Laboratory (subject to receipt of -# any required approvals from the U.S. Dept. of Energy). All rights reserved. +# any required approvals from the U.S. Dept. of Energy). +# +# (c) 2024-2025, Microsoft Corporation +# +# All rights reserved. # # See LICENSE.txt for details # diff --git a/tests/cmake/subproject/gauxc_link_tester.cxx b/tests/cmake/subproject/gauxc_link_tester.cxx index 2ba40e22..70313c7f 100644 --- a/tests/cmake/subproject/gauxc_link_tester.cxx +++ b/tests/cmake/subproject/gauxc_link_tester.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/collocation.cxx b/tests/collocation.cxx index fb8a0393..af85da77 100644 --- a/tests/collocation.cxx +++ b/tests/collocation.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -74,6 +78,14 @@ TEST_CASE( "Water / cc-pVDZ", "[collocation]" ) { SECTION( "CUDA Shell to Task Eval Hessian" ) { test_cuda_collocation_shell_to_task_hessian( basis, basis_map, ref_data ); } + + SECTION( "CUDA Shell to Task Eval Laplacian" ) { + test_cuda_collocation_shell_to_task_laplacian( basis, basis_map, ref_data ); + } + + SECTION( "CUDA Shell to Task Eval Laplacian Gradient" ) { + test_cuda_collocation_shell_to_task_lapgrad( basis, basis_map, ref_data ); + } #endif // GAUXC_HAS_CUDA #ifdef GAUXC_HAS_HIP diff --git a/tests/collocation_common.hpp b/tests/collocation_common.hpp index 91baa780..567f8f40 100644 --- a/tests/collocation_common.hpp +++ b/tests/collocation_common.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). 
+ * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -32,10 +36,15 @@ struct ref_collocation_data { std::vector d2eval_yy; std::vector d2eval_yz; std::vector d2eval_zz; + std::vector d2eval_lapl; + std::vector d3eval_lapl_x; + std::vector d3eval_lapl_y; + std::vector d3eval_lapl_z; template void serialize( Archive& ar ) { - ar( mask, pts, eval, deval_x, deval_y, deval_z, d2eval_xx, d2eval_xy, d2eval_xz, d2eval_yy, d2eval_yz, d2eval_zz ); + ar( mask, pts, eval, deval_x, deval_y, deval_z, d2eval_xx, d2eval_xy, d2eval_xz, + d2eval_yy, d2eval_yz, d2eval_zz, d2eval_lapl, d3eval_lapl_x, d3eval_lapl_y, d3eval_lapl_z); } }; diff --git a/tests/collocation_cuda.hpp b/tests/collocation_cuda.hpp index 42ae0eb4..b74d8476 100644 --- a/tests/collocation_cuda.hpp +++ b/tests/collocation_cuda.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -14,7 +18,7 @@ auto populate_device_cuda( const BasisSet& basis, const std::vector& ref_data, - bool pop_grad, bool pop_hess ) { + bool pop_grad, bool pop_hess, bool pop_lapl, bool pop_lapl_grad ) { std::vector< XCDeviceTask > tasks; @@ -58,6 +62,16 @@ auto populate_device_cuda( const BasisSet& basis, task.d2bfzz = util::cuda_malloc( nbf * npts ); } + if(pop_lapl) { + task.d2bflapl = util::cuda_malloc( nbf * npts ); + } + + if(pop_lapl_grad) { + task.d3bflapl_x = util::cuda_malloc( nbf * npts ); + task.d3bflapl_y = util::cuda_malloc( nbf * npts ); + task.d3bflapl_z = util::cuda_malloc( nbf * npts ); + } + //auto* pts_device = task.points; auto* pts_x_device = task.points_x; auto* pts_y_device = task.points_y; @@ -95,7 +109,7 @@ auto populate_device_cuda( const BasisSet& basis, void cuda_check_collocation( const std::vector& tasks, const std::vector& ref_data, - bool check_grad, bool check_hess) { + bool check_grad, bool check_hess, bool check_lapl, bool check_lapl_grad) { for( int i = 0; i < tasks.size(); i++ ) { @@ -158,6 +172,34 @@ void cuda_check_collocation( const std::vector& tasks, check_collocation_transpose( npts, nbe, ref_d2eval_zz, d2eval_zz.data(), "IT = " + std::to_string(i) + " BFZZ EVAL" ); } + if( check_lapl ) { + auto npts = tasks[i].npts; + auto nbe = tasks[i].bfn_screening.nbe; + auto* ref_d2eval_lapl = ref_data[i].d2eval_lapl.data(); + std::vector d2eval_lapl(npts * nbe); + util::cuda_copy(eval.size(), d2eval_lapl.data(), tasks[i].d2bflapl); + check_collocation_transpose(npts, nbe, ref_d2eval_lapl, d2eval_lapl.data(), "IT = " + std::to_string(i) + "BFLAPL EVAL" ); + } + +#if 1 + if( check_lapl_grad ) { + auto npts = tasks[i].npts; + auto nbe = tasks[i].bfn_screening.nbe; + auto* ref_d3eval_lapl_x = ref_data[i].d3eval_lapl_x.data(); + auto* ref_d3eval_lapl_y = ref_data[i].d3eval_lapl_y.data(); + auto* ref_d3eval_lapl_z = ref_data[i].d3eval_lapl_z.data(); + std::vector d3eval_lapl_x(npts * nbe); + std::vector d3eval_lapl_y(npts * nbe); + std::vector d3eval_lapl_z(npts * nbe); + util::cuda_copy(eval.size(), d3eval_lapl_x.data(), tasks[i].d3bflapl_x); + util::cuda_copy(eval.size(), d3eval_lapl_y.data(), tasks[i].d3bflapl_y); + util::cuda_copy(eval.size(), d3eval_lapl_z.data(), tasks[i].d3bflapl_z); + 
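// [Editorial aside -- illustrative note, not part of the patch] The reference
// arrays compared below are the Cartesian traces of the analytic third
// derivatives, generated host-side later in this patch (collocation_host.hpp),
// e.g. for the x component:
//
//   d3eval_lapl_x[i] = d3eval_xxx[i] + d3eval_xyy[i] + d3eval_xzz[i];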
check_collocation_transpose(npts, nbe, ref_d3eval_lapl_x, d3eval_lapl_x.data(), "IT = " + std::to_string(i) + "BFLAPL_X EVAL" ); + check_collocation_transpose(npts, nbe, ref_d3eval_lapl_y, d3eval_lapl_y.data(), "IT = " + std::to_string(i) + "BFLAPL_Y EVAL" ); + check_collocation_transpose(npts, nbe, ref_d3eval_lapl_z, d3eval_lapl_z.data(), "IT = " + std::to_string(i) + "BFLAPL_Z EVAL" ); + } +#endif + } } @@ -186,7 +228,7 @@ void test_cuda_collocation_masked_combined( const BasisSet& basis, std:: device_queue stream( std::make_shared() ); - auto [shells_device,tasks] = populate_device_cuda( basis, ref_data, grad, false ); + auto [shells_device,tasks] = populate_device_cuda( basis, ref_data, grad, false, false, false ); const auto nshells_max = std::max_element( tasks.begin(), tasks.end(), @@ -211,7 +253,7 @@ void test_cuda_collocation_masked_combined( const BasisSet& basis, std:: util::cuda_device_sync(); - cuda_check_collocation( tasks, ref_data, grad, false ); + cuda_check_collocation( tasks, ref_data, grad, false, false, false ); for( auto& t : tasks ) { @@ -249,7 +291,7 @@ void test_cuda_collocation_deriv1( const BasisSet& basis, void test_cuda_collocation_shell_to_task( const BasisSet& basis, const BasisSetMap& basis_map, - std::ifstream& in_file, bool grad, bool hess) { + std::ifstream& in_file, bool grad, bool hess, bool lapl, bool lapl_grad) { // Load reference data std::vector ref_data; @@ -260,7 +302,7 @@ void test_cuda_collocation_shell_to_task( const BasisSet& basis, const // Populate base task information device_queue stream( std::make_shared() ); - auto [shells_device,tasks] = populate_device_cuda( basis, ref_data, grad, hess ); + auto [shells_device,tasks] = populate_device_cuda( basis, ref_data, grad, hess, lapl, lapl_grad ); // Send tasks to device auto* tasks_device = util::cuda_malloc( tasks.size() ); @@ -355,9 +397,15 @@ void test_cuda_collocation_shell_to_task( const BasisSet& basis, const } - if( hess ) + if( lapl_grad ) + eval_collocation_shell_to_task_lapgrad( max_l, l_batched_shell_to_task.data(), + tasks_device, stream ); + else if( hess ) eval_collocation_shell_to_task_hessian( max_l, l_batched_shell_to_task.data(), tasks_device, stream ); + else if( lapl ) + eval_collocation_shell_to_task_laplacian( max_l, l_batched_shell_to_task.data(), + tasks_device, stream ); else if( grad ) eval_collocation_shell_to_task_gradient( max_l, l_batched_shell_to_task.data(), tasks_device, stream ); @@ -368,13 +416,15 @@ void test_cuda_collocation_shell_to_task( const BasisSet& basis, const util::cuda_device_sync(); - cuda_check_collocation( tasks, ref_data, grad, hess ); + cuda_check_collocation( tasks, ref_data, grad, hess, lapl, lapl_grad ); for( auto& t : tasks ) { util::cuda_free( t.points_x, t.points_y, t.points_z, t.bfn_screening.shell_offs, t.bfn_screening.shell_list, t.bf ); if(grad) util::cuda_free( t.dbfx, t.dbfy, t.dbfz ); if(hess) util::cuda_free( t.d2bfxx, t.d2bfxy, t.d2bfxz, t.d2bfyy, t.d2bfyz, t.d2bfzz ); + if(lapl) util::cuda_free( t.d2bflapl ); + if(lapl_grad) util::cuda_free( t.d3bflapl_x, t.d3bflapl_y, t.d3bflapl_z ); } util::cuda_free( tasks_device, shells_device, shell_to_task_device ); for( auto& s : shell_to_task ) { @@ -387,19 +437,33 @@ void test_cuda_collocation_shell_to_task( const BasisSet& basis, const void test_cuda_collocation_shell_to_task( const BasisSet& basis, const BasisSetMap& basis_map, std::ifstream& in_file) { - test_cuda_collocation_shell_to_task(basis,basis_map,in_file,false, false); + 
test_cuda_collocation_shell_to_task(basis,basis_map,in_file,false, false, false, false); } void test_cuda_collocation_shell_to_task_gradient( const BasisSet& basis, const BasisSetMap& basis_map, std::ifstream& in_file) { - test_cuda_collocation_shell_to_task(basis,basis_map,in_file,true, false); + test_cuda_collocation_shell_to_task(basis,basis_map,in_file,true, false, false, false); } void test_cuda_collocation_shell_to_task_hessian( const BasisSet& basis, const BasisSetMap& basis_map, std::ifstream& in_file) { - test_cuda_collocation_shell_to_task(basis,basis_map,in_file,true, true); + test_cuda_collocation_shell_to_task(basis,basis_map,in_file,true, true, false, false); + +} + +void test_cuda_collocation_shell_to_task_laplacian( const BasisSet& basis, + const BasisSetMap& basis_map, std::ifstream& in_file) { + + test_cuda_collocation_shell_to_task(basis,basis_map,in_file,true, false, true, false); + +} + +void test_cuda_collocation_shell_to_task_lapgrad( const BasisSet& basis, + const BasisSetMap& basis_map, std::ifstream& in_file) { + + test_cuda_collocation_shell_to_task(basis,basis_map,in_file,true, true, true, true); } diff --git a/tests/collocation_hip.hpp b/tests/collocation_hip.hpp index c6314aac..b6be897c 100644 --- a/tests/collocation_hip.hpp +++ b/tests/collocation_hip.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/collocation_host.hpp b/tests/collocation_host.hpp index a64ce8ee..52dcaec0 100644 --- a/tests/collocation_host.hpp +++ b/tests/collocation_host.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -46,13 +50,40 @@ void generate_collocation_data( const Molecule& mol, const BasisSet& bas d2eval_xz( nbf * npts ), d2eval_yy( nbf * npts ), d2eval_yz( nbf * npts ), - d2eval_zz( nbf * npts ); - - gau2grid_collocation_hessian( npts, mask.size(), nbf, + d2eval_zz( nbf * npts ), + d3eval_xxx( nbf * npts ), + d3eval_xxy( nbf * npts ), + d3eval_xxz( nbf * npts ), + d3eval_xyy( nbf * npts ), + d3eval_xyz( nbf * npts ), + d3eval_xzz( nbf * npts ), + d3eval_yyy( nbf * npts ), + d3eval_yyz( nbf * npts ), + d3eval_yzz( nbf * npts ), + d3eval_zzz( nbf * npts ); + + gau2grid_collocation_der3( npts, mask.size(), nbf, pts.data()->data(), basis, mask.data(), eval.data(), deval_x.data(), deval_y.data(), deval_z.data(), d2eval_xx.data(), d2eval_xy.data(), d2eval_xz.data(), - d2eval_yy.data(), d2eval_yz.data(), d2eval_zz.data() ); + d2eval_yy.data(), d2eval_yz.data(), d2eval_zz.data(), + d3eval_xxx.data(), d3eval_xxy.data(), d3eval_xxz.data(), + d3eval_xyy.data(), d3eval_xyz.data(), d3eval_xzz.data(), + d3eval_yyy.data(), d3eval_yyz.data(), d3eval_yzz.data(), + d3eval_zzz.data()); + + std::vector d2eval_lapl(nbf * npts); + std::vector d3eval_lapl_x(nbf * npts); + std::vector d3eval_lapl_y(nbf * npts); + std::vector d3eval_lapl_z(nbf * npts); + for(auto i = 0; i < nbf*npts; ++i) { + d2eval_lapl[i] = d2eval_xx[i] + d2eval_yy[i] + d2eval_zz[i]; + d3eval_lapl_x[i] = d3eval_xxx[i] + d3eval_xyy[i] + d3eval_xzz[i]; + d3eval_lapl_y[i] = d3eval_xxy[i] + d3eval_yyy[i] + d3eval_yzz[i]; + d3eval_lapl_z[i] = d3eval_xxz[i] + d3eval_yyz[i] + d3eval_zzz[i]; + } + + auto max_abs = *std::max_element( eval.begin(), eval.end(), [](auto a, auto b){ return std::abs(a) < std::abs(b); } ); @@ -61,7 +92,9 @@ void generate_collocation_data( const Molecule& mol, const BasisSet& bas ref_collocation_data d{ std::move(mask), std::move(pts), std::move(eval), std::move(deval_x), std::move(deval_y), std::move(deval_z), std::move(d2eval_xx), std::move(d2eval_xy), std::move(d2eval_xz), - std::move(d2eval_yy), std::move(d2eval_yz), std::move(d2eval_zz) + std::move(d2eval_yy), std::move(d2eval_yz), std::move(d2eval_zz), + std::move(d2eval_lapl), std::move(d3eval_lapl_x), std::move(d3eval_lapl_y), + std::move(d3eval_lapl_z) }; ref_data.emplace_back( std::move(d) ); diff --git a/tests/conv_cereal_to_hdf5.cxx b/tests/conv_cereal_to_hdf5.cxx index d717cf63..682a6964 100644 --- a/tests/conv_cereal_to_hdf5.cxx +++ b/tests/conv_cereal_to_hdf5.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/dd_psi_potential_test.cxx b/tests/dd_psi_potential_test.cxx new file mode 100644 index 00000000..9af2844e --- /dev/null +++ b/tests/dd_psi_potential_test.cxx @@ -0,0 +1,102 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
+ * + * See LICENSE.txt for details + */ +#include "ut_common.hpp" +#include +#include +#include +#include + +#include + +#include +#include +#include + +using namespace GauXC; + +void test_dd_psi ( + std::string reference_file, + int lmax = 8 +) { + using matrix_type = Eigen::MatrixXd; + Molecule mol; + BasisSet basis; + matrix_type P, ddX, ddPsi_ref, ddPsi_potential_ref; + + read_hdf5_record( mol, reference_file, "/MOLECULE" ); + read_hdf5_record( basis, reference_file, "/BASIS" ); + + HighFive::File file( reference_file, HighFive::File::ReadOnly ); + std::string den_str = "/DENSITY"; + auto dset = file.getDataSet(den_str); + auto dims = dset.getDimensions(); + P = matrix_type( dims[0], dims[1] ); + dset.read( P.data() ); + + int nharmonics = (lmax + 1) * (lmax + 1); + + ddX = matrix_type( nharmonics, mol.size() ); + dset = file.getDataSet("/DD_X"); + dset.read(ddX.data()); + + ddPsi_ref = matrix_type( mol.size(), nharmonics ); + dset = file.getDataSet("/DD_PSI"); + dset.read( ddPsi_ref.data()); + + ddPsi_potential_ref = matrix_type( basis.nbf(), basis.nbf() ); + dset = file.getDataSet("/DD_PSI_POTENTIAL"); + dset.read( ddPsi_potential_ref.data() ); + + + #ifdef GAUXC_HAS_DEVICE + auto rt = DeviceRuntimeEnvironment(GAUXC_MPI_CODE(MPI_COMM_WORLD,) 0.9); + #else + auto rt = RuntimeEnvironment(GAUXC_MPI_CODE(MPI_COMM_WORLD)); + #endif + + auto mg = MolGridFactory::create_default_molgrid(mol, PruningScheme::Unpruned, + BatchSize(512), RadialQuad::MuraKnowles, AtomicGridSizeDefault::UltraFineGrid); + + auto ex = ExecutionSpace::Host; + LoadBalancerFactory lb_factory(ex, "Default"); + auto lb = lb_factory.get_instance(rt, mol, mg, basis); + + // Construct Weights Module + MolecularWeightsFactory mw_factory( ex, "Default", MolecularWeightsSettings{} ); + auto mw = mw_factory.get_instance(); + + // Apply partition weights + mw.modify_weights(lb); + + functional_type func = functional_type( ExchCXX::Backend::builtin, ExchCXX::Functional::PBE0, ExchCXX::Spin::Unpolarized ); + // Construct XCIntegrator + XCIntegratorFactory integrator_factory( ex, "Replicated", + "Default", "Default", "Default" ); + auto integrator = integrator_factory.get_instance( func, lb ); + + auto dd_psi = integrator.eval_dd_psi(P, lmax); + auto ddPsi = Eigen::Map(dd_psi.data(), mol.size(), nharmonics); + auto ddPsi_nrm = (ddPsi - ddPsi_ref).norm(); + CHECK( ddPsi_nrm / mol.size() < 1e-10 ); + + auto ddPsiPotential = integrator.eval_dd_psi_potential(ddX, lmax); + auto ddPsiPotential_nrm = (ddPsiPotential - ddPsi_potential_ref).norm(); + CHECK( ddPsiPotential_nrm / basis.nbf() < 1e-10 ); + +} + +TEST_CASE( "DD PSI & PSI POTENTIAL", "[dd]" ) { + SECTION( " C2H4 / def2-svp / LMAX = 8" ) { + test_dd_psi( GAUXC_REF_DATA_PATH "/c2h4_l8_dd_psi_potential.hdf5" ); + } +} + \ No newline at end of file diff --git a/tests/eigen3_matrix_serialization.hpp b/tests/eigen3_matrix_serialization.hpp index 38537170..a810e8d1 100644 --- a/tests/eigen3_matrix_serialization.hpp +++ b/tests/eigen3_matrix_serialization.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
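A note on dimensions in dd_psi_potential_test.cxx above: nharmonics = (lmax + 1) * (lmax + 1) counts every real spherical harmonic Y_{lm} with 0 <= l <= lmax, since

\sum_{l=0}^{l_{\max}} (2l + 1) = (l_{\max} + 1)^2 ,

i.e. 81 harmonics per atom for the lmax = 8 used in the test; ddPsi is therefore mol.size() x 81, and the two CHECKs scale the error norms by mol.size() and basis.nbf() respectively.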
* * See LICENSE.txt for details */ diff --git a/tests/environment.cxx b/tests/environment.cxx index ae9ec82b..2dea8138 100644 --- a/tests/environment.cxx +++ b/tests/environment.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/grid_opt.cxx b/tests/grid_opt.cxx index ced74d20..1fa2237f 100644 --- a/tests/grid_opt.cxx +++ b/tests/grid_opt.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/grid_test.cxx b/tests/grid_test.cxx index 4e8782e6..c308adf8 100644 --- a/tests/grid_test.cxx +++ b/tests/grid_test.cxx @@ -1,16 +1,20 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ #include "catch2/catch.hpp" #include -#include -#include -#include +#include +#include +#include #include #include diff --git a/tests/ini_input.cxx b/tests/ini_input.cxx index 972eeaaa..a5f6ed56 100644 --- a/tests/ini_input.cxx +++ b/tests/ini_input.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/ini_input.hpp b/tests/ini_input.hpp index 1577f2ee..6be84086 100644 --- a/tests/ini_input.hpp +++ b/tests/ini_input.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/load_balancer_test.cxx b/tests/load_balancer_test.cxx index 1c55f3e8..889bcb30 100644 --- a/tests/load_balancer_test.cxx +++ b/tests/load_balancer_test.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ diff --git a/tests/molgrid_test.cxx b/tests/molgrid_test.cxx index 191b8e7b..1de1be98 100644 --- a/tests/molgrid_test.cxx +++ b/tests/molgrid_test.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/moltypes_test.cxx b/tests/moltypes_test.cxx index 5ed00c55..d87685f7 100644 --- a/tests/moltypes_test.cxx +++ b/tests/moltypes_test.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/ref_data/benzene_m062x_def2-svp_ufg_ssf.hdf5 b/tests/ref_data/benzene_m062x_def2-svp_ufg_ssf.hdf5 new file mode 100644 index 00000000..e4eebc23 Binary files /dev/null and b/tests/ref_data/benzene_m062x_def2-svp_ufg_ssf.hdf5 differ diff --git a/tests/ref_data/benzene_pbe0_cc-pvdz_ufg_ssf.hdf5 b/tests/ref_data/benzene_pbe0_cc-pvdz_ufg_ssf.hdf5 index c1d3ebc3..51bfa6ac 100644 Binary files a/tests/ref_data/benzene_pbe0_cc-pvdz_ufg_ssf.hdf5 and b/tests/ref_data/benzene_pbe0_cc-pvdz_ufg_ssf.hdf5 differ diff --git a/tests/ref_data/benzene_svwn5_cc-pvdz_ufg_ssf.hdf5 b/tests/ref_data/benzene_svwn5_cc-pvdz_ufg_ssf.hdf5 index 9acf2378..3bf4d5fa 100644 Binary files a/tests/ref_data/benzene_svwn5_cc-pvdz_ufg_ssf.hdf5 and b/tests/ref_data/benzene_svwn5_cc-pvdz_ufg_ssf.hdf5 differ diff --git a/tests/ref_data/benzene_svwn5_cc-pvdz_ufg_ssf_robust_prune.hdf5 b/tests/ref_data/benzene_svwn5_cc-pvdz_ufg_ssf_robust_prune.hdf5 index f7242869..61a765bd 100644 Binary files a/tests/ref_data/benzene_svwn5_cc-pvdz_ufg_ssf_robust_prune.hdf5 and b/tests/ref_data/benzene_svwn5_cc-pvdz_ufg_ssf_robust_prune.hdf5 differ diff --git a/tests/ref_data/benzene_svwn5_cc-pvdz_ufg_ssf_treutler_prune.hdf5 b/tests/ref_data/benzene_svwn5_cc-pvdz_ufg_ssf_treutler_prune.hdf5 index 478d988e..c021e3c8 100644 Binary files a/tests/ref_data/benzene_svwn5_cc-pvdz_ufg_ssf_treutler_prune.hdf5 and b/tests/ref_data/benzene_svwn5_cc-pvdz_ufg_ssf_treutler_prune.hdf5 differ diff --git a/tests/ref_data/c2h4_l8_dd_psi_potential.hdf5 b/tests/ref_data/c2h4_l8_dd_psi_potential.hdf5 new file mode 100644 index 00000000..2187f79d Binary files /dev/null and b/tests/ref_data/c2h4_l8_dd_psi_potential.hdf5 differ diff --git a/tests/ref_data/cytosine_blyp_cc-pvdz_ufg_ssf_robust_uks.hdf5 b/tests/ref_data/cytosine_blyp_cc-pvdz_ufg_ssf_robust_uks.hdf5 new file mode 100644 index 00000000..3fc56cda Binary files /dev/null and b/tests/ref_data/cytosine_blyp_cc-pvdz_ufg_ssf_robust_uks.hdf5 differ diff --git a/tests/ref_data/cytosine_scan_cc-pvdz_ufg_ssf_robust.hdf5 b/tests/ref_data/cytosine_scan_cc-pvdz_ufg_ssf_robust.hdf5 index d44d7c0f..06cf00ff 100644 Binary files a/tests/ref_data/cytosine_scan_cc-pvdz_ufg_ssf_robust.hdf5 and b/tests/ref_data/cytosine_scan_cc-pvdz_ufg_ssf_robust.hdf5 differ diff --git a/tests/ref_data/cytosine_scan_cc-pvdz_ufg_ssf_robust_uks.hdf5 
b/tests/ref_data/cytosine_scan_cc-pvdz_ufg_ssf_robust_uks.hdf5 index 829f8e96..53ca387a 100644 Binary files a/tests/ref_data/cytosine_scan_cc-pvdz_ufg_ssf_robust_uks.hdf5 and b/tests/ref_data/cytosine_scan_cc-pvdz_ufg_ssf_robust_uks.hdf5 differ diff --git a/tests/ref_data/cytosine_svwn5_cc-pvdz_ufg_ssf_robust_uks.hdf5 b/tests/ref_data/cytosine_svwn5_cc-pvdz_ufg_ssf_robust_uks.hdf5 new file mode 100644 index 00000000..3496bd81 Binary files /dev/null and b/tests/ref_data/cytosine_svwn5_cc-pvdz_ufg_ssf_robust_uks.hdf5 differ diff --git a/tests/ref_data/li_svwn5_sto3g_uks.bin b/tests/ref_data/li_svwn5_sto3g_uks.bin index b654d996..d96aeba2 100644 Binary files a/tests/ref_data/li_svwn5_sto3g_uks.bin and b/tests/ref_data/li_svwn5_sto3g_uks.bin differ diff --git a/tests/ref_data/ut_input.inp b/tests/ref_data/ut_input.inp index 46901ab1..4f0455fc 100644 --- a/tests/ref_data/ut_input.inp +++ b/tests/ref_data/ut_input.inp @@ -8,4 +8,5 @@ func = svwn5 integrate_vxc = TRUE integrate_exc_grad = TRUE integrate_exx = FALSE +integrate_fxc_contraction = FALSE OUTFILE = benzene_svwn5_cc-pvdz_ufg_ssf_robust_prune.hdf5 diff --git a/tests/ref_data/water_cc-pVDZ_collocation.bin b/tests/ref_data/water_cc-pVDZ_collocation.bin index c6d22ab4..e2d7ea60 100644 Binary files a/tests/ref_data/water_cc-pVDZ_collocation.bin and b/tests/ref_data/water_cc-pVDZ_collocation.bin differ diff --git a/tests/runtime.cxx b/tests/runtime.cxx index 1f8b6933..5b459940 100644 --- a/tests/runtime.cxx +++ b/tests/runtime.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/standalone_driver.cxx b/tests/standalone_driver.cxx index efc9ce59..68a9c13a 100644 --- a/tests/standalone_driver.cxx +++ b/tests/standalone_driver.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -51,6 +55,7 @@ int main(int argc, char** argv) { // Optional Args std::string grid_spec = "ULTRAFINE"; + std::string rad_quad_spec = "MURAKNOWLES"; std::string prune_spec = "UNPRUNED"; std::string lb_exec_space_str = "Host"; std::string int_exec_space_str = "Host"; @@ -66,6 +71,10 @@ int main(int argc, char** argv) { bool integrate_vxc = true; bool integrate_exx = false; bool integrate_exc_grad = false; + bool integrate_dd_psi = false; + bool integrate_dd_psi_potential = false; + bool integrate_fxc_contraction = false; + int lmax = 2; auto string_to_upper = []( auto& str ) { std::transform( str.begin(), str.end(), str.begin(), ::toupper ); @@ -79,6 +88,7 @@ int main(int argc, char** argv) { OPTIONAL_KEYWORD( "GAUXC.GRID", grid_spec, std::string ); OPTIONAL_KEYWORD( "GAUXC.FUNC", func_spec, std::string ); OPTIONAL_KEYWORD( "GAUXC.PRUNING_SCHEME", prune_spec, std::string ); + OPTIONAL_KEYWORD( "GAUXC.RAD_QUAD", rad_quad_spec, std::string ); OPTIONAL_KEYWORD( "GAUXC.LB_EXEC_SPACE", lb_exec_space_str, std::string ); OPTIONAL_KEYWORD( "GAUXC.INT_EXEC_SPACE", int_exec_space_str, std::string ); OPTIONAL_KEYWORD( "GAUXC.INTEGRATOR_KERNEL", integrator_kernel, std::string ); @@ -86,6 +96,7 @@ int main(int argc, char** argv) { OPTIONAL_KEYWORD( "GAUXC.REDUCTION_KERNEL", reduction_kernel, std::string ); string_to_upper( grid_spec ); string_to_upper( func_spec ); + string_to_upper( rad_quad_spec ); string_to_upper( prune_spec ); string_to_upper( lb_exec_space_str ); string_to_upper( int_exec_space_str ); @@ -100,6 +111,10 @@ int main(int argc, char** argv) { OPTIONAL_KEYWORD( "GAUXC.INTEGRATE_VXC", integrate_vxc, bool ); OPTIONAL_KEYWORD( "GAUXC.INTEGRATE_EXX", integrate_exx, bool ); OPTIONAL_KEYWORD( "GAUXC.INTEGRATE_EXC_GRAD", integrate_exc_grad, bool ); + OPTIONAL_KEYWORD( "GAUXC.INTEGRATE_DD_PSI", integrate_dd_psi, bool ); + OPTIONAL_KEYWORD( "GAUXC.INTEGRATE_DD_PSI_POTENTIAL", integrate_dd_psi_potential, bool ); + OPTIONAL_KEYWORD( "GAUXC.INTEGRATE_FXC_CONTRACTION", integrate_fxc_contraction, bool ); + OPTIONAL_KEYWORD( "GAUXC.MAX_YLM", lmax, int ); IntegratorSettingsSNLinK sn_link_settings; OPTIONAL_KEYWORD( "EXX.TOL_E", sn_link_settings.energy_tol, double ); @@ -124,6 +139,7 @@ int main(int argc, char** argv) { std::cout << "DRIVER SETTINGS: " << std::endl << " REF_FILE = " << ref_file << std::endl << " GRID = " << grid_spec << std::endl + << " RAD_QUAD = " << rad_quad_spec << std::endl << " PRUNING_SCHEME = " << prune_spec << std::endl << " BATCH_SIZE = " << batch_size << std::endl << " BASIS_TOL = " << basis_tol << std::endl @@ -136,13 +152,19 @@ int main(int argc, char** argv) { << " DEN (?) = " << integrate_den << std::endl << " VXC (?) = " << integrate_vxc << std::endl << " EXX (?) = " << integrate_exx << std::endl - << " EXC_GRAD (?) = " << integrate_exc_grad << std::endl; + << " EXC_GRAD (?) = " << integrate_exc_grad << std::endl + << " DD_PSI (?) = " << integrate_dd_psi << std::endl + << " DD_PSI_POTENTIAL (?) = " << integrate_dd_psi_potential << std::endl + << " FXC_CONTRACTION (?) 
= " << integrate_fxc_contraction << std::endl; if(integrate_exx) { std::cout << " EXX.TOL_E = " << sn_link_settings.energy_tol << std::endl << " EXX.TOL_K = " << sn_link_settings.k_tol << std::endl; } + if (integrate_dd_psi || integrate_dd_psi_potential) { + std::cout << " DD_MAX_YLM = " << lmax << std::endl; + } std::cout << std::endl; } @@ -170,9 +192,19 @@ int main(int argc, char** argv) { {"TREUTLER", PruningScheme::Treutler} }; + std::map< std::string, RadialQuad > rad_quad_map = { + {"BECKE", RadialQuad::Becke}, + {"MURAKNOWLES", RadialQuad::MuraKnowles}, + {"TREUTLERAHLRICHS", RadialQuad::TreutlerAhlrichs}, + {"MURRAYHANDYLAMING", RadialQuad::MurrayHandyLaming}, + {"MK", RadialQuad::MuraKnowles}, + {"TA", RadialQuad::TreutlerAhlrichs}, + {"MHL", RadialQuad::MurrayHandyLaming} + }; + auto mg = MolGridFactory::create_default_molgrid(mol, prune_map.at(prune_spec), BatchSize(batch_size), - RadialQuad::MuraKnowles, mg_map.at(grid_spec)); + rad_quad_map.at(rad_quad_spec), mg_map.at(grid_spec)); // Read BasisSet BasisSet basis; @@ -195,6 +227,8 @@ int main(int argc, char** argv) { using matrix_type = Eigen::MatrixXd; // Read in reference data matrix_type P, Pz, Py, Px, VXC_ref, VXCz_ref, VXCy_ref, VXCx_ref, K_ref; + matrix_type ddX, ddPsi_ref, ddPsi_potential_ref; + matrix_type FXC_ref, FXCz_ref; double EXC_ref; std::vector EXC_GRAD_ref(3*mol.size()); bool rks = true, uks = false, gks = false; @@ -325,7 +359,69 @@ int main(int argc, char** argv) { K_ref.fill(0); } } + if ( integrate_dd_psi ) { + int nharmonics = (lmax + 1) * (lmax + 1); + ddPsi_ref = matrix_type( mol.size(), nharmonics ); + try { + dset = file.getDataSet("/DD_PSI"); + dset.read( ddPsi_ref.data()); + auto dd_psi_dims = dset.getDimensions(); + if (dd_psi_dims[0] != mol.size() or dd_psi_dims[1] != nharmonics) + GAUXC_GENERIC_EXCEPTION("Incorrect dims for DD_PSI"); + } catch(...) { + if(world_rank == 0) { + std::cout << "** Warning: Could Not Find Reference DD_PSI" << std::endl; + } + ddPsi_ref.fill(0); + } + } + + if ( integrate_dd_psi_potential ) { + int nharmonics = (lmax + 1) * (lmax + 1); + ddX = matrix_type( nharmonics, mol.size() ); + ddPsi_potential_ref = matrix_type( basis.nbf(), basis.nbf() ); + try { + dset = file.getDataSet("/DD_X"); + auto dd_x_dims = dset.getDimensions(); + if (dd_x_dims[0] != nharmonics or dd_x_dims[1] != mol.size()) + GAUXC_GENERIC_EXCEPTION("Incorrect dims for DD_X"); + dset.read(ddX.data()); + } catch(...) { + throw std::runtime_error("Could Not Find Input DD_X for DD_PSI_POTENTIAL"); + } + try { + dset = file.getDataSet("/DD_PSI_POTENTIAL"); + auto dd_psi_potential_dims = dset.getDimensions(); + if (dd_psi_potential_dims[0] != basis.nbf() or dd_psi_potential_dims[1] != basis.nbf()) + GAUXC_GENERIC_EXCEPTION("Incorrect dims for DD_PSI_POTENTIAL"); + dset.read(ddPsi_potential_ref.data()); + } catch(...) { + if(world_rank == 0) { + std::cout << "** Warning: Could Not Find Reference DD_PSI_POTENTIAL" << std::endl; + } + ddPsi_potential_ref.fill(0); + } + } + if ( integrate_fxc_contraction ) { + try { + dset = file.getDataSet("/FXC"); + auto fxc_dims = dset.getDimensions(); + FXC_ref = matrix_type( fxc_dims[0], fxc_dims[1] ); + dset.read( FXC_ref.data() ); + if( not rks ) { + dset = file.getDataSet("/FXC_Z"); + FXCz_ref = matrix_type( fxc_dims[0], fxc_dims[1] ); + dset.read( FXCz_ref.data() ); + } + } catch(...) 
{ + if(world_rank == 0) { + std::cout << "** Warning: Could Not Find Reference FXC" << std::endl; + } + FXC_ref.fill(0); + if( not rks ) FXCz_ref.fill(0); + } + } } // Setup XC functional auto polar = (uks or gks) ? Spin::Polarized : Spin::Unpolarized; @@ -333,7 +429,9 @@ int main(int argc, char** argv) { if(functional_map.key_exists(func_spec)) { func = functional_type( Backend::builtin, functional_map.value(func_spec), polar ); - } else { + } +#ifdef EXCHCXX_ENABLE_LIBXC + else { std::vector> funcs; std::vector libxc_names; split(libxc_names, func_spec, ","); @@ -342,6 +440,7 @@ int main(int argc, char** argv) { } func = functional_type(funcs); } +#endif // Setup Integrator XCIntegratorFactory integrator_factory( int_exec_space , @@ -353,7 +452,8 @@ int main(int argc, char** argv) { #endif auto xc_int_start = std::chrono::high_resolution_clock::now(); - matrix_type VXC, VXCz, VXCy, VXCx, K; + matrix_type VXC, VXCz, VXCy, VXCx, K, FXC, FXCz; + matrix_type ddPsi, ddPsiPotential; double EXC, N_EL; std::cout << std::scientific << std::setprecision(12); @@ -397,8 +497,7 @@ int main(int argc, char** argv) { EXC_GRAD = integrator.eval_exc_grad( P ); } else if( uks ) { - std::cout << "Warning: eval_exc_grad + UKS NYI!" << std::endl; - //EXC_GRAD = integrator.eval_exc_grad( P, Pz ); + EXC_GRAD = integrator.eval_exc_grad( P, Pz ); } else if( gks ) { std::cout << "Warning: eval_exc_grad + GKS NYI!" << std::endl; @@ -417,12 +516,94 @@ int main(int argc, char** argv) { } } + // Load trial density matrices for FXC contraction + matrix_type tP, tPz; + if( integrate_fxc_contraction ) { + bool create_trial_densities = false; + { + // Try to load trial density matrices from reference file + HighFive::File file( ref_file, HighFive::File::ReadOnly ); + std::string tden_str = "/TRIAL_DENSITY"; + std::string fxc_str = "/FXC"; + + if (!rks) { + tden_str = "/TRIAL_DENSITY_SCALAR"; + fxc_str = "/FXC_SCALAR"; + } + + try { + auto dset = file.getDataSet(tden_str); + auto dims = dset.getDimensions(); + tP = matrix_type(dims[0], dims[1]); + dset.read(tP.data()); + + if (!rks) { + dset = file.getDataSet("/TRIAL_DENSITY_Z"); + tPz = matrix_type(dims[0], dims[1]); + dset.read(tPz.data()); + } + + // Also try to read reference FXC matrices if available + try { + dset = file.getDataSet(fxc_str); + FXC_ref = matrix_type(dims[0], dims[1]); + dset.read(FXC_ref.data()); + + if (!rks) { + dset = file.getDataSet("/FXC_Z"); + FXCz_ref = matrix_type(dims[0], dims[1]); + dset.read(FXCz_ref.data()); + } + } catch(...) { + if(world_rank == 0) { + std::cout << "** Warning: Could Not Find Reference FXC" << std::endl; + } + FXC_ref.fill(0); + if(!rks) FXCz_ref.fill(0); + } + + } catch(...) { + if(world_rank == 0) { + std::cout << "** Trial density matrices not found, generating random symmetric matrices..." << std::endl; + create_trial_densities = true; + } + } + + } + + if(!world_rank) { + std::cout << "Computing FXC contraction..." << std::endl; + } + + // Compute FXC contraction + if( rks ) { + FXC = integrator.eval_fxc_contraction( P, tP, IntegratorSettingsXC{} ); + } else if( uks ) { + std::tie(FXC, FXCz) = integrator.eval_fxc_contraction( P, Pz, tP, tPz, IntegratorSettingsXC{} ); + } else if( gks ) { + std::cout << "Warning: FXC contraction with GKS NYI!" 
<< std::endl; + } + + } + if( integrate_exx ) { K = integrator.eval_exx(P, sn_link_settings); //matrix_type K_tmp = 0.5 * (K + K.transpose()); //K = -K_tmp; } else { K = K_ref; } + + if( integrate_dd_psi ) { + size_t Ylm_sz = (lmax + 1) * ( lmax + 1); + auto dd_psi = integrator.eval_dd_psi(P, lmax); + ddPsi = Eigen::Map(dd_psi.data(), mol.size(), Ylm_sz); + } else { ddPsi = ddPsi_ref; } + + if (integrate_dd_psi_potential) { + ddPsiPotential = integrator.eval_dd_psi_potential(ddX, lmax); + } else { ddPsiPotential = ddPsi_potential_ref; } + + #ifdef GAUXC_HAS_MPI MPI_Barrier( MPI_COMM_WORLD ); #endif @@ -560,6 +741,26 @@ int main(int argc, char** argv) { std::cout << "RMS K Diff = " << (K_ref - K).norm() / basis.nbf() << std::endl; } + if (integrate_dd_psi) { + std::cout << "| DD_PSI (ref) |_F = " << ddPsi_ref.norm() << std::endl; + std::cout << "| DD_PSI (calc) |_F = " << ddPsi.norm() << std::endl; + std::cout << "RMS DD_PSI Diff = " << (ddPsi_ref - ddPsi).norm() / mol.size() << std::endl; + } + if (integrate_dd_psi_potential) { + std::cout << "| DD_PSI_POTENTIAL (ref) |_F = " << ddPsi_potential_ref.norm() << std::endl; + std::cout << "| DD_PSI_POTENTIAL (calc) |_F = " << ddPsiPotential.norm() << std::endl; + std::cout << "RMS DD_PSI_POTENTIAL Diff = " << (ddPsi_potential_ref - ddPsiPotential).norm() / basis.nbf() << std::endl; + } + if (integrate_fxc_contraction) { + std::cout << "| FXC (ref) |_F = " << FXC_ref.norm() << std::endl; + std::cout << "| FXC (calc) |_F = " << FXC.norm() << std::endl; + std::cout << "RMS FXC Diff = " << (FXC_ref - FXC).norm() / basis.nbf() << std::endl; + if (not rks) { + std::cout << "| FXCz (ref) |_F = " << FXCz_ref.norm() << std::endl; + std::cout << "| FXCz (calc) |_F = " << FXCz.norm() << std::endl; + std::cout << "RMS FXCz Diff = " << (FXCz_ref - FXCz).norm() / basis.nbf() << std::endl; + } + } } // Dump out new file @@ -625,6 +826,27 @@ int main(int argc, char** argv) { dset = file.createDataSet( "/EXC_GRAD", grad_space ); dset.write_raw( EXC_GRAD.data() ); } + + if (integrate_dd_psi) { + HighFive::DataSpace dd_psi_space( mol.size(), (lmax + 1) * (lmax + 1) ); + dset = file.createDataSet("/DD_PSI", dd_psi_space); + dset.write_raw(ddPsi.data()); + } + + if (integrate_dd_psi_potential) { + HighFive::DataSpace dd_psi_potential_space(basis.nbf(), basis.nbf()); + dset = file.createDataSet("/DD_PSI_POTENTIAL", dd_psi_potential_space); + dset.write_raw(ddPsiPotential.data()); + } + + if (integrate_fxc_contraction) { + dset = file.createDataSet("/FXC" + ugks_scalar, mat_space); + dset.write_raw(FXC.data()); + if (not rks) { + dset = file.createDataSet("/FXC_Z", mat_space); + dset.write_raw(FXCz.data()); + } + } } } diff --git a/tests/standards.cxx b/tests/standards.cxx index 6ca6473a..170e73ee 100644 --- a/tests/standards.cxx +++ b/tests/standards.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
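The "RMS ... Diff" values printed by the standalone driver above are Frobenius norms of the difference scaled by a problem-size parameter, for example

\mathrm{RMS\;FXC\;Diff} = \| \mathrm{FXC_{ref}} - \mathrm{FXC} \|_F / n_{\mathrm{bf}}, \qquad
\mathrm{RMS\;DD\_PSI\;Diff} = \| \mathrm{DD\_PSI_{ref}} - \mathrm{DD\_PSI} \|_F / n_{\mathrm{atoms}},

so they are size-normalized aggregate errors rather than element-wise maxima.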
* * See LICENSE.txt for details */ @@ -1406,7 +1410,7 @@ Molecule make_ubiquitin() { BasisSet make_631Gd( const Molecule& mol, SphericalType sph ) { - std::string basis_path = GAUXC_REF_DATA_PATH "/../basis/old/6-31g*.g94"; + std::string basis_path = GAUXC_REF_DATA_PATH "/../basis/old/6-31g-star.g94"; return parse_basis( mol, basis_path, sph ); } diff --git a/tests/standards.hpp b/tests/standards.hpp index a9db6759..93a6b298 100644 --- a/tests/standards.hpp +++ b/tests/standards.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/ut_common.hpp.in b/tests/ut_common.hpp.in index 628ef5f4..6c0c00a0 100644 --- a/tests/ut_common.hpp.in +++ b/tests/ut_common.hpp.in @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/ut_main.cxx b/tests/ut_main.cxx index 0ccd3be1..75420515 100644 --- a/tests/ut_main.cxx +++ b/tests/ut_main.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ @@ -12,10 +16,18 @@ #ifdef GAUXC_HAS_MPI #include #endif +#ifdef GAUXC_HAS_CUDA +#include +#endif int main( int argc, char* argv[] ) { #ifdef GAUXC_HAS_MPI MPI_Init(&argc, &argv); + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); +#ifdef GAUXC_HAS_CUDA + cudaSetDevice(rank); +#endif int result = Catch::Session().run( argc, argv ); MPI_Finalize(); #else diff --git a/tests/weight_derivative_test.cxx b/tests/weight_derivative_test.cxx new file mode 100644 index 00000000..ec53daf8 --- /dev/null +++ b/tests/weight_derivative_test.cxx @@ -0,0 +1,398 @@ +/** + * GauXC Copyright (c) 2020-2024, The Regents of the University of California, + * through Lawrence Berkeley National Laboratory (subject to receipt of + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
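weight_derivative_test.cxx, which starts above and continues below, validates analytical partition-weight derivatives against centered finite differences, dw/dR \approx [w(R + h) - w(R - h)] / (2h), whose O(h^2) truncation error motivates the h = 1.0e-5 step and 1.0e-6 margin used in its TEST_CASEs. A minimal, self-contained sketch of that checking pattern follows; it uses a stand-in scalar function rather than GauXC's weight kernels:

#include <cmath>
#include <cstdio>

// Centered finite-difference rule used as the reference in the tests:
//   df/dx ~= [f(x+h) - f(x-h)] / (2h),  truncation error O(h^2).
// Illustrative only; not GauXC code.
template <class F>
double fd_central(F f, double x, double h) {
  return (f(x + h) - f(x - h)) / (2.0 * h);
}

int main() {
  const double x = 0.3, h = 1.0e-5;
  const double approx = fd_central([](double t) { return std::sin(t); }, x, h);
  std::printf("fd: %.12f  analytic: %.12f  |err|: %.3e\n",
              approx, std::cos(x), std::abs(approx - std::cos(x)));
  return 0;
}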
+ * + * See LICENSE.txt for details + */ +#include "ut_common.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Include weights implementation +#include "xc_integrator/local_work_driver/host/reference/weights.hpp" + +using namespace GauXC; + +// Helper function to compute weights for a task +void compute_weights_task(XCWeightAlg weight_alg, const Molecule& mol, const MolMeta& meta, XCTask& task) { + // Construct local work driver + auto lwd = LocalWorkDriverFactory::make_local_work_driver( ExecutionSpace::Host, "Default", LocalWorkSettings() ); + auto* lwd_host = dynamic_cast(lwd.get()); + + std::vector tasks = {task}; + lwd_host->partition_weights(weight_alg, mol, meta, tasks.begin(), tasks.end()); + + // Copy the computed weights back to the original task + task.weights = tasks[0].weights; +} + +// Helper function to compute weights for a task +void compute_int(XCWeightAlg weight_alg, const Molecule& mol, const MolMeta& meta, XCTask& task, + double* f_eval, double* result) { + std::vector tasks = {task}; + + auto lwd = LocalWorkDriverFactory::make_local_work_driver( ExecutionSpace::Host, "Default", LocalWorkSettings() ); + auto* lwd_host = dynamic_cast(lwd.get()); + lwd_host->partition_weights(weight_alg, mol, meta, tasks.begin(), tasks.end()); + + for (size_t i = 0; i < task.points.size(); i++) { + result[0] += tasks[0].weights[i] * f_eval[i]; + } +} + + +// Test function that reads molecule and basis from reference file +void test_weight_1st_deri_host_fdiff(const std::string& reference_file, XCWeightAlg weight_alg, + PruningScheme pruning_scheme, double fdiff_step, double fdiff_tolerance) { + + // Create runtime environment + auto rt = RuntimeEnvironment(GAUXC_MPI_CODE(MPI_COMM_WORLD)); + Molecule mol; + BasisSet basis; + + // Read molecule and basis from HDF5 reference file + read_hdf5_record(mol, reference_file, "/MOLECULE"); + read_hdf5_record(basis, reference_file, "/BASIS"); + + // Set shell tolerance for numerical stability + for(auto& sh : basis) { + sh.set_shell_tolerance(std::numeric_limits::epsilon()); + } + auto mg = MolGridFactory::create_default_molgrid(mol, pruning_scheme, + BatchSize(512), RadialQuad::MuraKnowles, AtomicGridSizeDefault::UltraFineGrid); + + // Construct Load Balancer + LoadBalancerFactory lb_factory(ExecutionSpace::Host, "Default"); + auto lb = lb_factory.get_instance(rt, mol, mg, basis); + + + // Get all XC tasks + auto& tasks = lb.get_tasks(); + size_t natoms = mol.size(); + size_t ntask = tasks.size(); + + auto get_xyz_pointer = [](Atom& atom, size_t i_coord) { + switch(i_coord) { + case 0: return &atom.x; // X coordinate + case 1: return &atom.y; // Y coordinate + case 2: return &atom.z; // Z coordinate + default: throw std::out_of_range("Invalid coordinate index"); + } + }; + + // Calculate finite difference derivatives as ref + std::vector> weight_derivatives_ref(ntask); + for(size_t i_task = 0; i_task < ntask; i_task++) { + weight_derivatives_ref[i_task].resize(3 * natoms * tasks[i_task].npts); + } + for( size_t i_atom = 0; i_atom < mol.size(); i_atom++ ) { + for( size_t i_coord = 0; i_coord < 3; i_coord++ ) { + // Create perturbed molecules + Molecule mol_plus = mol; + Molecule mol_minus = mol; + + // Perturb atom coordinates + double* coord_ptr_plus = get_xyz_pointer(mol_plus[i_atom], i_coord); + double* coord_ptr_minus = get_xyz_pointer(mol_minus[i_atom], i_coord); + double delta = fdiff_step; // Use provided finite difference step + *coord_ptr_plus += delta; // Perturb in positive direction + 
*coord_ptr_minus -= delta; // Perturb in negative direction + + // Create metadata for perturbed molecules + MolMeta meta_plus(mol_plus); + MolMeta meta_minus(mol_minus); + + // Compute weights for perturbed geometries + for(size_t itask = 0; itask < ntask; itask++) { + XCTask task_plus = tasks[itask]; + XCTask task_minus = tasks[itask]; + if (i_atom == (size_t)task_plus.iParent) { + for(size_t ipt = 0; ipt < task_plus.npts; ipt++) { + task_plus.points[ipt][i_coord] += delta; + task_minus.points[ipt][i_coord] -= delta; + } + } + task_plus.dist_nearest = meta_plus.dist_nearest()[task_plus.iParent]; + task_minus.dist_nearest = meta_minus.dist_nearest()[task_minus.iParent]; + + // Compute weights for perturbed geometries + compute_weights_task(weight_alg, mol_plus, meta_plus, task_plus); + compute_weights_task(weight_alg, mol_minus, meta_minus, task_minus); + + // Compute centered finite difference + for(size_t ipt = 0; ipt < task_plus.npts; ipt++) { + weight_derivatives_ref[itask][3 * natoms * ipt + 3 * i_atom + i_coord] = + (task_plus.weights[ipt] - task_minus.weights[ipt]) / (2.0 * delta); + } + } + } + } + + + // Test derivatives for all tasks + for(size_t task_idx = 0; task_idx < ntask; task_idx++) { + auto& task = tasks[task_idx]; + + INFO("Testing task " << task_idx << " with " << task.npts << " points"); + + // Create MolMeta + MolMeta meta(mol); // Compute analytical derivatives + std::vector analytical_derivatives(3 * natoms * task.npts); + compute_weights_task(weight_alg, mol, meta, task); + + switch( weight_alg ) { + case XCWeightAlg::Becke: + reference_becke_weights_1st_derivative_host(mol, meta, task, analytical_derivatives.data()); + break; + case XCWeightAlg::SSF: + reference_ssf_weights_1st_derivative_host(mol, meta, task, analytical_derivatives.data()); + break; + default: + GAUXC_GENERIC_EXCEPTION("Weight Alg Not Supported"); + } + + // Compare with numerical derivatives + double max_error = 0.0; + for(size_t ipt = 0; ipt < task.npts; ipt++) { + for(size_t iatom = 0; iatom < natoms; iatom++) { + for(size_t icoord = 0; icoord < 3; icoord++) { + size_t idx = 3 * natoms * ipt + 3 * iatom + icoord; + double error = std::abs(analytical_derivatives[idx] - weight_derivatives_ref[task_idx][idx]); + max_error = std::max(max_error, error); + + INFO("Task " << task_idx << ", Point " << ipt << ", Atom " << iatom << ", Coord " << icoord + << " iParent: " << task.iParent); + INFO("Analytical: " << analytical_derivatives[idx]); + INFO("Numerical: " << weight_derivatives_ref[task_idx][idx]); + INFO("Error: " << error); + + REQUIRE(analytical_derivatives[idx] == Approx(weight_derivatives_ref[task_idx][idx]).margin(fdiff_tolerance)); + + } + } + } + + // Report statistics for this task + INFO("Task " << task_idx << " - Total derivatives tested: " << (task.npts * natoms * 3)); + INFO("Task " << task_idx << " - Maximum error: " << max_error); + } + + +} + + + +// Test function that reads molecule and basis from reference file +void test_weight_1st_deri_host_fdiff_contracted(const std::string& reference_file, XCWeightAlg weight_alg, + PruningScheme pruning_scheme, double fdiff_step, double fdiff_tolerance) { + + // Create runtime environment + auto rt = RuntimeEnvironment(GAUXC_MPI_CODE(MPI_COMM_WORLD)); + Molecule mol; + BasisSet basis; + + // Read molecule and basis from HDF5 reference file + read_hdf5_record(mol, reference_file, "/MOLECULE"); + read_hdf5_record(basis, reference_file, "/BASIS"); + + // Set shell tolerance for numerical stability + for(auto& sh : basis) { + 
sh.set_shell_tolerance(std::numeric_limits::epsilon()); + } + auto mg = MolGridFactory::create_default_molgrid(mol, pruning_scheme, + BatchSize(512), RadialQuad::MuraKnowles, AtomicGridSizeDefault::UltraFineGrid); + + // Construct Load Balancer + LoadBalancerFactory lb_factory(ExecutionSpace::Host, "Default"); + auto lb = lb_factory.get_instance(rt, mol, mg, basis); + + // Get all XC tasks + auto& tasks = lb.get_tasks(); + size_t natoms = mol.size(); + size_t ntask = tasks.size(); + + // Sort tasks on size (XXX: maybe doesnt matter?) + auto task_comparator = []( const XCTask& a, const XCTask& b ) { + return (a.points.size() * a.bfn_screening.nbe) > (b.points.size() * b.bfn_screening.nbe); + }; + std::stable_sort( tasks.begin(), tasks.end(), task_comparator ); + + // generate a random f_eval vector + std::vector> f_evals(ntask); + for(size_t i_task = 0; i_task < ntask; i_task++) { + f_evals[i_task].resize(tasks[i_task].npts); + for(size_t i_pt = 0; i_pt < tasks[i_task].npts; i_pt++) { + f_evals[i_task][i_pt] = static_cast(rand()) / RAND_MAX; // Random value between 0 and 1 + } + } + + + auto get_xyz_pointer = [](Atom& atom, size_t i_coord) { + switch(i_coord) { + case 0: return &atom.x; // X coordinate + case 1: return &atom.y; // Y coordinate + case 2: return &atom.z; // Z coordinate + default: throw std::out_of_range("Invalid coordinate index"); + } + }; + + // Calculate finite difference derivatives as ref + std::vector> exc_grad_w_ref(ntask); + for(size_t i_task = 0; i_task < ntask; i_task++) { + exc_grad_w_ref[i_task].resize(3 * natoms); + } + for( size_t i_atom = 0; i_atom < mol.size(); i_atom++ ) { + for( size_t i_coord = 0; i_coord < 3; i_coord++ ) { + // Create perturbed molecules + Molecule mol_plus = mol; + Molecule mol_minus = mol; + + // Perturb atom coordinates + double* coord_ptr_plus = get_xyz_pointer(mol_plus[i_atom], i_coord); + double* coord_ptr_minus = get_xyz_pointer(mol_minus[i_atom], i_coord); + double delta = fdiff_step; // Use provided finite difference step + *coord_ptr_plus += delta; // Perturb in positive direction + *coord_ptr_minus -= delta; // Perturb in negative direction + + // Create metadata for perturbed molecules + MolMeta meta_plus(mol_plus); + MolMeta meta_minus(mol_minus); + + // Compute weights for perturbed geometries + for(size_t itask = 0; itask < ntask; itask++) { + XCTask task_plus = tasks[itask]; + XCTask task_minus = tasks[itask]; + if (i_atom == (size_t)task_plus.iParent) { + for(size_t ipt = 0; ipt < task_plus.npts; ipt++) { + task_plus.points[ipt][i_coord] += delta; + task_minus.points[ipt][i_coord] -= delta; + } + } + task_plus.dist_nearest = meta_plus.dist_nearest()[task_plus.iParent]; + task_minus.dist_nearest = meta_minus.dist_nearest()[task_minus.iParent]; + + // Compute weights for perturbed geometries + double result_plus = 0.0, result_minus = 0.0; + compute_int(weight_alg, mol_plus, meta_plus, task_plus, f_evals[itask].data(), &result_plus); + compute_int(weight_alg, mol_minus, meta_minus, task_minus, f_evals[itask].data(), &result_minus); + + // Compute centered finite difference + exc_grad_w_ref[itask][3 * i_atom + i_coord] = + (result_plus - result_minus) / (2.0 * delta); + } + } + } + + // Construct Weights Module + MolecularWeightsFactory mw_factory(ExecutionSpace::Host, "Default", MolecularWeightsSettings{weight_alg, false}); + auto mw = mw_factory.get_instance(); + // Apply partition weights + mw.modify_weights(lb); + + // check lb.state().xc_weight_alg() == weight_alg; + REQUIRE(lb.state().weight_alg == weight_alg); + + auto 
lwd = LocalWorkDriverFactory::make_local_work_driver( ExecutionSpace::Host, "Default", LocalWorkSettings() ); + auto* lwd_host = dynamic_cast(lwd.get()); + + // Create MolMeta + MolMeta meta(mol); + + // Test derivatives for all tasks + std::vector> w_times_fs(ntask); + for(size_t task_idx = 0; task_idx < ntask; task_idx++) { + auto& task = tasks[task_idx]; + + INFO("Testing task " << task_idx << " with " << task.npts << " points"); + + auto w_times_f = w_times_fs[task_idx]; + w_times_f.resize(task.npts); + for(size_t i = 0; i < task.npts; i++) { + w_times_f[i] = task.weights[i] * f_evals[task_idx][i]; + } + + // Compute analytical derivatives + std::vector analytical_derivatives(3 * natoms); + lwd_host->eval_weight_1st_deriv_contracted(weight_alg, mol, meta, task, w_times_f.data(), analytical_derivatives.data()); + + // Compare with numerical derivatives + double max_error = 0.0; + for(size_t iatom = 0; iatom < natoms; iatom++) { + for(size_t icoord = 0; icoord < 3; icoord++) { + size_t idx = 3 * iatom + icoord; + double error = std::abs(analytical_derivatives[idx] - exc_grad_w_ref[task_idx][idx]); + max_error = std::max(max_error, error); + + INFO("Task " << task_idx << ", Atom " << iatom << ", Coord " << icoord + << " iParent: " << task.iParent); + INFO("Analytical: " << analytical_derivatives[idx]); + INFO("Numerical: " << exc_grad_w_ref[task_idx][idx]); + INFO("Error: " << error); + + REQUIRE(analytical_derivatives[idx] == Approx(exc_grad_w_ref[task_idx][idx]).margin(fdiff_tolerance)); + + } + } + + // Report statistics for this task + INFO("Task " << task_idx << " - Total derivatives tested: " << (task.npts * natoms * 3)); + INFO("Task " << task_idx << " - Maximum error: " << max_error); + } + + +} + +TEST_CASE("Weights First Derivative uncontracted HOST fidiff", "[weights_fdiff]") { + + + SECTION( "H3 Becke" ) { + test_weight_1st_deri_host_fdiff(GAUXC_REF_DATA_PATH "/h3_blyp_cc-pvdz_ssf_gks.bin", XCWeightAlg::Becke, + PruningScheme::Unpruned, 1.0e-5, 1.0e-6);} + SECTION( "H3 SSF" ) { + test_weight_1st_deri_host_fdiff(GAUXC_REF_DATA_PATH "/h3_blyp_cc-pvdz_ssf_gks.bin", XCWeightAlg::SSF, + PruningScheme::Unpruned, 1.0e-5, 1.0e-6);} + +} + + +TEST_CASE("Weights First Derivative contracted HOST fidiff", "[weights_fdiff]") { + + + SECTION( "H3 Becke" ) { + test_weight_1st_deri_host_fdiff_contracted(GAUXC_REF_DATA_PATH "/h3_blyp_cc-pvdz_ssf_gks.bin", XCWeightAlg::Becke, + PruningScheme::Unpruned, 1.0e-5, 1.0e-6);} + + // SECTION( "Benzene Becke" ) { + // test_weight_1st_deri_host_fdiff_contracted(GAUXC_REF_DATA_PATH "/benzene_svwn5_cc-pvdz_ufg_ssf.hdf5", XCWeightAlg::Becke, + // PruningScheme::Unpruned, 1.0e-5, 1.0e-6);} + + // SECTION( "Cytosine Becke" ) { + // test_weight_1st_deri_host_fdiff_contracted(GAUXC_REF_DATA_PATH "/cytosine_scan_cc-pvdz_ufg_ssf_robust.hdf5", XCWeightAlg::Becke, + // PruningScheme::Unpruned, 1.0e-5, 1.0e-6);} + + + SECTION( "H3 SSF" ) { + test_weight_1st_deri_host_fdiff_contracted(GAUXC_REF_DATA_PATH "/h3_blyp_cc-pvdz_ssf_gks.bin", XCWeightAlg::SSF, + PruningScheme::Unpruned, 1.0e-5, 1.0e-6);} + // SECTION( "Benzene SSF" ) { + // test_weight_1st_deri_host_fdiff_contracted(GAUXC_REF_DATA_PATH "/benzene_svwn5_cc-pvdz_ufg_ssf.hdf5", XCWeightAlg::SSF, + // PruningScheme::Unpruned, 1.0e-5, 1.0e-6);} + + // SECTION( "Cytosine SSF" ) { + // test_weight_1st_deri_host_fdiff_contracted(GAUXC_REF_DATA_PATH "/cytosine_scan_cc-pvdz_ufg_ssf_robust.hdf5", XCWeightAlg::SSF, + // PruningScheme::Unpruned, 1.0e-5, 1.0e-6);} + + +} \ No newline at end of file diff --git 
a/tests/weights.cxx b/tests/weights.cxx index a56df0fc..e9069a52 100644 --- a/tests/weights.cxx +++ b/tests/weights.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/weights_cuda.hpp b/tests/weights_cuda.hpp index bd1561a5..3951cda4 100644 --- a/tests/weights_cuda.hpp +++ b/tests/weights_cuda.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/weights_generate.hpp b/tests/weights_generate.hpp index 7d586a8b..465c0bf8 100644 --- a/tests/weights_generate.hpp +++ b/tests/weights_generate.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/weights_hip.hpp b/tests/weights_hip.hpp index d5ad45f7..478a7556 100644 --- a/tests/weights_hip.hpp +++ b/tests/weights_hip.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/weights_host.hpp b/tests/weights_host.hpp index e78694d8..f9c51417 100644 --- a/tests/weights_host.hpp +++ b/tests/weights_host.hpp @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. * * See LICENSE.txt for details */ diff --git a/tests/xc_integrator.cxx b/tests/xc_integrator.cxx index 88527fa8..947a9914 100644 --- a/tests/xc_integrator.cxx +++ b/tests/xc_integrator.cxx @@ -1,7 +1,11 @@ /** * GauXC Copyright (c) 2020-2024, The Regents of the University of California, * through Lawrence Berkeley National Laboratory (subject to receipt of - * any required approvals from the U.S. Dept. of Energy). All rights reserved. + * any required approvals from the U.S. Dept. of Energy). + * + * (c) 2024-2025, Microsoft Corporation + * + * All rights reserved. 
* * See LICENSE.txt for details */ @@ -37,8 +41,8 @@ void test_xc_integrator( ExecutionSpace ex, const RuntimeEnvironment& rt, BasisSet basis; matrix_type P, Pz, Py, Px, VXC_ref, VXCz_ref, VXCy_ref, VXCx_ref, K_ref; double EXC_ref; - std::vector EXC_GRAD_ref; - bool has_k = false, has_exc_grad = false, rks = true, uks = false, gks = false; + std::vector EXC_GRAD_ref_HellFey, EXC_GRAD_ref_Full; + bool has_k = false, has_exc_grad_HellFey = false, has_exc_grad_full = false, rks = true, uks = false, gks = false; { read_hdf5_record( mol, reference_file, "/MOLECULE" ); read_hdf5_record( basis, reference_file, "/BASIS" ); @@ -110,11 +114,40 @@ void test_xc_integrator( ExecutionSpace ex, const RuntimeEnvironment& rt, dset = file.getDataSet("/EXC"); dset.read( &EXC_ref ); - has_exc_grad = file.exist("/EXC_GRAD"); - if( has_exc_grad ) { - EXC_GRAD_ref.resize( 3*mol.size() ); + // Check for new unified /EXC_GRAD dataset with attribute + if( file.exist("/EXC_GRAD") ) { dset = file.getDataSet("/EXC_GRAD"); - dset.read( EXC_GRAD_ref.data() ); + EXC_GRAD_ref_Full.resize( 3*mol.size() ); + + // Check for attribute indicating whether weight derivatives are included + bool exc_grad_includes_weight_derivatives = false; // Default to Hellmann-Feynman + try { + auto attr = dset.getAttribute("includes_weight_derivatives"); + int attr_value; + attr.read( attr_value ); + exc_grad_includes_weight_derivatives = (attr_value != 0); + } catch(... ) { } + + if( exc_grad_includes_weight_derivatives ) { + dset.read( EXC_GRAD_ref_Full.data() ); + has_exc_grad_full = true; + } else { + dset.read( EXC_GRAD_ref_HellFey.data() ); + has_exc_grad_HellFey = true; + } + } + // Check for other type of EXC_GRAD + if( file.exist("/EXC_GRAD_HELLFEY") and not has_exc_grad_HellFey ) { + EXC_GRAD_ref_HellFey.resize( 3*mol.size() ); + dset = file.getDataSet("/EXC_GRAD_HELLFEY"); + dset.read( EXC_GRAD_ref_HellFey.data() ); + has_exc_grad_HellFey = true; + } + if( file.exist("/EXC_GRAD_FULL") and not has_exc_grad_full ) { + EXC_GRAD_ref_Full.resize( 3*mol.size() ); + dset = file.getDataSet("/EXC_GRAD_FULL"); + dset.read( EXC_GRAD_ref_Full.data() ); + has_exc_grad_full = true; } has_k = file.exist("/K"); @@ -125,7 +158,7 @@ void test_xc_integrator( ExecutionSpace ex, const RuntimeEnvironment& rt, } } - if( (uks or gks) and ex == ExecutionSpace::Device and func.is_mgga() ) return; + if( gks and ex == ExecutionSpace::Device and func.is_mgga() ) return; for( auto& sh : basis ) sh.set_shell_tolerance( std::numeric_limits::epsilon() ); @@ -240,14 +273,29 @@ void test_xc_integrator( ExecutionSpace ex, const RuntimeEnvironment& rt, // Check EXC Grad - if( check_grad and has_exc_grad and rks) { - auto EXC_GRAD = integrator.eval_exc_grad( P ); + if( check_grad and has_exc_grad_full ) { + IntegratorSettingsEXC_GRAD exc_grad_settings; + exc_grad_settings.include_weight_derivatives = true; // Use full gradient (default) + auto EXC_GRAD = rks ? 
integrator.eval_exc_grad( P, exc_grad_settings ) : integrator.eval_exc_grad( P, Pz, exc_grad_settings ); using map_type = Eigen::Map; - map_type EXC_GRAD_ref_map( EXC_GRAD_ref.data(), mol.size(), 3 ); + map_type EXC_GRAD_ref_map( EXC_GRAD_ref_Full.data(), mol.size(), 3 ); map_type EXC_GRAD_map( EXC_GRAD.data(), mol.size(), 3 ); auto EXC_GRAD_diff_nrm = (EXC_GRAD_ref_map - EXC_GRAD_map).norm(); - CHECK( EXC_GRAD_diff_nrm / std::sqrt(3.0*mol.size()) < 1e-10 ); + INFO("comparing full gradient"); + CHECK( EXC_GRAD_diff_nrm / std::sqrt(3.0*mol.size()) < 1e-8 ); } + if( check_grad and has_exc_grad_HellFey ) { + IntegratorSettingsEXC_GRAD exc_grad_settings; + exc_grad_settings.include_weight_derivatives = false; // Use Hellmann-Feynman gradient + auto EXC_GRAD = rks ? integrator.eval_exc_grad( P, exc_grad_settings ) : integrator.eval_exc_grad( P, Pz, exc_grad_settings ); + using map_type = Eigen::Map; + map_type EXC_GRAD_ref_map( EXC_GRAD_ref_HellFey.data(), mol.size(), 3 ); + map_type EXC_GRAD_map( EXC_GRAD.data(), mol.size(), 3 ); + auto EXC_GRAD_diff_nrm = (EXC_GRAD_ref_map - EXC_GRAD_map).norm(); + INFO("comparing Hellmann-Feynman gradient"); + CHECK( EXC_GRAD_diff_nrm / std::sqrt(3.0*mol.size()) < 1e-8 ); + } + // Check K if( has_k and check_k and rks ) { @@ -311,12 +359,10 @@ void test_integrator(std::string reference_file, functional_type& func, PruningS #ifdef GAUXC_HAS_CUTLASS SECTION( "Incore - MPI Reduction - CUTLASS" ) { - if(not func.is_mgga() and not func.is_polarized()) { - test_xc_integrator( ExecutionSpace::Device, rt, - reference_file, func, pruning_scheme, - false, true, false, "Default", "Default", - "Scheme1-CUTLASS" ); - } + test_xc_integrator( ExecutionSpace::Device, rt, + reference_file, func, pruning_scheme, + true, true, false, "Default", "Default", + "Scheme1-CUTLASS" ); } #endif @@ -329,11 +375,11 @@ void test_integrator(std::string reference_file, functional_type& func, PruningS } #endif - SECTION( "ShellBatched" ) { - test_xc_integrator( ExecutionSpace::Device, rt, - reference_file, func, pruning_scheme, - false, false, false, "ShellBatched" ); - } + // SECTION( "ShellBatched" ) { + // test_xc_integrator( ExecutionSpace::Device, rt, + // reference_file, func, pruning_scheme, + // false, false, false, "ShellBatched" ); + // } } #endif @@ -353,6 +399,7 @@ TEST_CASE( "XC Integrator", "[xc-integrator]" ) { auto blyp = ExchCXX::Functional::BLYP; auto scan = ExchCXX::Functional::SCAN; auto r2scanl = ExchCXX::Functional::R2SCANL; + auto m062x = ExchCXX::Functional::M062X; // LDA Test SECTION( "Benzene / SVWN5 / cc-pVDZ" ) { @@ -384,6 +431,12 @@ TEST_CASE( "XC Integrator", "[xc-integrator]" ) { test_integrator(GAUXC_REF_DATA_PATH "/cytosine_scan_cc-pvdz_ufg_ssf_robust.hdf5", func, PruningScheme::Robust ); } + // This tests gradients + SECTION( "Benzene / M06-2X / def2-svp") { + auto func = make_functional(m062x, unpol); + test_integrator(GAUXC_REF_DATA_PATH "/benzene_m062x_def2-svp_ufg_ssf.hdf5", + func, PruningScheme::Unpruned ); + } // MGGA Test (TAU + LAPL) SECTION( "Cytosine / R2SCANL / cc-pVDZ") { @@ -398,6 +451,12 @@ TEST_CASE( "XC Integrator", "[xc-integrator]" ) { test_integrator(GAUXC_REF_DATA_PATH "/li_svwn5_sto3g_uks.bin", func, PruningScheme::Unpruned ); } + // + grad + SECTION( "Cytosine (doublet) / SVWN5 / cc-pVDZ") { + auto func = make_functional(svwn5, pol); + test_integrator(GAUXC_REF_DATA_PATH "/cytosine_svwn5_cc-pvdz_ufg_ssf_robust_uks.hdf5", + func, PruningScheme::Robust ); + } //UKS GGA Test SECTION( "Li / BLYP / sto-3g" ) { @@ -405,6 +464,12 @@ 
TEST_CASE( "XC Integrator", "[xc-integrator]" ) { test_integrator(GAUXC_REF_DATA_PATH "/li_blyp_sto3g_uks.bin", func, PruningScheme::Unpruned ); } + // + grad + SECTION( "Cytosine (doublet) / BLYP / cc-pVDZ") { + auto func = make_functional(blyp, pol); + test_integrator(GAUXC_REF_DATA_PATH "/cytosine_blyp_cc-pvdz_ufg_ssf_robust_uks.hdf5", + func, PruningScheme::Robust ); + } // UKS MGGA Test (TAU Only) SECTION( "Cytosine (doublet) / SCAN / cc-pVDZ") {