uxlfoundation
diff --git a/‎.ci/env/apt.sh‎
Lines changed: 3 additions & 3 deletions b/‎.ci/env/apt.sh‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎.ci/pipeline/ci.yml‎
Lines changed: 4 additions & 1 deletion b/‎.ci/pipeline/ci.yml‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎.github/workflows/nightly-build.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/nightly-build.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎INSTALL.md‎
Lines changed: 5 additions & 5 deletions b/‎INSTALL.md‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎cpp/daal/src/algorithms/cosdistance/cosdistance_batch_impl.i‎
Lines changed: 13 additions & 1 deletion b/‎cpp/daal/src/algorithms/cosdistance/cosdistance_batch_impl.i‎
Lines changed: 13 additions & 1 deletion
diff --git a/‎cpp/daal/src/algorithms/cosdistance/cosdistance_dense_default_batch_fpt_cpu.cpp‎
Lines changed: 1 addition & 1 deletion b/‎cpp/daal/src/algorithms/cosdistance/cosdistance_dense_default_batch_fpt_cpu.cpp‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎cpp/daal/src/algorithms/cosdistance/cosdistance_full_impl.i‎
Lines changed: 83 additions & 0 deletions b/‎cpp/daal/src/algorithms/cosdistance/cosdistance_full_impl.i‎
Lines changed: 83 additions & 0 deletions
diff --git a/‎cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_train_container.h‎
Lines changed: 1 addition & 0 deletions b/‎cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_train_container.h‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎cpp/daal/src/algorithms/kernel_function/kernel_function_rbf_helper.h‎
Lines changed: 13 additions & 2 deletions b/‎cpp/daal/src/algorithms/kernel_function/kernel_function_rbf_helper.h‎
Lines changed: 13 additions & 2 deletions
diff --git a/‎cpp/daal/src/algorithms/kmeans/kmeans_lloyd_impl.i‎
Lines changed: 4 additions & 2 deletions b/‎cpp/daal/src/algorithms/kmeans/kmeans_lloyd_impl.i‎
Lines changed: 4 additions & 2 deletions
@@ -31,19 +31,19 @@ function add_repo {
 }
 
 function install_dpcpp {
-    sudo apt-get install -y intel-oneapi-compiler-dpcpp-cpp-2025.1 intel-oneapi-runtime-libs
+    sudo apt-get install -y intel-oneapi-compiler-dpcpp-cpp-2025.2 intel-oneapi-runtime-libs
 }
 
 function install_tbb {
-    sudo apt-get install -y intel-oneapi-tbb-devel-2022.1
+    sudo apt-get install -y intel-oneapi-tbb-devel-2022.2
 }
 
 function install_dpl {
     sudo apt-get install -y intel-oneapi-libdpstd-devel
 }
 
 function install_mkl {
-    sudo apt-get install -y intel-oneapi-mkl-devel-2025.1
+    sudo apt-get install -y intel-oneapi-mkl-devel-2025.2
     install_tbb
     install_dpl
 }
 
@@ -31,7 +31,10 @@ variables:
   SYSROOT_OS: 'noble'
   PY_VERSION: '3.11'
   SKL_VERSION: '1.5'
-  WINDOWS_BASEKIT_URL: 'https://registrationcenter-download.intel.com/akdlm/IRC_NAS/487fd8c3-a3d8-4c22-a903-f8d54c2c57be/intel-oneapi-base-toolkit-2025.1.0.650_offline.exe'
+  # Link to latest version can be taken from basekit download page:
+  # https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html
+  # Check section 'Install through a Command Line':
+  WINDOWS_BASEKIT_URL: 'https://registrationcenter-download.intel.com/akdlm/IRC_NAS/09a8acaf-265f-4460-866c-a3375ed5b4ff/intel-oneapi-base-toolkit-2025.2.0.591_offline.exe'
   WINDOWS_DPCPP_COMPONENTS: 'intel.oneapi.win.mkl.devel:intel.oneapi.win.tbb.devel:intel.oneapi.win.dpl'
 
 resources:
 
@@ -50,7 +50,7 @@ jobs:
   build_lnx:
     name: oneDAL Linux nightly build
     if: github.repository == 'uxlfoundation/oneDAL'
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-24.04
     timeout-minutes: 120
 
     steps:
 
@@ -123,31 +123,31 @@ is available as an alternative to the manual setup.
 
 7. Download and install Python (version 3.9 or higher).
 
-8. Build oneDAL via command-line interface. Choose the appropriate commands based on the interface, platform, compiler and the optimization level you use. Interface and platform are required arguments of makefile while others are optional. Below you can find the set of examples for building oneDAL. You may use a combination of them to get the desired build configuration:
+8. Build oneDAL via command-line interface. Choose the appropriate commands based on the interface, platform, compiler, linker, and optimization level you use. Interface and platform are required arguments of the makefile, while the others are optional. Below you can find a set of examples for building oneDAL. You may use a combination of them to get the desired build configuration:
 
     - DAAL interfaces on **Linux\*** using **Intel(R) C++ Compiler**:
 
             make -f makefile daal PLAT=lnx32e
 
     - DAAL interfaces on **Linux\*** using **GNU Compiler Collection\***:
 
-            make -f makefile daal PLAT=lnx32e COMPILER=gnu OPTLEVEL=O0
+            make -f makefile daal PLAT=lnx32e COMPILER=gnu OPTLEVEL=O0 LINKER=bfd
 
     - DAAL interfaces on **Linux\*** using **Clang\***:
 
-            make -f makefile daal PLAT=lnx32e COMPILER=clang OPTLEVEL=O1
+            make -f makefile daal PLAT=lnx32e COMPILER=clang OPTLEVEL=O1 LINKER=gold
 
     - oneAPI C++/DPC++ interfaces on **Windows\*** using **Intel(R) DPC++ compiler**:
 
-            make -f makefile oneapi PLAT=win32e
+            make -f makefile oneapi PLAT=win32e LINKER=llvm-lib
 
     - oneAPI C++ interfaces on **Windows\*** using **Microsoft Visual\* C++ Compiler**:
 
             make -f makefile oneapi_c PLAT=win32e COMPILER=vc OPTLEVEL=O2
 
     - DAAL and oneAPI C++ interfaces on **Linux\*** using **GNU Compiler Collection\***:
 
-            make -f makefile daal oneapi_c PLAT=lnx32e COMPILER=gnu OPTLEVEL=O3
+            make -f makefile daal oneapi_c PLAT=lnx32e COMPILER=gnu OPTLEVEL=O3 LINKER=lld
 
 It is possible to build oneDAL libraries with selected set of algorithms and/or CPU optimizations. `CORE.ALGORITHMS.CUSTOM` and `REQCPUS` makefile defines are used for it.
 
 
@@ -63,7 +63,19 @@ services::Status DistanceKernel<algorithmFPType, method, cpu>::compute(const siz
 
     if (isFull<algorithmFPType, cpu>(rLayout))
     {
-        return cosDistanceFull<algorithmFPType, cpu>(xTable, rTable);
+        if (na == 1)
+        {
+            return cosDistanceFull<algorithmFPType, cpu>(xTable, rTable);
+        }
+        else if (na == 2)
+        {
+            NumericTable * yTable = const_cast<NumericTable *>(a[1]); /* y Input data */
+            return cosDistanceFull<algorithmFPType, cpu>(xTable, yTable, rTable);
+        }
+        else
+        {
+            return services::Status(services::ErrorIncorrectNumberOfInputNumericTables);
+        }
     }
     else
     {
 
@@ -38,7 +38,7 @@ template class BatchContainer<DAAL_FPTYPE, defaultDense, DAAL_CPU>;
 }
 namespace internal
 {
-template class DistanceKernel<DAAL_FPTYPE, defaultDense, DAAL_CPU>;
+template class DAAL_EXPORT DistanceKernel<DAAL_FPTYPE, defaultDense, DAAL_CPU>;
 
 } // namespace internal
 
 
@@ -234,6 +234,89 @@ services::Status cosDistanceFull(const NumericTable * xTable, NumericTable * rTa
     return safeStat.detach();
 }
 
+template <typename algorithmFPType, CpuType cpu> /* Two-table overload: writes 1 - cos(x_i, y_j) into rTable for every row x_i of xTable and row y_j of yTable */
+services::Status cosDistanceFull(const NumericTable * xTable, const NumericTable * yTable, NumericTable * rTable)
+{
+    size_t p         = xTable->getNumberOfColumns(); /* Dimension of input feature vector */
+    size_t nVectors1 = xTable->getNumberOfRows();    /* Number of input vectors in X */
+    size_t nVectors2 = yTable->getNumberOfRows();    /* Number of input vectors in Y */
+    /* Number of row blocks in X: integer division, plus one when a partial last block remains */
+    size_t nBlocks1 = nVectors1 / blockSizeDefault;
+    nBlocks1 += (nBlocks1 * blockSizeDefault != nVectors1);
+    /* Same rounding-up for the row blocks of Y */
+    size_t nBlocks2 = nVectors2 / blockSizeDefault;
+    nBlocks2 += (nBlocks2 * blockSizeDefault != nVectors2);
+
+    SafeStatus safeStat;
+
+    /* compute results for blocks of the distance matrix: the lambda is invoked once per row block k1 of X */
+    daal::threader_for(nBlocks1, nBlocks1, [=, &safeStat](size_t k1) {
+        DAAL_INT blockSize1 = blockSizeDefault;
+        if (k1 == nBlocks1 - 1) /* the last block takes whatever rows remain */
+        {
+            blockSize1 = nVectors1 - k1 * blockSizeDefault;
+        }
+
+        /* read access to blockSize1 rows in input dataset X at k1*blockSizeDefault*p row */
+        ReadRows<algorithmFPType, cpu> xBlock(*const_cast<NumericTable *>(xTable), k1 * blockSizeDefault, blockSize1);
+        DAAL_CHECK_BLOCK_STATUS_THR(xBlock); /* NOTE(review): presumably records the failure in safeStat and leaves the lambda; confirm against the macro definition */
+        const algorithmFPType * x = xBlock.get();
+
+        /* write access to blockSize1 rows in output dataset */
+        WriteOnlyRows<algorithmFPType, cpu> rBlock(rTable, k1 * blockSizeDefault, blockSize1);
+        DAAL_CHECK_BLOCK_STATUS_THR(rBlock);
+        algorithmFPType * r = rBlock.get();
+        /* Row blocks of Y are visited one after another inside the current X block */
+        for (size_t k2 = 0; k2 < nBlocks2; k2++)
+        {
+            DAAL_INT blockSize2 = blockSizeDefault;
+            if (k2 == nBlocks2 - 1) /* the last block takes whatever rows remain */
+            {
+                blockSize2 = nVectors2 - k2 * blockSizeDefault;
+            }
+
+            size_t shift2 = k2 * blockSizeDefault; /* index of the first Y row in this block; also the column offset into each output row */
+
+            /* read access to blockSize2 rows in input dataset Y */
+            ReadRows<algorithmFPType, cpu> yBlock(*const_cast<NumericTable *>(yTable), shift2, blockSize2);
+            DAAL_CHECK_BLOCK_STATUS_THR(yBlock);
+            const algorithmFPType * y = yBlock.get();
+            /* NOTE(review): y is indexed with stride p below, i.e. yTable is assumed to have the same column count as xTable; not checked in this function */
+            for (size_t i = 0; i < blockSize1; i++)
+            {
+                for (size_t j = 0; j < blockSize2; j++)
+                {
+                    algorithmFPType numerator = 0.0;
+                    algorithmFPType xNorm     = 0.0;
+                    algorithmFPType yNorm     = 0.0;
+                    /* One pass over the p features accumulates the dot product and both squared norms; the norms are recomputed for every (i, j) pair */
+                    for (size_t k = 0; k < p; k++)
+                    {
+                        numerator += x[i * p + k] * y[j * p + k];
+                        xNorm += x[i * p + k] * x[i * p + k];
+                        yNorm += y[j * p + k] * y[j * p + k];
+                    }
+                    /* Output index: row i of this block, column shift2 + j; NOTE(review): assumes each rTable row holds nVectors2 values, confirm */
+                    algorithmFPType denominator = xNorm * yNorm; /* product of the squared norms, used only for the positivity test */
+                    if (denominator > 0.0)
+                    {
+                        r[i * nVectors2 + shift2 + j] = 1.0
+                                                        - numerator
+                                                              / (daal::internal::MathInst<algorithmFPType, cpu>::sSqrt(xNorm)
+                                                                 * daal::internal::MathInst<algorithmFPType, cpu>::sSqrt(yNorm));
+                    }
+                    else
+                    {
+                        r[i * nVectors2 + shift2 + j] = 1.0; // Norm product is not positive (e.g. an all-zero row): cosine is undefined, distance is set to 1.0
+                    }
+                }
+            }
+        }
+    });
+
+    return safeStat.detach();
+}
+
 } // namespace internal
 
 } // namespace cosine_distance
 
@@ -64,6 +64,7 @@ services::Status BatchContainer<algorithmFpType, method, cpu>::compute()
 
     const bool copy = (par->dataUseInModel == doNotUse);
     status |= r->impl()->setData<algorithmFpType>(x, copy);
+    DAAL_CHECK_STATUS_VAR(status);
     if ((par->resultsToEvaluate & daal::algorithms::classifier::computeClassLabels) != 0)
     {
         const NumericTablePtr y = input->get(classifier::training::labels);
 
@@ -159,6 +159,7 @@ inline services::Status HelperKernelRBF<double, sve>::postGemmPart(double * cons
 
     return services::Status();
 }
+
 //SVE implementation for RBF kernel post-GEMM part float data type
 template <>
 inline services::Status HelperKernelRBF<float, sve>::postGemmPart(float * const mklBuff, const float * const sqrA1i, const float sqrA2i,
@@ -185,6 +186,9 @@ inline services::Status HelperKernelRBF<float, sve>::postGemmPart(float * const
         tmp           = svsel_f32(mask, tmp, thresholdVec);
         svst1(pg, &mklBuff[i], tmp);
 
+        svfloat32_t expVal = daal::internal::ref::exp_vectorized(tmp);
+        svst1(pg, &dataRBlock[i], expVal);
+
         // Block 2
         mklVec = svld1(pg, &mklBuff[i + step]);
         sqrVec = svld1(pg, &sqrA1i[i + step]);
@@ -195,6 +199,9 @@ inline services::Status HelperKernelRBF<float, sve>::postGemmPart(float * const
         tmp  = svsel_f32(mask, tmp, thresholdVec);
         svst1(pg, &mklBuff[i + step], tmp);
 
+        expVal = daal::internal::ref::exp_vectorized(tmp);
+        svst1(pg, &dataRBlock[i + step], expVal);
+
         // Block 3
         mklVec = svld1(pg, &mklBuff[i + 2 * step]);
         sqrVec = svld1(pg, &sqrA1i[i + 2 * step]);
@@ -204,6 +211,9 @@ inline services::Status HelperKernelRBF<float, sve>::postGemmPart(float * const
         mask = svcmpgt_f32(pg, tmp, thresholdVec);
         tmp  = svsel_f32(mask, tmp, thresholdVec);
         svst1(pg, &mklBuff[i + 2 * step], tmp);
+
+        expVal = daal::internal::ref::exp_vectorized(tmp);
+        svst1(pg, &dataRBlock[i + 2 * step], expVal);
     }
 
     // Tail loop
@@ -218,9 +228,10 @@ inline services::Status HelperKernelRBF<float, sve>::postGemmPart(float * const
         svbool_t mask = svcmpgt_f32(tail_pg, tmp, thresholdVec);
         tmp           = svsel_f32(mask, tmp, thresholdVec);
         svst1(tail_pg, &mklBuff[i], tmp);
+
+        svfloat32_t expVal = daal::internal::ref::exp_vectorized(tmp);
+        svst1(tail_pg, &dataRBlock[i], expVal);
     }
-    //exponential function
-    MathInst<float, sve>::vExp(n, mklBuff, dataRBlock);
 
     return services::Status();
 }
 
@@ -302,7 +302,8 @@ Status TaskKMeansLloyd<algorithmFPType, cpu>::addNTToTaskThreadedCSR(const Numer
         SpBlasInst<algorithmFPType, cpu>::xxcsrmm(&transa, &_n, &_c, &_p, &alpha, matdescra, data, (DAAL_INT *)colIdx, (DAAL_INT *)rowIdx, inClusters,
                                                   &_p, &beta, x_clusters, &_n);
 
-        size_t csrCursor = 0;
+        algorithmFPType goal = 0;
+        size_t csrCursor     = 0;
         for (size_t i = 0; i < blockSize; i++)
         {
             algorithmFPType minGoalVal = clustersSq[0] - x_clusters[i];
@@ -329,7 +330,7 @@ Status TaskKMeansLloyd<algorithmFPType, cpu>::addNTToTaskThreadedCSR(const Numer
 
             kmeansInsertCandidate(tt, minGoalVal, k * blockSizeDefault + i);
 
-            *trg += minGoalVal;
+            goal += minGoalVal;
 
             cS0[minIdx]++;
 
@@ -339,6 +340,7 @@ Status TaskKMeansLloyd<algorithmFPType, cpu>::addNTToTaskThreadedCSR(const Numer
                 assignments[i] = (int)minIdx;
             }
         }
+        *trg += goal;
     });
     return safeStat.detach();
 }
Original file line number	Diff line number	Diff line change
`@@ -31,19 +31,19 @@ function add_repo {`
`31`	`31`	`}`
`32`	`32`
`33`	`33`	`function install_dpcpp {`
`34`		`- sudo apt-get install -y intel-oneapi-compiler-dpcpp-cpp-2025.1 intel-oneapi-runtime-libs`
	`34`	`+ sudo apt-get install -y intel-oneapi-compiler-dpcpp-cpp-2025.2 intel-oneapi-runtime-libs`
`35`	`35`	`}`
`36`	`36`
`37`	`37`	`function install_tbb {`
`38`		`- sudo apt-get install -y intel-oneapi-tbb-devel-2022.1`
	`38`	`+ sudo apt-get install -y intel-oneapi-tbb-devel-2022.2`
`39`	`39`	`}`
`40`	`40`
`41`	`41`	`function install_dpl {`
`42`	`42`	`sudo apt-get install -y intel-oneapi-libdpstd-devel`
`43`	`43`	`}`
`44`	`44`
`45`	`45`	`function install_mkl {`
`46`		`- sudo apt-get install -y intel-oneapi-mkl-devel-2025.1`
	`46`	`+ sudo apt-get install -y intel-oneapi-mkl-devel-2025.2`
`47`	`47`	`install_tbb`
`48`	`48`	`install_dpl`
`49`	`49`	`}`
Original file line number	Diff line number	Diff line change
`@@ -38,7 +38,7 @@ template class BatchContainer<DAAL_FPTYPE, defaultDense, DAAL_CPU>;`
`38`	`38`	`}`
`39`	`39`	`namespace internal`
`40`	`40`	`{`
`41`		`-template class DistanceKernel<DAAL_FPTYPE, defaultDense, DAAL_CPU>;`
	`41`	`+template class DAAL_EXPORT DistanceKernel<DAAL_FPTYPE, defaultDense, DAAL_CPU>;`
`42`	`42`
`43`	`43`	`} // namespace internal`
`44`	`44`
Original file line number	Diff line number	Diff line change
`@@ -64,6 +64,7 @@ services::Status BatchContainer<algorithmFpType, method, cpu>::compute()`
`64`	`64`
`65`	`65`	`const bool copy = (par->dataUseInModel == doNotUse);`
`66`	`66`	`status \|= r->impl()->setData<algorithmFpType>(x, copy);`
	`67`	`+ DAAL_CHECK_STATUS_VAR(status);`
`67`	`68`	`if ((par->resultsToEvaluate & daal::algorithms::classifier::computeClassLabels) != 0)`
`68`	`69`	`{`
`69`	`70`	`const NumericTablePtr y = input->get(classifier::training::labels);`
Original file line number	Diff line number	Diff line change
`@@ -302,7 +302,8 @@ Status TaskKMeansLloyd<algorithmFPType, cpu>::addNTToTaskThreadedCSR(const Numer`
`302`	`302`	`SpBlasInst<algorithmFPType, cpu>::xxcsrmm(&transa, &_n, &_c, &_p, &alpha, matdescra, data, (DAAL_INT )colIdx, (DAAL_INT )rowIdx, inClusters,`
`303`	`303`	`&_p, &beta, x_clusters, &_n);`
`304`	`304`
`305`		`- size_t csrCursor = 0;`
	`305`	`+ algorithmFPType goal = 0;`
	`306`	`+ size_t csrCursor = 0;`
`306`	`307`	`for (size_t i = 0; i < blockSize; i++)`
`307`	`308`	`{`
`308`	`309`	`algorithmFPType minGoalVal = clustersSq[0] - x_clusters[i];`
`@@ -329,7 +330,7 @@ Status TaskKMeansLloyd<algorithmFPType, cpu>::addNTToTaskThreadedCSR(const Numer`
`329`	`330`
`330`	`331`	`kmeansInsertCandidate(tt, minGoalVal, k * blockSizeDefault + i);`
`331`	`332`
`332`		`- *trg += minGoalVal;`
	`333`	`+ goal += minGoalVal;`
`333`	`334`
`334`	`335`	`cS0[minIdx]++;`
`335`	`336`
`@@ -339,6 +340,7 @@ Status TaskKMeansLloyd<algorithmFPType, cpu>::addNTToTaskThreadedCSR(const Numer`
`339`	`340`	`assignments[i] = (int)minIdx;`
`340`	`341`	`}`
`341`	`342`	`}`
	`343`	`+ *trg += goal;`
`342`	`344`	`});`
`343`	`345`	`return safeStat.detach();`
`344`	`346`	`}`