From a312e5f239ce59ab504962f22e94e99260e7a457 Mon Sep 17 00:00:00 2001 From: Giorgio <122452088+GiorgioMB@users.noreply.github.com> Date: Fri, 3 May 2024 14:28:57 +0200 Subject: [PATCH 1/3] Delete packedForest/src/forestTypes/unsupervisedForests directory --- .../baseUnprocessedNodeUnsupervised.h | 97 ------- ...stratifiedInNodeClassIndicesUnsupervised.h | 217 --------------- .../unsupervisedForests/urerf/fpURerFBase.h | 142 ---------- .../unsupervisedForests/urerf/splitURerF.h | 125 --------- .../urerf/splitURerFInfo.h | 27 -- .../urerf/unprocessedURerFNode.h | 198 -------------- .../unsupervisedForests/urerf/urerfTree.h | 253 ------------------ .../unsupervisedForests/urf/fpURFBase.h | 154 ----------- .../unsupervisedForests/urf/splitURF.h | 125 --------- .../unsupervisedForests/urf/splitURFInfo.h | 29 -- .../urf/unprocessedURFNode.h | 185 ------------- .../unsupervisedForests/urf/urfTree.h | 252 ----------------- 12 files changed, 1804 deletions(-) delete mode 100644 packedForest/src/forestTypes/unsupervisedForests/baseUnprocessedNodeUnsupervised.h delete mode 100644 packedForest/src/forestTypes/unsupervisedForests/stratifiedInNodeClassIndicesUnsupervised.h delete mode 100644 packedForest/src/forestTypes/unsupervisedForests/urerf/fpURerFBase.h delete mode 100644 packedForest/src/forestTypes/unsupervisedForests/urerf/splitURerF.h delete mode 100644 packedForest/src/forestTypes/unsupervisedForests/urerf/splitURerFInfo.h delete mode 100644 packedForest/src/forestTypes/unsupervisedForests/urerf/unprocessedURerFNode.h delete mode 100644 packedForest/src/forestTypes/unsupervisedForests/urerf/urerfTree.h delete mode 100644 packedForest/src/forestTypes/unsupervisedForests/urf/fpURFBase.h delete mode 100644 packedForest/src/forestTypes/unsupervisedForests/urf/splitURF.h delete mode 100644 packedForest/src/forestTypes/unsupervisedForests/urf/splitURFInfo.h delete mode 100644 packedForest/src/forestTypes/unsupervisedForests/urf/unprocessedURFNode.h delete mode 100644 packedForest/src/forestTypes/unsupervisedForests/urf/urfTree.h diff --git a/packedForest/src/forestTypes/unsupervisedForests/baseUnprocessedNodeUnsupervised.h b/packedForest/src/forestTypes/unsupervisedForests/baseUnprocessedNodeUnsupervised.h deleted file mode 100644 index aed52129..00000000 --- a/packedForest/src/forestTypes/unsupervisedForests/baseUnprocessedNodeUnsupervised.h +++ /dev/null @@ -1,97 +0,0 @@ -#ifndef baseUnprocessedNodeUnsupervised_h -#define baseUnprocessedNodeUnsupervised_h -#include "stratifiedInNodeClassIndicesUnsupervised.h" -#include - -namespace fp{ - - - template // - class baseUnprocessedNodeUnsupervised{ - protected: - int parentID; - int depth; - double nodeImpurity; //lower impurity is better - bool isLeftNode; //in order to set parent node with location - - - stratifiedInNodeClassIndicesUnsupervised* obsIndices; - stratifiedInNodeClassIndicesUnsupervised* leftIndices; - stratifiedInNodeClassIndicesUnsupervised* rightIndices; - - std::vector featureHolder; - - public: - baseUnprocessedNodeUnsupervised(int numObsForRoot): parentID(0), depth(0), isLeftNode(true){ - obsIndices = new stratifiedInNodeClassIndicesUnsupervised(numObsForRoot); - } - - baseUnprocessedNodeUnsupervised(int parentID, int dep, bool isLeft): parentID(parentID), depth(dep), isLeftNode(isLeft){} - - virtual ~baseUnprocessedNodeUnsupervised(){} - - - inline stratifiedInNodeClassIndicesUnsupervised* returnLeftIndices(){ - return leftIndices; - } - - inline stratifiedInNodeClassIndicesUnsupervised* returnRightIndices(){ - return rightIndices; - } - - inline stratifiedInNodeClassIndicesUnsupervised* returnObsIndices(){ - return obsIndices; - } - - inline int returnParentID(){ - return parentID; - } - - inline int returnDepth(){ - return depth; - } - inline double returnNodeImpurity(){ - return nodeImpurity; - } - - inline void setNodeImpurity(double nodeImp){ - nodeImpurity = nodeImp; - } - - inline bool isNodePure(){ - return nodeImpurity == 0; - } - - inline bool returnIsLeftNode(){ - return isLeftNode; - } - - inline int returnInSampleSize(){ - return obsIndices->returnInSampleSize(); - } - - inline int returnOutSampleSize(){ - return obsIndices->returnOutSampleSize(); - } - - inline void setHolderSizes(){ - obsIndices->initializeBinnedSamples(); - if(obsIndices->useBin()){ - featureHolder.resize(obsIndices->returnBinnedSize()); - }else{ - featureHolder.resize(obsIndices->returnInSampleSize()); - } - } - - inline float calculateNodeImpurity(){ - return obsIndices->returnImpurity(); - } - - inline void loadIndices(stratifiedInNodeClassIndicesUnsupervised* indices){ - obsIndices = indices; - } - - - }; //unprocessedNode.h -}//namespace fp -#endif //baseUnprocessedNodeUnsupervised_h diff --git a/packedForest/src/forestTypes/unsupervisedForests/stratifiedInNodeClassIndicesUnsupervised.h b/packedForest/src/forestTypes/unsupervisedForests/stratifiedInNodeClassIndicesUnsupervised.h deleted file mode 100644 index 4229f5be..00000000 --- a/packedForest/src/forestTypes/unsupervisedForests/stratifiedInNodeClassIndicesUnsupervised.h +++ /dev/null @@ -1,217 +0,0 @@ -#ifndef stratifiedInNodeClassIndicesUnsupervised_h -#define stratifiedInNodeClassIndicesUnsupervised_h - -#include -#include -#include -#include -#include - -namespace fp{ - - class stratifiedInNodeClassIndicesUnsupervised - { - private: - std::vector > inSamples; - std::vector inSamps; - std::vector outSamps; - std::vector > outSamples; - std::vector binSamples; - int inSampleSize; - int outSampleSize; - double impurity; - - //TODO: the following functions would benefit from Vitter's Sequential Random Sampling - public: - stratifiedInNodeClassIndicesUnsupervised(): inSampleSize(0), outSampleSize(0){} - - - stratifiedInNodeClassIndicesUnsupervised(const int &numObservationsInDataSet): inSampleSize(0), outSampleSize(0){ - impurity = 10; //initialize to an arbitrary non zero value - createInAndOutSetsBagging(numObservationsInDataSet, 0); - inSampleSize = inSamps.size(); - outSampleSize = outSamps.size(); - } - - - inline void createInAndOutSets(const int &numObs){ - std::vector potentialSamples(numObs); - - std::random_device rd; // obtain a random number from hardware - std::mt19937 eng(rd()); // seed the generator - - std::uniform_int_distribution<> distr(0, numObs-1); - - for(int i=0; i < numObs; ++i){ - potentialSamples[i] = i; - } - - int numUnusedObs = numObs; - int randomObsID; - int tempMoveObs; - for(int n=0; n random_indices(numObs); - std::vector random_indices2; - std::vector random_indices3; - - for(int i=0; i < numObs; ++i){ - random_indices[i] = i; - } - std::random_shuffle(random_indices.begin(), random_indices.end()); - int indx = (int) ((1-bagging)*(float)numObs); - int counter = 0; - for(auto i : random_indices) - { - if(counter < indx) - random_indices2.push_back(i); - else - random_indices3.push_back(i); - counter++; - } - - for(auto randomObsID : random_indices2) - inSamps.push_back(randomObsID); - for(auto randomObsID2 : random_indices3) - outSamps.push_back(randomObsID2); - } - - inline void setNodeImpurity(double nodeImp){ - if(nodeImp < std::numeric_limits::epsilon()) - nodeImp = 0; - impurity = nodeImp; - } - inline double returnImpurity(){ - return impurity; - } - - inline void printIndices(){ - std::cout << "samples in bag\n"; - std::cout< returnInSampsVec(){ - return inSamps; - } - - inline std::vector returnOutSampsVec(){ - return outSamps; - } - - inline int returnInSampleSize(){ - return inSampleSize; - } - - - inline int returnOutSampleSize(){ - return outSampleSize; - } - - - inline int returnInSample(const int numSample){ - return inSamps[numSample]; - //The commented out below reduces memory size but is slow. - /* - int totalViewed = 0; - for(unsigned int i = 0; i < inSamples.size(); ++i){ - if(numSample < (totalViewed+int(inSamples[i].size()))){ - if((numSample-totalViewed)<0 || (numSample-totalViewed)>=int(inSamples[i].size())){ - std::cout << numSample-totalViewed << " , " << inSamples[i].size() << "\n"; - exit(1); - } - int retNum = inSamples[i][numSample-totalViewed]; - return retNum ; - } - totalViewed += inSamples[i].size(); - } - std::cout << "it happened now\n"; - exit(1); - return -1; - */ - } - - inline int returnOutSample(const int numSample){ - return outSamps[numSample]; - } - - inline int returnBinSize(){ - return fpSingleton::getSingleton().returnBinSize(); - } - - inline bool useBin(){ - return fpSingleton::getSingleton().returnUseBinning() && (inSampleSize > returnBinSize()); - } - - inline void initializeBinnedSamples(){ - /*if(useBin()){ - int numInClass; - std::random_device random_device; - std::mt19937 engine{random_device()}; - for(unsigned int i = 0; i < inSamples.size(); ++i){ - numInClass = int((returnBinSize()*inSamples[i].size())/inSampleSize); - for(int n = 0; n < numInClass; ++n){ - std::uniform_int_distribution dist(0, inSamples[i].size() - 1); - binSamples.push_back(inSamples[i][dist(engine)]); - } - } - }*/ - } - - - inline int returnBinnedSize(){ - return binSamples.size(); - } - - - inline int returnBinnedInSample(const int numSample){ - return binSamples[numSample]; - } - - - inline void addIndexToOutSamples(int index){ - ++outSampleSize; - outSamps.push_back(index); - } - - inline void addIndexToInSamples(int index){ - ++inSampleSize; - inSamps.push_back(index); - } - };//class stratifiedInNodeClassIndices - -}//namespace fp -#endif //stratifiedInNodeClassIndices_h - diff --git a/packedForest/src/forestTypes/unsupervisedForests/urerf/fpURerFBase.h b/packedForest/src/forestTypes/unsupervisedForests/urerf/fpURerFBase.h deleted file mode 100644 index 11ccfb92..00000000 --- a/packedForest/src/forestTypes/unsupervisedForests/urerf/fpURerFBase.h +++ /dev/null @@ -1,142 +0,0 @@ -#ifndef fpRerF_h -#define fpRerf_h - -#include "../../../baseFunctions/fpForestBase.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include "urerfTree.h" -#include - -namespace fp { - template - class fpURerFBase : public fpForestBase - { - protected: - std::vector > trees; - std::map > simMat; - std::map, double> pairMat; - typedef Eigen::SparseMatrix spMat; - typedef Eigen::Triplet TripType; - std::vector tripletList; - SpMat eigenMat; - public: - - ~fpURerFBase(){} - - fpDisplayProgress printProgress; - inline void printForestType(){ - std::cout << "This is a urerf forest.\n"; - } - - inline void changeForestSize(){ - trees.resize(fpSingleton::getSingleton().returnNumTrees()); - } - - inline void initSimMat(){ - auto numObs = fpSingleton::getSingleton().returnNumObservations(); - for(auto i = 0; i < numObs; ++i) { - std::map init_map; - simMat[i] = init_map; - } - } - - inline void createSparseMat(){ - //Not in use now. TODO: Remove entirely? - auto numObs = fpSingleton::getSingleton().returnNumObservations(); - SpMat eigenSimMat(numObs, numObs); - for (auto it=pairMat.begin(); it!=pairMat.end(); ++it) { - int i = (it->first).first; - int j = (it->first).second; - int v_ij = it->second; - eigenSimMat.coeffRef(i, j) = v_ij; - } - eigenSimMat.makeCompressed(); - this->eigenMat = eigenSimMat; - } - - inline void growTrees(){ -#pragma omp parallel for num_threads(fpSingleton::getSingleton().returnNumThreads()) - for(int i = 0; i < (int)trees.size(); ++i){ - trees[i].growTree(); - trees[i].updateSimMat(simMat, pairMat); - trees[i].updateSimMatOut(simMat, pairMat); - } - } - - - inline void checkParameters(){ - //TODO: check parameters to make sure they make sense for this forest type. - ; - } - - inline void treeStats(){ - int maxDepth=0; - int totalLeafNodes=0; - int totalLeafDepth=0; - - int tempMaxDepth; - for(int i = 0; i < fpSingleton::getSingleton().returnNumTrees(); ++i){ - tempMaxDepth = trees[i].returnMaxDepth(); - maxDepth = ((maxDepth < tempMaxDepth) ? tempMaxDepth : maxDepth); - - totalLeafNodes += trees[i].returnNumLeafNodes(); - totalLeafDepth += trees[i].returnLeafDepthSum(); - } - } - - - inline std::map > returnSimMat() { - return simMat; - } - - inline std::map, double> returnPairMat(){ - return pairMat; - } - - void printTree0(){ - trees[0].printTree(); - } - - void growForest(){ - changeForestSize(); - growTrees(); - treeStats(); - } - - inline int predictClass(std::vector &observation) - { - std::cout << "Not implemented for unsupervised forests\n"; - return 0; - } - - inline int predictClass(const T *observation) - { - std::cout << "Not implemented for unsupervised forests\n"; - return 0; - } - - inline float reportOOB() - { - return 0; - } - - inline std::vector predictClassPost(std::vector &observation) - { - std::cout << "Not implemented for unsupervised forests\n"; - return {}; - } - - inline float testForest() - { - return 0; - } - }; - -}// namespace fp -#endif diff --git a/packedForest/src/forestTypes/unsupervisedForests/urerf/splitURerF.h b/packedForest/src/forestTypes/unsupervisedForests/urerf/splitURerF.h deleted file mode 100644 index a62e0bf5..00000000 --- a/packedForest/src/forestTypes/unsupervisedForests/urerf/splitURerF.h +++ /dev/null @@ -1,125 +0,0 @@ -#ifndef splitURerF_h -#define splitURerF_h - -#include "splitURerFInfo.h" -#include "../../../baseFunctions/pdqsort.h" -#include -#include -#include -#include -#include -#include -#include -namespace fp{ - template - class splitURerF{ - protected: - std::vector featureValsVec; - - static T computeSampleVariance(const double mean, const std::vector& v){ - double accum = 0.0; - std::for_each (std::begin(v), std::end(v), [&](const double d) { - accum += (d - mean) * (d - mean); - }); - return accum; - } - - inline void createData(const std::vector featureVals){ - featureValsVec.clear(); - auto siz_vec = featureVals.size(); - for(unsigned int i=0; i twoMeanSplit(const std::vector& featureVal, const std::vector& featureNums){ - - // initialize return value - splitURerFInfo currSplitInfo; - createData(featureVal); - - // sort feature Vals - std::vector errVecLeft; - std::vector errVecRight; - auto pbegin = featureValsVec.begin(); - auto pend = featureValsVec.end(); - std::sort(featureValsVec.begin(), featureValsVec.end()); - pbegin = featureValsVec.begin(); - pend = featureValsVec.end(); - int sizeX = featureValsVec.size(); - featureValsVec.erase(std::remove(pbegin, pend, 0), pend); - int sizeNNZ = featureValsVec.size(); - int sizeZ = sizeX - sizeNNZ; - T meanRight, sumLeft=0, meanLeft, cutPoint=0; - T errCurr = 0; - T minErr = std::numeric_limits::infinity(); - T minErrLeft = std::numeric_limits::infinity(); - T minErrRight = std::numeric_limits::infinity(); - T sumRight = std::accumulate(featureValsVec.begin(), featureValsVec.end(), 0.0); - pbegin = featureValsVec.begin(); - pend = featureValsVec.end(); - std::vector errVec(pbegin, pend); - - if (sizeNNZ - 1 <= 0){ - currSplitInfo.setImpurity(-1); - currSplitInfo.addFeatureNums(featureNums); - currSplitInfo.setSplitValue(0); - currSplitInfo.setLeftImpurity(0); - currSplitInfo.setRightImpurity(0); - return currSplitInfo; - } - if( fabs(featureValsVec[0] - featureValsVec[sizeX-1])<0.00001){ - currSplitInfo.setImpurity(-1); - currSplitInfo.addFeatureNums(featureNums); - currSplitInfo.setSplitValue(0); - currSplitInfo.setLeftImpurity(0); - currSplitInfo.setRightImpurity(0); - return currSplitInfo; - } - - if (sizeZ) { - meanRight = sumRight / (T)sizeNNZ; - minErr = computeSampleVariance(meanRight, errVec); - cutPoint = featureValsVec.at(0) / 2; - } - - - if (sizeNNZ - 1) { - int index = 1; - int sizeIt = featureValsVec.size()-1; - for(int iter = 0; iter < sizeIt; ++iter) { - int leftSize = sizeZ + index; - int rightSize = sizeNNZ - index; - sumLeft = sumLeft + featureValsVec[iter]; - sumRight = sumRight - featureValsVec[iter]; - meanLeft = sumLeft / (double)leftSize; - meanRight = sumRight / (double)rightSize; - auto last = pbegin; - std::advance(last, index); - std::vector newVec(pbegin, last); - auto errLeft = computeSampleVariance(meanLeft, newVec) + (sizeZ * meanLeft * meanLeft); - std::vector newVec2(last, pend); - auto errRight = computeSampleVariance(meanRight, newVec2); - errCurr = errLeft + errRight; - - if (errCurr < minErr) { - cutPoint = (featureValsVec[iter] + featureValsVec[iter+1]) / 2; - minErrLeft = errLeft; - minErrRight = errRight; - minErr = errCurr; - } - ++index; - } - } - currSplitInfo.setImpurity(minErr); - currSplitInfo.setSplitValue(cutPoint); - currSplitInfo.setLeftImpurity(minErrLeft); - currSplitInfo.setRightImpurity(minErrRight); - currSplitInfo.addFeatureNums(featureNums); - return currSplitInfo; - } - }; - -}//namespace fp -#endif //splitRerF_h diff --git a/packedForest/src/forestTypes/unsupervisedForests/urerf/splitURerFInfo.h b/packedForest/src/forestTypes/unsupervisedForests/urerf/splitURerFInfo.h deleted file mode 100644 index d9290c16..00000000 --- a/packedForest/src/forestTypes/unsupervisedForests/urerf/splitURerFInfo.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef splitURerFInfo_h -#define splitURerFInfo_h - -#include "../../basicForests/baseSplitInfo.h" -#include - -namespace fp{ - - template - class splitURerFInfo :public baseSplitInfo > - { - protected: - - public: - - inline void addFeatureNums(const std::vector& fNum){ - baseSplitInfo >::featureNum = fNum; - } - - inline std::vector& returnFeatureNum(){ - return baseSplitInfo >::featureNum; - } - }; - - -}//namespace fp -#endif //splitRerFInfo_h diff --git a/packedForest/src/forestTypes/unsupervisedForests/urerf/unprocessedURerFNode.h b/packedForest/src/forestTypes/unsupervisedForests/urerf/unprocessedURerFNode.h deleted file mode 100644 index b9215195..00000000 --- a/packedForest/src/forestTypes/unsupervisedForests/urerf/unprocessedURerFNode.h +++ /dev/null @@ -1,198 +0,0 @@ -#ifndef rfunprocessedURerFNode_h -#define rfunprocessedURerFNode_h -#include "splitURerF.h" -#include "../baseUnprocessedNodeUnsupervised.h" -#include "../stratifiedInNodeClassIndicesUnsupervised.h" -#include -#include -#include - -namespace fp{ - - - template // - class unprocessedURerFNode : public baseUnprocessedNodeUnsupervised{ - protected: - - splitURerFInfo bestSplitInfo; - std::vector< std::vector > featuresToTry; - - //std::random_device rd; - //The next three should be static - //std::mt19937 rng(rd()); - //std::uniform_int_distribution randomMtry(0,fpSingleton::getSingleton().returnMtry()-1); - //std::uniform_int_distribution randomFeature(0,fpSingleton::getSingleton().returnNumFeatures()-1); - //Example: auto random_integer = uni(rng); - - public: - unprocessedURerFNode(int numObsForRoot): baseUnprocessedNodeUnsupervised::baseUnprocessedNodeUnsupervised(numObsForRoot), featuresToTry(fpSingleton::getSingleton().returnMtry()){} - - - unprocessedURerFNode(int parentID, int dep, bool isLeft): baseUnprocessedNodeUnsupervised::baseUnprocessedNodeUnsupervised(parentID, dep, isLeft), featuresToTry(fpSingleton::getSingleton().returnMtry()){} - - - ~unprocessedURerFNode(){} - - - - inline std::vector& returnBestFeature(){ - return bestSplitInfo.returnFeatureNum(); - } - - inline double returnBestImpurity(){ - return bestSplitInfo.returnImpurity(); - } - - inline T returnBestCutValue(){ - return bestSplitInfo.returnSplitValue(); - } - - - inline void setBestSplit(splitURerFInfo tempSplit){ - if(tempSplit.returnImpurity() >= 0){ - if(tempSplit.returnImpurity() < bestSplitInfo.returnImpurity()){ - bestSplitInfo = tempSplit; - } - } - } - - inline void pickMTRY(){ - int rndMtry; - int rndFeature; - for (int i=0; i < fpSingleton::getSingleton().returnMtry(); ++i){ - rndMtry = std::rand() % fpSingleton::getSingleton().returnMtry(); - rndFeature = std::rand() % fpSingleton::getSingleton().returnNumFeatures(); - featuresToTry[rndMtry].push_back(rndFeature); - } - } - - inline void loadFeatureHolder(){ - if(baseUnprocessedNodeUnsupervised::obsIndices->useBin()){ - for(int q=0; q::obsIndices->returnBinnedSize(); q++){ - fpSingleton::getSingleton().prefetchFeatureVal(featuresToTry.back()[0],baseUnprocessedNodeUnsupervised::obsIndices->returnBinnedInSample(q)); - } - - for(int i =0; i < baseUnprocessedNodeUnsupervised::obsIndices->returnBinnedSize(); ++i){ - baseUnprocessedNodeUnsupervised::featureHolder[i] = fpSingleton::getSingleton().returnFeatureVal(featuresToTry.back()[0],baseUnprocessedNodeUnsupervised::obsIndices->returnBinnedInSample(i)); - } - if(featuresToTry.back().size()>1){ - for(unsigned int j =1; j < featuresToTry.back().size(); ++j){ - for(int q=0; q::obsIndices->returnBinnedSize(); q++){ - fpSingleton::getSingleton().prefetchFeatureVal(featuresToTry.back()[j],baseUnprocessedNodeUnsupervised::obsIndices->returnBinnedInSample(q)); - } - for(int i =0; i < baseUnprocessedNodeUnsupervised::obsIndices->returnBinnedSize(); ++i){ - baseUnprocessedNodeUnsupervised::featureHolder[i] += fpSingleton::getSingleton().returnFeatureVal(featuresToTry.back()[j],baseUnprocessedNodeUnsupervised::obsIndices->returnBinnedInSample(i)); - } - } - } - }else{ - - for(int q=0; q::obsIndices->returnInSampleSize(); q++){ - fpSingleton::getSingleton().prefetchFeatureVal(featuresToTry.back()[0],baseUnprocessedNodeUnsupervised::obsIndices->returnInSample(q)); - } - - for(int i =0; i < baseUnprocessedNodeUnsupervised::obsIndices->returnInSampleSize(); ++i){ - baseUnprocessedNodeUnsupervised::featureHolder[i] = fpSingleton::getSingleton().returnFeatureVal(featuresToTry.back()[0],baseUnprocessedNodeUnsupervised::obsIndices->returnInSample(i)); - } - if(featuresToTry.back().size()>1){ - for(int j =1; j < (int)featuresToTry.back().size(); ++j){ - for(int q=0; q::obsIndices->returnInSampleSize(); q++){ - fpSingleton::getSingleton().prefetchFeatureVal(featuresToTry.back()[j],baseUnprocessedNodeUnsupervised::obsIndices->returnInSample(q)); - } - - for(int i =0; i < baseUnprocessedNodeUnsupervised::obsIndices->returnInSampleSize(); ++i){ - baseUnprocessedNodeUnsupervised::featureHolder[i] += fpSingleton::getSingleton().returnFeatureVal(featuresToTry.back()[j],baseUnprocessedNodeUnsupervised::obsIndices->returnInSample(i)); - } - } - } - } - } - - inline void setupNode(){ - pickMTRY(); - baseUnprocessedNodeUnsupervised::setHolderSizes(); - baseUnprocessedNodeUnsupervised::setNodeImpurity(baseUnprocessedNodeUnsupervised::calculateNodeImpurity()); - } - - - inline bool goLeft(const int index){ - T featureVal = 0; - - for(auto j : bestSplitInfo.returnFeatureNum()){ - featureVal += fpSingleton::getSingleton().returnFeatureVal(j,index); - } - if(featureVal <= bestSplitInfo.returnSplitValue()){ - return true; - }else{ - return false; - } - } - - inline void deleteObsIndices(){ - delete baseUnprocessedNodeUnsupervised::obsIndices; - baseUnprocessedNodeUnsupervised::obsIndices = NULL; - } - - inline void moveDataLeftOrRight(){ - - baseUnprocessedNodeUnsupervised::leftIndices = new stratifiedInNodeClassIndicesUnsupervised(); - baseUnprocessedNodeUnsupervised::rightIndices = new stratifiedInNodeClassIndicesUnsupervised(); - - int lNum =0; - int rNum =0; - for (int i=0; i < baseUnprocessedNodeUnsupervised::obsIndices->returnInSampleSize();++i){ - if(goLeft(baseUnprocessedNodeUnsupervised::obsIndices->returnInSample(i))){ - ++lNum; - baseUnprocessedNodeUnsupervised::leftIndices->addIndexToInSamples(baseUnprocessedNodeUnsupervised::obsIndices->returnInSample(i)); - }else{ - ++rNum; - baseUnprocessedNodeUnsupervised::rightIndices->addIndexToInSamples(baseUnprocessedNodeUnsupervised::obsIndices->returnInSample(i)); - } - } - - assert(lNum > 0); - assert(rNum > 0); - - - for (int i=0; i < baseUnprocessedNodeUnsupervised::obsIndices->returnOutSampleSize();++i){ - if(goLeft(baseUnprocessedNodeUnsupervised::obsIndices->returnInSample(i))){ - baseUnprocessedNodeUnsupervised::leftIndices->addIndexToOutSamples(baseUnprocessedNodeUnsupervised::obsIndices->returnInSample(i)); - }else{ - baseUnprocessedNodeUnsupervised::rightIndices->addIndexToOutSamples(baseUnprocessedNodeUnsupervised::obsIndices->returnInSample(i)); - } - } - baseUnprocessedNodeUnsupervised::leftIndices->setNodeImpurity(bestSplitInfo.returnImpurity()); - baseUnprocessedNodeUnsupervised::rightIndices->setNodeImpurity(bestSplitInfo.returnImpurity()); - deleteObsIndices(); - } - - - inline void findBestSplit(){ - //timeLogger logTime; - splitURerF findSplit; //This is done twice - //TODO This needs to change to real mtry - // std::vector tempVec; - // tempVec.push_back(0); - - while(!featuresToTry.empty()){ - //not all featuresToTry will be populated. This checks first. - if(!featuresToTry.back().empty()){ - loadFeatureHolder(); - setBestSplit(findSplit.twoMeanSplit(baseUnprocessedNodeUnsupervised::featureHolder ,featuresToTry.back())); - } - removeTriedMtry(); - } - } - - inline void removeTriedMtry(){ - if(bestSplitInfo.perfectSplitFound()){ - featuresToTry.clear(); - }else{ - featuresToTry.pop_back(); - } - } - - - }; //unprocessedNode.h -}//namespace fp -#endif //unprocessedNode_h diff --git a/packedForest/src/forestTypes/unsupervisedForests/urerf/urerfTree.h b/packedForest/src/forestTypes/unsupervisedForests/urerf/urerfTree.h deleted file mode 100644 index a5db629e..00000000 --- a/packedForest/src/forestTypes/unsupervisedForests/urerf/urerfTree.h +++ /dev/null @@ -1,253 +0,0 @@ -#ifndef urerfTree_h -#define urerfTree_h -#include "../../../baseFunctions/fpBaseNode.h" -#include "unprocessedURerFNode.h" -#include -#include -#include -#include - -namespace fp{ - - template - class urerfTree - { - protected: - float OOBAccuracy; - float totalOOB; - std::vector > indexAndVote; - std::vector< fpBaseNode > > tree; - std::vector< unprocessedURerFNode > nodeQueue; - std::vector< unprocessedURerFNode > leafNodes; - - public: - urerfTree() : totalOOB(0){} - - void loadFirstNode(){ - nodeQueue.emplace_back(fpSingleton::getSingleton().returnNumObservations()); - } - - inline bool shouldProcessNode(){ - if(nodeQueue.back().returnNodeImpurity() < std::numeric_limits::epsilon()) - return false; - if(nodeQueue.back().returnInSampleSize() <= fpSingleton::getSingleton().returnMinParent()) - return false; - if(nodeQueue.back().returnDepth() >= fpSingleton::getSingleton().returnMaxDepth()) - return false; - return true; - } - - inline std::vector > returnOOBvotes(){ - return indexAndVote; - } - - inline int returnLastNodeID(){ - return tree.size()-1; - } - - inline void linkParentToChild(){ - if(nodeQueue.back().returnIsLeftNode()){ - tree[nodeQueue.back().returnParentID()].setLeftValue(returnLastNodeID()); - }else{ - tree[nodeQueue.back().returnParentID()].setRightValue(returnLastNodeID()); - } - } - - - inline int returnMaxDepth(){ - int maxDepth=0; - for(auto nodes : tree){ - if(maxDepth < nodes.returnDepth()){ - maxDepth = nodes.returnDepth(); - } - } - return maxDepth; - } - - - inline int returnNumLeafNodes(){ - int numLeafNodes=0; - for(auto nodes : tree){ - if(!nodes.isInternalNode()){ - ++numLeafNodes; - } - } - return numLeafNodes; - } - - - - inline void updateSimMat(std::map > &simMat, std::map, double> &pairMat){ - for(auto nodes : leafNodes){ - stratifiedInNodeClassIndicesUnsupervised* obsI = nodes.returnObsIndices(); - std::vector leafObs; - std::vector leafObsOut; - leafObs = obsI->returnInSampsVec(); - leafObsOut = obsI->returnOutSampsVec(); - auto siz = leafObs.size(); - if (siz <= 0) - continue; - for(unsigned int i = 0; i < siz; ++i) { - for (unsigned int j=0; j<=i; ++j) { - std::pair pair1 = std::make_pair(leafObs[i], leafObs[j]); - - if(pairMat.count(pair1) > 0){ - #pragma omp critical - { - pairMat[pair1]++; - } - } - else{ - #pragma omp critical - { - pairMat.insert({pair1, 1}); - } - } - } - } - } - } - - inline void updateSimMatOut(std::map > &simMat, std::map, double> &pairMat){ - for(auto nodes : leafNodes){ - stratifiedInNodeClassIndicesUnsupervised* obsI = nodes.returnObsIndices(); - std::vector leafObs; - leafObs = obsI->returnOutSampsVec(); - auto siz = leafObs.size(); - if (siz <= 0) - continue; - - for(unsigned int i = 0; i < siz; ++i) { - for (unsigned int j=0; j<=i; ++j) { - std::pair pair1 = std::make_pair(leafObs[i], leafObs[j]); - #pragma omp critical - { - auto it = pairMat.find(pair1); - if(it!=pairMat.end()) - pairMat[pair1]++; - else - pairMat.insert({pair1, 1}); - } - } - } - } - } - - inline int returnLeafDepthSum(){ - int leafDepthSums=0; - for(auto nodes : tree){ - if(!nodes.isInternalNode()){ - leafDepthSums += nodes.returnDepth(); - } - } - return leafDepthSums; - } - - inline void setAsLeaf(){ - tree.back().setDepth(nodeQueue.back().returnDepth()); - } - - inline void makeWholeNodeALeaf(){ - tree.emplace_back(); - linkParentToChild(); - setAsLeaf(); - leafNodes.emplace_back(nodeQueue.back()); - nodeQueue.pop_back(); - } - - void printTree(){ - for(auto nd : tree){ - nd.printNode(); - } - } - - inline void createNodeInTree(){ - tree.emplace_back(); - linkParentToChild(); - tree.back().setCutValue(nodeQueue.back().returnBestCutValue()); - tree.back().setFeatureValue(nodeQueue.back().returnBestFeature()); - tree.back().setDepth(nodeQueue.back().returnDepth()); - } - - - inline void makeNodeInternal(){ - createNodeInTree(); - createChildren(); - } - - inline bool isLeftNode(){ - return true; - } - - inline bool isRightNode(){ - return false; - } - - inline void createChildren(){ - nodeQueue.back().moveDataLeftOrRight(); - - stratifiedInNodeClassIndicesUnsupervised* leftIndices = nodeQueue.back().returnLeftIndices(); - stratifiedInNodeClassIndicesUnsupervised* rightIndices = nodeQueue.back().returnRightIndices(); - - assert(leftIndices->returnInSampleSize() > 0); - assert(rightIndices->returnInSampleSize() > 0); - - int childDepth = nodeQueue.back().returnDepth()+1; - - nodeQueue.pop_back(); - - nodeQueue.emplace_back(returnLastNodeID(),childDepth, isLeftNode()); - nodeQueue.back().loadIndices(leftIndices); - - nodeQueue.emplace_back(returnLastNodeID(),childDepth, isRightNode()); - nodeQueue.back().loadIndices(rightIndices); - } - - - inline void findTheBestSplit(){ - nodeQueue.back().findBestSplit(); - } - - - inline bool noGoodSplitFound(){ - if(nodeQueue.back().returnBestImpurity() == -1) - return true; - - return nodeQueue.back().returnBestFeature().empty(); - } - - - inline void processANode(){ - // timeLogger logTime; - // logTime.startSortTimer(); - nodeQueue.back().setupNode(); - // logTime.stopSortTimer(); - // logTime.startGiniTimer(); - if(shouldProcessNode()){ - findTheBestSplit(); - if(noGoodSplitFound()){ - makeWholeNodeALeaf(); - }else{ - makeNodeInternal(); - } - }else{ - makeWholeNodeALeaf(); - } - // logTime.stopGiniTimer(); - } - - inline void processNodes(){ - while(!nodeQueue.empty()){ - processANode(); - } - } - - inline void growTree(){ - loadFirstNode(); - processNodes(); - } - - }; - -}//fp -#endif //urerfTree_h diff --git a/packedForest/src/forestTypes/unsupervisedForests/urf/fpURFBase.h b/packedForest/src/forestTypes/unsupervisedForests/urf/fpURFBase.h deleted file mode 100644 index 8a1964ec..00000000 --- a/packedForest/src/forestTypes/unsupervisedForests/urf/fpURFBase.h +++ /dev/null @@ -1,154 +0,0 @@ -#ifndef fpURF_h -#define fpURF_h - -#include "../../../baseFunctions/fpForestBase.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include "urfTree.h" -#include -#include -#include -#include -using namespace Eigen; - -namespace fp { - - template - class fpURFBase : public fpForestBase - { - protected: - std::vector > trees; - std::map > simMat; - std::map, double> pairMat; - typedef Eigen::SparseMatrix spMat; - typedef Eigen::Triplet TripType; - std::vector tripletList; - SpMat eigenMat; - public: - - ~fpURFBase(){} - - fpDisplayProgress printProgress; - inline void printForestType(){ - std::cout << "This is a urf forest.\n"; - } - - inline void changeForestSize(){ - trees.resize(fpSingleton::getSingleton().returnNumTrees()); - } - - inline void initSimMat(){ - auto numObs = fpSingleton::getSingleton().returnNumObservations(); - for(auto i = 0; i < numObs; ++i) { - std::map init_map; - simMat[i] = init_map; - } - } - inline void growTrees(){ -#pragma omp parallel for num_threads(fpSingleton::getSingleton().returnNumThreads()) - for(int i = 0; i < (int)trees.size(); ++i){ - trees[i].growTree(); - trees[i].updateSimMat(simMat, pairMat); - trees[i].updateSimMatOut(simMat, pairMat); - } - } - - inline void checkParameters(){ - //TODO: check parameters to make sure they make sense for this forest type. - ; - } - - inline void createSparseMat(){ - //Not in use now. TODO: Remove entirely? - auto numObs = fpSingleton::getSingleton().returnNumObservations(); - SpMat eigenSimMat(numObs, numObs); - for (auto it=pairMat.begin(); it!=pairMat.end(); ++it){ - int i = (it->first).first; - int j = (it->first).second; - int v_ij = it->second; - eigenSimMat.coeffRef(i, j) = v_ij; - } - eigenSimMat.makeCompressed(); - this->eigenMat = eigenSimMat ; - } - - - inline void printSparseMat(){ - //Not in use now. TODO: Remove entirely? - for (int k = 0; k < eigenMat.outerSize(); ++k){ - for (Eigen::SparseMatrix::InnerIterator it(eigenMat, k); it; ++it){ - std::cout << it.row() <<"\t"; - std::cout << it.col() << "\t"; - std::cout << it.value() << "\n"; - } - } - } - - inline void treeStats(){ - int maxDepth=0; - int totalLeafNodes=0; - int totalLeafDepth=0; - - int tempMaxDepth; - for(int i = 0; i < fpSingleton::getSingleton().returnNumTrees(); ++i){ - tempMaxDepth = trees[i].returnMaxDepth(); - maxDepth = ((maxDepth < tempMaxDepth) ? tempMaxDepth : maxDepth); - - totalLeafNodes += trees[i].returnNumLeafNodes(); - totalLeafDepth += trees[i].returnLeafDepthSum(); - } - } - - - inline std::map > returnSimMat() { - return simMat; - } - - inline std::map, double> returnPairMat(){ - return pairMat; - } - - void printTree0(){ - trees[0].printTree(); - } - - void growForest(){ - changeForestSize(); - growTrees(); - treeStats(); - } - - inline int predictClass(std::vector& observation){ - std::cout<<"Not defined for unsupervised random forests. \n"; - return 0; - } - - inline int predictClass(const T *observation) - { - std::cout << "Not defined for unsupervised random forests. \n"; - return 0; - } - inline std::vector predictClassPost(std::vector &observation) - { - std::cout << "Not defined for unsupervised random forests. \n"; - return {}; - } - - inline float reportOOB() - { - return 0; - } - inline float testForest() - { - return 0; - } - }; - -}// namespace fp -#endif diff --git a/packedForest/src/forestTypes/unsupervisedForests/urf/splitURF.h b/packedForest/src/forestTypes/unsupervisedForests/urf/splitURF.h deleted file mode 100644 index bd1213ee..00000000 --- a/packedForest/src/forestTypes/unsupervisedForests/urf/splitURF.h +++ /dev/null @@ -1,125 +0,0 @@ -#ifndef splitURF_h -#define splitURF_h - -#include "splitURFInfo.h" -#include "../../../baseFunctions/pdqsort.h" -#include "../../../baseFunctions/timeLogger.h" -#include -#include -#include -#include -#include -#include -namespace fp{ - template - class splitURF{ - protected: - std::vector featureValsVec; - - static T computeSampleVariance(const double mean, const std::vector& v){ - double accum = 0.0; - std::for_each (std::begin(v), std::end(v), [&](const double d) { - accum += (d - mean) * (d - mean); - }); - return accum; - } - - inline void createData(const std::vector featureVals){ - featureValsVec.clear(); - auto siz_vec = featureVals.size(); - for(unsigned int i=0; i twoMeanSplit(const std::vector& featureVal, int featureNum){ - // initialize return value - splitURFInfo currSplitInfo; - createData(featureVal); - - // sort feature Vals - std::vector errVecLeft; - std::vector errVecRight; - auto pbegin = featureValsVec.begin(); - auto pend = featureValsVec.end(); - - //std::sort(featureValsVec.begin(), featureValsVec.end()); - int sizeX = featureValsVec.size(); - pdqsort_branchless(featureValsVec.begin(), featureValsVec.end()); - if(featureValsVec[0] == featureValsVec[sizeX-1]){ - currSplitInfo.setImpurity(-1); - currSplitInfo.setFeatureNums(featureNum); - currSplitInfo.setSplitValue(0); - currSplitInfo.setLeftImpurity(0); - currSplitInfo.setRightImpurity(0); - return currSplitInfo; - } - pbegin = featureValsVec.begin(); - pend = featureValsVec.end(); - featureValsVec.erase(std::remove(featureValsVec.begin(), featureValsVec.end(), 0), featureValsVec.end()); - int sizeNNZ = featureValsVec.size(); - int sizeZ = sizeX - sizeNNZ; - int leftSize; - int rightSize; - T meanRight, sumLeft=0, meanLeft=0, cutPoint=0; - T minErr = std::numeric_limits::infinity(); - T errCurr=0; - T minErrLeft = std::numeric_limits::infinity(); - T minErrRight = std::numeric_limits::infinity(); - T sumRight = std::accumulate(featureValsVec.begin(), featureValsVec.end(), 0.0); - pbegin = featureValsVec.begin(); - pend = featureValsVec.end(); - std::vector errVec(pbegin, pend); - - if (sizeNNZ - 1 <= 0){ - return currSplitInfo; - } - - - if (sizeZ) { - meanRight = sumRight / (T)sizeNNZ; - minErr = computeSampleVariance(meanRight, errVec); - cutPoint = featureValsVec.at(0) / 2; - } - - - if (sizeNNZ - 1) { - int index = 1; - int sizeIt = featureValsVec.size()-1; - for(int iter = 0; iter < sizeIt; ++iter) { - leftSize = sizeZ + index; - rightSize = sizeNNZ - index; - sumLeft = sumLeft + featureValsVec[iter]; - sumRight = sumRight - featureValsVec[iter]; - meanLeft = sumLeft / (double)leftSize; - meanRight = sumRight / (double)rightSize; - auto last = pbegin; - std::advance(last, index); - std::vector newVec(pbegin, last); - auto errLeft = computeSampleVariance(meanLeft, newVec) + (sizeZ * meanLeft * meanLeft); - std::vector newVec2(last, pend); - auto errRight = computeSampleVariance(meanRight, newVec2); - errCurr = errLeft + errRight; - if (errCurr < minErr) { - cutPoint = (featureValsVec[iter] + featureValsVec[iter+1]) / 2; - minErrLeft = errLeft; - minErrRight = errRight; - minErr = errCurr; - } - ++index; - } - } - currSplitInfo.setImpurity(minErr); - currSplitInfo.setFeatureNums(featureNum); - currSplitInfo.setSplitValue(cutPoint); - currSplitInfo.setLeftImpurity(minErrLeft); - currSplitInfo.setRightImpurity(minErrRight); - - - return currSplitInfo; - } - }; - -}//namespace fp -#endif //splitURF_h diff --git a/packedForest/src/forestTypes/unsupervisedForests/urf/splitURFInfo.h b/packedForest/src/forestTypes/unsupervisedForests/urf/splitURFInfo.h deleted file mode 100644 index 92118e1f..00000000 --- a/packedForest/src/forestTypes/unsupervisedForests/urf/splitURFInfo.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef splitURFInfo_h -#define splitURFInfo_h - -#include "../../basicForests/baseSplitInfo.h" -#include - -namespace fp{ - - template - class splitURFInfo :public baseSplitInfo - { - protected: - - public: - splitURFInfo(){ - baseSplitInfo::featureNum = -1; - } - inline void setFeatureNums(int fNum){ - baseSplitInfo::featureNum = fNum; - } - - inline int returnFeatureNum(){ - return baseSplitInfo::featureNum; - } - }; - - -}//namespace fp -#endif //splitRFInfo_h diff --git a/packedForest/src/forestTypes/unsupervisedForests/urf/unprocessedURFNode.h b/packedForest/src/forestTypes/unsupervisedForests/urf/unprocessedURFNode.h deleted file mode 100644 index f9765452..00000000 --- a/packedForest/src/forestTypes/unsupervisedForests/urf/unprocessedURFNode.h +++ /dev/null @@ -1,185 +0,0 @@ -#ifndef rfunprocessedURFNode_h -#define rfunprocessedURFNode_h -#include "splitURF.h" -#include "../baseUnprocessedNodeUnsupervised.h" -#include "../stratifiedInNodeClassIndicesUnsupervised.h" -#include -#include -#include - -namespace fp{ - - - template // - class unprocessedURFNode : public baseUnprocessedNodeUnsupervised{ - protected: - - splitURFInfo bestSplitInfo; - std::vector featuresToTry; - - //std::random_device rd; - //The next three should be static - //std::mt19937 rng(rd()); - //std::uniform_int_distribution randomMtry(0,fpSingleton::getSingleton().returnMtry()-1); - //std::uniform_int_distribution randomFeature(0,fpSingleton::getSingleton().returnNumFeatures()-1); - //Example: auto random_integer = uni(rng); - - public: - unprocessedURFNode(int numObsForRoot): baseUnprocessedNodeUnsupervised::baseUnprocessedNodeUnsupervised(numObsForRoot), featuresToTry(fpSingleton::getSingleton().returnMtry()){} - - - unprocessedURFNode(int parentID, int dep, bool isLeft): baseUnprocessedNodeUnsupervised::baseUnprocessedNodeUnsupervised(parentID, dep, isLeft){ -} - - ~unprocessedURFNode(){} - - - inline int returnBestFeature(){ - return bestSplitInfo.returnFeatureNum(); - } - - inline double returnBestImpurity(){ - return bestSplitInfo.returnImpurity(); - } - - inline T returnBestCutValue(){ - return bestSplitInfo.returnSplitValue(); - } - - inline void setBestSplit(splitURFInfo tempSplit){ - if(tempSplit.returnImpurity() >= 0){ - if(tempSplit.returnImpurity() < bestSplitInfo.returnImpurity()){ - bestSplitInfo = tempSplit; - } - } - } - - inline void pickMTRY(){ - for (int i=0; i < fpSingleton::getSingleton().returnNumFeatures(); ++i){ - featuresToTry.push_back(i); - } - std::random_device rd; // obtain a random number from hardware - std::mt19937 eng(rd()); // seed the generator - - int tempSwap; - - for(int locationToMove = 0; locationToMove < fpSingleton::getSingleton().returnMtry(); locationToMove++){ - std::uniform_int_distribution<> distr(locationToMove, fpSingleton::getSingleton().returnNumFeatures()-1); - int randomPosition = distr(eng); - tempSwap = featuresToTry[locationToMove]; - featuresToTry[locationToMove] = featuresToTry[randomPosition]; - featuresToTry[randomPosition] = tempSwap; - } - featuresToTry.resize(fpSingleton::getSingleton().returnMtry()); - } - - inline void loadFeatureHolder(){ - if(baseUnprocessedNodeUnsupervised::obsIndices->useBin()){ - for(int q=0; q::obsIndices->returnBinnedSize(); q++){ - fpSingleton::getSingleton().prefetchFeatureVal(featuresToTry.back(),baseUnprocessedNodeUnsupervised::obsIndices->returnBinnedInSample(q)); - } - - for(int i =0; i < baseUnprocessedNodeUnsupervised::obsIndices->returnBinnedSize(); ++i){ - baseUnprocessedNodeUnsupervised::featureHolder[i] = fpSingleton::getSingleton().returnFeatureVal(featuresToTry.back(),baseUnprocessedNodeUnsupervised::obsIndices->returnBinnedInSample(i)); - } - }else{ - - for(int q=0; q::obsIndices->returnInSampleSize(); q++){ - fpSingleton::getSingleton().prefetchFeatureVal(featuresToTry.back(),baseUnprocessedNodeUnsupervised::obsIndices->returnInSample(q)); - } - - for(int i =0; i < baseUnprocessedNodeUnsupervised::obsIndices->returnInSampleSize(); ++i){ - baseUnprocessedNodeUnsupervised::featureHolder[i] = fpSingleton::getSingleton().returnFeatureVal(featuresToTry.back(), baseUnprocessedNodeUnsupervised::obsIndices->returnInSample(i)); - } - } - } - - inline void setupNode(){ - pickMTRY(); - baseUnprocessedNodeUnsupervised::setHolderSizes(); - baseUnprocessedNodeUnsupervised::setNodeImpurity(baseUnprocessedNodeUnsupervised::calculateNodeImpurity()); - } - - - inline bool goLeft(const int index){ - T featureVal = fpSingleton::getSingleton().returnFeatureVal(bestSplitInfo.returnFeatureNum(),index); - - if(featureVal <= bestSplitInfo.returnSplitValue()){ - return true; - }else{ - return false; - } - } - - inline void deleteObsIndices(){ - delete baseUnprocessedNodeUnsupervised::obsIndices; - baseUnprocessedNodeUnsupervised::obsIndices = NULL; - } - - inline void moveDataLeftOrRight(){ - - baseUnprocessedNodeUnsupervised::leftIndices = new stratifiedInNodeClassIndicesUnsupervised(); - baseUnprocessedNodeUnsupervised::rightIndices = new stratifiedInNodeClassIndicesUnsupervised(); - - int lNum =0; - int rNum =0; - for (int i=0; i < baseUnprocessedNodeUnsupervised::obsIndices->returnInSampleSize();++i){ - if(goLeft(baseUnprocessedNodeUnsupervised::obsIndices->returnInSample(i))){ - ++lNum; - baseUnprocessedNodeUnsupervised::leftIndices->addIndexToInSamples(baseUnprocessedNodeUnsupervised::obsIndices->returnInSample(i)); - }else{ - ++rNum; - baseUnprocessedNodeUnsupervised::rightIndices->addIndexToInSamples(baseUnprocessedNodeUnsupervised::obsIndices->returnInSample(i)); - } - } - if(lNum <= 0 || rNum <= 0){ - lNum++; - rNum--; - } - - assert(lNum > 0); - assert(rNum > 0); - - - for (int i=0; i < baseUnprocessedNodeUnsupervised::obsIndices->returnOutSampleSize();++i){ - if(goLeft(baseUnprocessedNodeUnsupervised::obsIndices->returnOutSample(i))){ - baseUnprocessedNodeUnsupervised::leftIndices->addIndexToOutSamples(baseUnprocessedNodeUnsupervised::obsIndices->returnOutSample(i)); - }else{ - baseUnprocessedNodeUnsupervised::rightIndices->addIndexToOutSamples(baseUnprocessedNodeUnsupervised::obsIndices->returnOutSample(i)); - } - } - baseUnprocessedNodeUnsupervised::leftIndices->setNodeImpurity(bestSplitInfo.returnImpurity()); - baseUnprocessedNodeUnsupervised::rightIndices->setNodeImpurity(bestSplitInfo.returnImpurity()); - deleteObsIndices(); - } - - - inline void findBestSplit(){ - //timeLogger logTime; - splitURF findSplit; //This is done twice - //TODO This needs to change to real mtry - // std::vector tempVec; - // tempVec.push_back(0); - while(!featuresToTry.empty()){ - //not all featuresToTry will be populated. This checks first. - if(!featuresToTry.empty()){ - loadFeatureHolder(); - setBestSplit(findSplit.twoMeanSplit(baseUnprocessedNodeUnsupervised::featureHolder ,featuresToTry.back())); - } - removeTriedMtry(); - } - - } - - inline void removeTriedMtry(){ - if(bestSplitInfo.perfectSplitFound()){ - featuresToTry.clear(); - }else{ - featuresToTry.pop_back(); - } - } - - - }; //unprocessedNode.h -}//namespace fp -#endif //unprocessedURFNode_h diff --git a/packedForest/src/forestTypes/unsupervisedForests/urf/urfTree.h b/packedForest/src/forestTypes/unsupervisedForests/urf/urfTree.h deleted file mode 100644 index a5ce5a4e..00000000 --- a/packedForest/src/forestTypes/unsupervisedForests/urf/urfTree.h +++ /dev/null @@ -1,252 +0,0 @@ -#ifndef urfTree_h -#define urfTree_h -#include "../../../baseFunctions/fpBaseNode.h" -#include "unprocessedURFNode.h" -#include -#include -#include -#include -#include -#include -#include - -typedef Eigen::SparseMatrix SpMat; -typedef Eigen::Triplet TripType; - -namespace fp{ - - template - class urfTree - { - protected: - float OOBAccuracy; - float correctOOB; - float totalOOB; - std::vector< fpBaseNode > tree; - std::vector< unprocessedURFNode > nodeQueue; - std::vector< unprocessedURFNode > leafNodes; - - public: - urfTree() : OOBAccuracy(-1.0),correctOOB(0),totalOOB(0){} - - void loadFirstNode(){ - nodeQueue.emplace_back(fpSingleton::getSingleton().returnNumObservations()); - } - - inline bool shouldProcessNode(){ - if(nodeQueue.back().returnNodeImpurity() < std::numeric_limits::epsilon()) - return false; - if(nodeQueue.back().returnInSampleSize() <= fpSingleton::getSingleton().returnMinParent()) - return false; - if(nodeQueue.back().returnDepth() >= fpSingleton::getSingleton().returnMaxDepth()) - return false; - return true; - } - - inline float returnOOB(){ - return correctOOB/totalOOB; - } - - inline int returnLastNodeID(){ - return tree.size()-1; - } - - inline void linkParentToChild(){ - if(nodeQueue.back().returnIsLeftNode()){ - tree[nodeQueue.back().returnParentID()].setLeftValue(returnLastNodeID()); - }else{ - tree[nodeQueue.back().returnParentID()].setRightValue(returnLastNodeID()); - } - } - - - inline int returnMaxDepth(){ - int maxDepth=0; - for(auto &nodes : tree){ - if(maxDepth < nodes.returnDepth()){ - maxDepth = nodes.returnDepth(); - } - } - return maxDepth; - } - - inline int returnNumLeafNodes(){ - int numLeafNodes=0; - for(auto nodes : tree){ - if(!nodes.isInternalNode()){ - ++numLeafNodes; - } - } - return numLeafNodes; - } - - inline void updateSimMat(std::map > &simMat, std::map, double> &pairMat){ - for(auto nodes : leafNodes){ - stratifiedInNodeClassIndicesUnsupervised* obsI = nodes.returnObsIndices(); - std::vector leafObs; - std::vector leafObsOut; - leafObs = obsI->returnInSampsVec(); - leafObsOut = obsI->returnOutSampsVec(); - auto siz = leafObs.size(); - if (siz <= 0) - continue; - for(unsigned int i = 0; i < siz; ++i) { - for (unsigned int j=0; j<=i; ++j) { - std::pair pair1 = std::make_pair(leafObs[i], leafObs[j]); - - if(pairMat.count(pair1) > 0){ - #pragma omp critical - { - pairMat[pair1]++; - } - } - else{ - #pragma omp critical - { - pairMat.insert({pair1, 1}); - } - } - } - } - } - } - inline void updateSimMatOut(std::map > &simMat, std::map, double> &pairMat){ - for(auto nodes : leafNodes){ - stratifiedInNodeClassIndicesUnsupervised* obsI = nodes.returnObsIndices(); - std::vector leafObs; - - leafObs = obsI->returnOutSampsVec(); - auto siz = leafObs.size(); - if (siz <= 0) - continue; - - for(unsigned int i = 0; i < siz; ++i) { - for (unsigned int j=0; j<=i; ++j) { - std::pair pair1 = std::make_pair(leafObs[i], leafObs[j]); - #pragma omp critical - { - auto it = pairMat.find(pair1); - if(it!=pairMat.end()) - pairMat[pair1]++; - else - pairMat.insert({pair1, 1}); - } - } - } - } - } - - inline int returnLeafDepthSum(){ - int leafDepthSums=0; - for(auto nodes : tree){ - if(!nodes.isInternalNode()){ - leafDepthSums += nodes.returnDepth(); - } - } - return leafDepthSums; - } - - - inline void setAsLeaf(){ - tree.back().setDepth(nodeQueue.back().returnDepth()); - } - - - inline void makeWholeNodeALeaf(){ - tree.emplace_back(); - linkParentToChild(); - setAsLeaf(); - leafNodes.emplace_back(nodeQueue.back()); - nodeQueue.pop_back(); - } - - void printTree(){ - for(auto nd : tree){ - nd.printNode(); - } - } - - inline void createNodeInTree(){ - tree.emplace_back(); - linkParentToChild(); - tree.back().setCutValue(nodeQueue.back().returnBestCutValue()); - tree.back().setFeatureValue(nodeQueue.back().returnBestFeature()); - tree.back().setDepth(nodeQueue.back().returnDepth()); - } - - - inline void makeNodeInternal(){ - createNodeInTree(); - createChildren(); - } - - inline bool isLeftNode(){ - return true; - } - - inline bool isRightNode(){ - return false; - } - - inline void createChildren(){ - nodeQueue.back().moveDataLeftOrRight(); - - stratifiedInNodeClassIndicesUnsupervised* leftIndices = nodeQueue.back().returnLeftIndices(); - stratifiedInNodeClassIndicesUnsupervised* rightIndices = nodeQueue.back().returnRightIndices(); - - assert(leftIndices->returnInSampleSize() > 0); - assert(rightIndices->returnInSampleSize() > 0); - - int childDepth = nodeQueue.back().returnDepth()+1; - - nodeQueue.pop_back(); - - nodeQueue.emplace_back(returnLastNodeID(),childDepth, isLeftNode()); - nodeQueue.back().loadIndices(leftIndices); - - nodeQueue.emplace_back(returnLastNodeID(),childDepth, isRightNode()); - nodeQueue.back().loadIndices(rightIndices); - } - - - inline void findTheBestSplit(){ - nodeQueue.back().findBestSplit(); - } - - - inline bool noGoodSplitFound(){ - if(nodeQueue.back().returnBestImpurity() == -1) - return true; - return nodeQueue.back().returnBestFeature() == -1; - } - - - inline void processANode(){ - nodeQueue.back().setupNode(); - if(shouldProcessNode()){ - findTheBestSplit(); - if(noGoodSplitFound()){ - makeWholeNodeALeaf(); - }else{ - makeNodeInternal(); - } - }else{ - makeWholeNodeALeaf(); - } - } - - inline void processNodes(){ - while(!nodeQueue.empty()){ - processANode(); - } - } - - inline void growTree(){ - loadFirstNode(); - processNodes(); - } - - }; - -}//fp -#endif //urfTree_h From a50b0df20d163ee80f9db24c58da27a6fdcbee69 Mon Sep 17 00:00:00 2001 From: Giorgio <122452088+GiorgioMB@users.noreply.github.com> Date: Fri, 3 May 2024 14:41:55 +0200 Subject: [PATCH 2/3] Update fpForestFactory.h --- packedForest/src/baseFunctions/fpForestFactory.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/packedForest/src/baseFunctions/fpForestFactory.h b/packedForest/src/baseFunctions/fpForestFactory.h index 307b02a0..a3a35441 100644 --- a/packedForest/src/baseFunctions/fpForestFactory.h +++ b/packedForest/src/baseFunctions/fpForestFactory.h @@ -5,8 +5,6 @@ #include "fpForestBase.h" #include "weightedFeature.h" #include "../forestTypes/basicForests/rerf/fpRerFBase.h" -#include "../forestTypes/unsupervisedForests/urf/fpURFBase.h" -#include "../forestTypes/unsupervisedForests/urerf/fpURerFBase.h" #include "../forestTypes/basicForests/rfClassification/fpForestClassificationBase.h" #include "../forestTypes/binnedTree/binnedBase.h" @@ -21,10 +19,6 @@ namespace fp{ return std::unique_ptr >{new fpForestClassificationBase}; }else if(parameterName == "rerf"){ return std::unique_ptr >{new fpRerFBase}; - }else if(parameterName == "urf"){ - return std::unique_ptr >{new fpURFBase}; - }else if(parameterName == "urerf"){ - return std::unique_ptr >{new fpURerFBase}; }else if(parameterName == "binnedBase"){ return std::unique_ptr >{new binnedBase}; }else if(parameterName == "binnedBaseRerF"){ From dc343b32d2a942f7f61d32e0703c1e3e1e2b8684 Mon Sep 17 00:00:00 2001 From: Giorgio <122452088+GiorgioMB@users.noreply.github.com> Date: Fri, 3 May 2024 14:47:40 +0200 Subject: [PATCH 3/3] Delete Python/rerf/urerf.py --- Python/rerf/urerf.py | 256 ------------------------------------------- 1 file changed, 256 deletions(-) delete mode 100644 Python/rerf/urerf.py diff --git a/Python/rerf/urerf.py b/Python/rerf/urerf.py deleted file mode 100644 index b70b2748..00000000 --- a/Python/rerf/urerf.py +++ /dev/null @@ -1,256 +0,0 @@ -import multiprocessing - -import numpy as np -from scipy.sparse import csr_matrix -from sklearn.base import BaseEstimator -from sklearn.utils import check_array -from sklearn.utils.validation import check_is_fitted - -import pyfp - - -class UnsupervisedRandomForest(BaseEstimator): - """Unsupervised random(er) forest - - Supports both Random Forest, developed by Breiman (2001) [#Breiman]_, as well as - Randomer Forest or Random Projection Forests (RerF) developed by - Tomita et al. (2016) [#Tomita]_. - - The difference between the two algorithms is where the random linear - combinations occur: Random Forest combines features at the tree level - whereas RerF combines features at the node level. - - In addition to the normal RandomForestClassifier parameters, there are - two parameters to be aware of: - - - ``projection_matrix`` - - ``feature_combinations`` - - Parameters - ---------- - projection_matrix : str, optional (default: "RerF") - The random combination of features to use: either "RerF", "Base". - "RerF" randomly combines features for each `mtry`. Base is our - implementation of Random Forest. "S-RerF" is structured RerF, - combining multiple features together in random patches. - See Tomita et al. (2016) [#Tomita]_ for further details. - n_estimators : int, optional (default: 100) - Number of trees in forest. - max_depth : int or None, optional (default=None) - The maximum depth of the tree. If None, then nodes are expanded - until all leaves are pure or until all leaves contain less than - min_samples_split samples. - min_samples_split : int, optional (default: "auto") - The minimum splittable node size. A node size < ``min_samples_split`` - will be a leaf node. Note: other implementations called `min.parent` - or `minParent` - - - If "auto", then ``min_samples_split=sqrt(num_obs)`` - - If int, then consider ``min_samples_split`` at each split. - - max_features : int, float, string, or None, optional (default="auto") - The number of features or feature combinations to consider when - looking for the best split. Note: also called `mtry` or `d`. - - - If int, then consider ``max_features`` features or feature combinations at each split. - - If float, then `max_features` is a fraction and ``int(max_features * n_features)`` features are considered at each split. - - If "auto", then ``max_features=sqrt(n_features)``. - - If "sqrt", then ``max_features=sqrt(n_features)`` (same as "auto"). - - If "log2", then ``max_features=log2(n_features)``. - - If None, then ``max_features=n_features``. - - feature_combinations : float, optional (default: "auto") - Average number of features combined to form a new feature when - using "RerF." Otherwise, ignored. - - - If int or float, then ``feature_combinations`` is average number of features to combine for each ``max_features`` to try. - - If "auto", then ``feature_combinations=n_features``. - - If "sqrt", then ``feature_combinations=sqrt(n_features)`` (same as "auto"). - - If "log2", then ``feature_combinations=log2(n_features)``. - - If None, then ``feature_combinations=n_features``. - - n_jobs : int or None, optional (default=None) - The number of jobs to run in parallel for both `fit` and `predict`. - ``None`` means 1. ``-1`` means use all processors. - random_state : int or None, optional (default=None) - Random seed to use. If None, set seed to ``np.random.randint(1, 1000000)``. - - Returns - ------- - - Examples - -------- - >>> from matplotlib import pyplot as plt - >>> from sklearn.cluster import AgglomerativeClustering - >>> from sklearn.datasets import make_classification - >>> from sklearn.metrics import adjusted_rand_score - >>> from rerf.urerf import UnsupervisedRandomForest - - >>> X, y = make_classification( - ... n_samples=1000, - ... n_features=4, - ... n_informative=2, - ... n_redundant=0, - ... random_state=0, - ... shuffle=False, - ... ) - >>> clf = UnsupervisedRandomForest(n_estimators=100, random_state=0) - >>> clf.fit(X) - >>> sim_mat = clf.transform() - >>> plt.imshow(sim_mat) - >>> cluster = AgglomerativeClustering(n_clusters=2) - >>> predict_labels = cluster.fit_predict(sim_mat) - >>> score = adjusted_rand_score(y, predict_labels) - >>> print(score) - 0.7601439767776818 - - Notes - ----- - - """ - - def __init__( - self, - projection_matrix="RerF", - n_estimators=100, - max_depth=None, - min_samples_split="auto", - max_features="auto", - feature_combinations="auto", - n_jobs=None, - random_state=None, - ): - self.projection_matrix = projection_matrix - self.n_estimators = n_estimators - self.max_depth = max_depth - self.min_samples_split = min_samples_split - self.max_features = max_features - self.feature_combinations = feature_combinations - self.n_jobs = n_jobs - self.random_state = random_state - - def fit(self, X, y=None): - """Fit estimator. - Parameters - ---------- - X : array-like, shape=(n_samples, n_features) - Input data. Rows are observations and columns are features. - - Returns - ------- - self : object - - """ - - # Check that X and y have correct shape - X = check_array(X, accept_sparse=["csc"]) - num_features = X.shape[1] - num_obs = X.shape[0] - - # setup the forest's parameters - self.forest_ = pyfp.fpForest() - - if self.projection_matrix == "Base": - forestType = "urf" - elif self.projection_matrix == "RerF": - forestType = "urerf" - else: - raise ValueError("Incorrect projection matrix") - self.forest_.setParameter("forestType", forestType) - - self.forest_.setParameter("numTreesInForest", self.n_estimators) - - # if max_depth is not set, C++ sets to maximum integer size - if self.max_depth is not None: - self.forest_.setParameter("maxDepth", self.max_depth) - - if self.min_samples_split == "auto": - self.min_samples_split_ = round(num_obs ** (1 / 2)) - else: - self.min_samples_split_ = self.min_samples_split - self.forest_.setParameter("minParent", self.min_samples_split_) - - if self.n_jobs is None: - self.n_jobs_ = 1 - elif self.n_jobs == -1: - self.n_jobs_ = multiprocessing.cpu_count() - else: - self.n_jobs_ = self.n_jobs - self.forest_.setParameter("numCores", self.n_jobs_) - - if self.random_state is None: - self.random_state_ = np.random.randint(1, 1000000) - else: - self.random_state_ = self.random_state - self.forest_.setParameter("seed", self.random_state_) - - # need to set mtry here (using max_features and calc num_features): - if self.max_features in ("auto", "sqrt"): - self.mtry_ = int(num_features ** (1 / 2)) - elif self.max_features is None: - self.mtry_ = num_features - elif self.max_features == "log2": - self.mtry_ = int(np.log2(num_features)) - elif isinstance(self.max_features, int): - self.mtry_ = self.max_features - elif isinstance(self.max_features, float) and 0 <= self.max_features <= 1: - self.mtry_ = int(self.max_features * num_features) - else: - raise ValueError("max_features has unexpected value") - self.forest_.setParameter("mtry", self.mtry_) - - if self.feature_combinations == "auto": - self.feature_combinations_ = num_features - elif self.feature_combinations == "sqrt": - self.feature_combinations_ = num_features ** (1 / 2) - elif self.feature_combinations is None: - self.feature_combinations_ = num_features - elif self.feature_combinations == "log2": - self.feature_combinations_ = np.log2(num_features) - elif isinstance(self.feature_combinations, (int, float)): - self.feature_combinations_ = self.feature_combinations - else: - raise ValueError("feature_combinations_ has unexpected value") - self.forest_.setParameter("mtryMult", self.feature_combinations_) - - # Explicitly setting for numpy input - self.forest_.setParameter("useRowMajor", 1) - - # y will be ignored - y = np.zeros(num_obs) - self.forest_._growForestnumpy(X, y, num_obs, num_features) - - self.X_ = X - - return self - - def transform(self, return_sparse=False): - """Transform dataset into an affinity matrix / similarity matrix. - - Parameters - ---------- - - Returns - ------- - affinity_matrix : sparse matrix, shape=(n_samples, n_samples) - """ - - check_is_fitted(self, "forest_") - - pair_mat = self.forest_._return_pair_mat() - - sparse_mat = pair_mat_to_sparse(pair_mat, self.X_.shape[0], self.n_estimators) - - if return_sparse: - return sparse_mat - else: - return sparse_mat.toarray() - - -def pair_mat_to_sparse(pair_mat, n_obs, n_estimators): - i = [ij[0] for ij in pair_mat.keys()] - j = [ij[1] for ij in pair_mat.keys()] - data = [d / n_estimators for d in pair_mat.values()] - sparse_mat = csr_matrix((data, (i, j)), shape=(n_obs, n_obs), dtype=float) - - return sparse_mat