StochTree 0.1.1
Loading...
Searching...
No Matches
partition_tracker.h
1
25#ifndef STOCHTREE_PARTITION_TRACKER_H_
26#define STOCHTREE_PARTITION_TRACKER_H_
27
28#include <stochtree/data.h>
29#include <stochtree/ensemble.h>
30#include <stochtree/log.h>
31#include <stochtree/openmp_utils.h>
32#include <stochtree/tree.h>
33
34#include <numeric>
35#include <vector>
36
37namespace StochTree {
38
40class SampleNodeMapper;
41class SamplePredMapper;
42class UnsortedNodeSampleTracker;
43class SortedNodeSampleTracker;
44class FeaturePresortRootContainer;
45
48 public:
57 ForestTracker(Eigen::MatrixXd& covariates, std::vector<FeatureType>& feature_types, int num_trees, int num_observations);
59 void ReconstituteFromForest(TreeEnsemble& forest, ForestDataset& dataset, ColumnVector& residual, bool is_mean_model);
60 void AssignAllSamplesToRoot();
61 void AssignAllSamplesToRoot(int32_t tree_num);
62 void AssignAllSamplesToConstantPrediction(double value);
63 void AssignAllSamplesToConstantPrediction(int32_t tree_num, double value);
64 void UpdatePredictions(TreeEnsemble* ensemble, ForestDataset& dataset);
65 void UpdateSampleTrackers(TreeEnsemble& forest, ForestDataset& dataset);
66 void UpdateSampleTrackersResidual(TreeEnsemble& forest, ForestDataset& dataset, ColumnVector& residual, bool is_mean_model);
67 void ResetRoot(Eigen::MatrixXd& covariates, std::vector<FeatureType>& feature_types, int32_t tree_num);
68 void AddSplit(Eigen::MatrixXd& covariates, TreeSplit& split, int32_t split_feature, int32_t tree_id, int32_t split_node_id, int32_t left_node_id, int32_t right_node_id, bool keep_sorted = false, int num_threads = -1);
69 void RemoveSplit(Eigen::MatrixXd& covariates, Tree* tree, int32_t tree_id, int32_t split_node_id, int32_t left_node_id, int32_t right_node_id, bool keep_sorted = false);
70 double GetSamplePrediction(data_size_t sample_id);
71 double GetTreeSamplePrediction(data_size_t sample_id, int tree_id);
72 void UpdateVarWeightsFromInternalPredictions(ForestDataset& dataset);
73 void SetSamplePrediction(data_size_t sample_id, double value);
74 void SetTreeSamplePrediction(data_size_t sample_id, int tree_id, double value);
75 void SyncPredictions();
76 data_size_t GetNodeId(int observation_num, int tree_num);
77 data_size_t UnsortedNodeBegin(int tree_id, int node_id);
78 data_size_t UnsortedNodeEnd(int tree_id, int node_id);
79 data_size_t UnsortedNodeSize(int tree_id, int node_id);
80 data_size_t SortedNodeBegin(int node_id, int feature_id);
81 data_size_t SortedNodeEnd(int node_id, int feature_id);
82 data_size_t SortedNodeSize(int node_id, int feature_id);
83 std::vector<data_size_t>::iterator UnsortedNodeBeginIterator(int tree_id, int node_id);
84 std::vector<data_size_t>::iterator UnsortedNodeEndIterator(int tree_id, int node_id);
85 std::vector<data_size_t>::iterator SortedNodeBeginIterator(int node_id, int feature_id);
86 std::vector<data_size_t>::iterator SortedNodeEndIterator(int node_id, int feature_id);
87 SamplePredMapper* GetSamplePredMapper() {return sample_pred_mapper_.get();}
88 SampleNodeMapper* GetSampleNodeMapper() {return sample_node_mapper_.get();}
89 UnsortedNodeSampleTracker* GetUnsortedNodeSampleTracker() {return unsorted_node_sample_tracker_.get();}
90 SortedNodeSampleTracker* GetSortedNodeSampleTracker() {return sorted_node_sample_tracker_.get();}
91 int GetNumObservations() {return num_observations_;}
92 int GetNumTrees() {return num_trees_;}
93 int GetNumFeatures() {return num_features_;}
94 bool Initialized() {return initialized_;}
95
96 private:
98 std::vector<double> sum_predictions_;
100 std::unique_ptr<SamplePredMapper> sample_pred_mapper_;
102 std::unique_ptr<SampleNodeMapper> sample_node_mapper_;
106 std::unique_ptr<UnsortedNodeSampleTracker> unsorted_node_sample_tracker_;
110 std::unique_ptr<FeaturePresortRootContainer> presort_container_;
111 std::unique_ptr<SortedNodeSampleTracker> sorted_node_sample_tracker_;
112 std::vector<FeatureType> feature_types_;
113 int num_trees_;
114 int num_observations_;
115 int num_features_;
116 bool initialized_{false};
117
118 void UpdatePredictionsInternal(TreeEnsemble* ensemble, Eigen::MatrixXd& covariates, Eigen::MatrixXd& basis);
119 void UpdatePredictionsInternal(TreeEnsemble* ensemble, Eigen::MatrixXd& covariates);
120 void UpdateSampleTrackersInternal(TreeEnsemble& forest, Eigen::MatrixXd& covariates, Eigen::MatrixXd& basis);
121 void UpdateSampleTrackersInternal(TreeEnsemble& forest, Eigen::MatrixXd& covariates);
122 void UpdateSampleTrackersResidualInternalBasis(TreeEnsemble& forest, ForestDataset& dataset, ColumnVector& residual, bool is_mean_model);
123 void UpdateSampleTrackersResidualInternalNoBasis(TreeEnsemble& forest, ForestDataset& dataset, ColumnVector& residual, bool is_mean_model);
124};
125
128 public:
129 SamplePredMapper(int num_trees, data_size_t num_observations) {
130 num_trees_ = num_trees;
131 num_observations_ = num_observations;
132 // Initialize the vector of vectors of leaf indices for each tree
133 tree_preds_.resize(num_trees_);
134 for (int j = 0; j < num_trees_; j++) {
135 tree_preds_[j].resize(num_observations_);
136 }
137 }
138
139 inline double GetPred(data_size_t sample_id, int tree_id) {
140 CHECK_LT(sample_id, num_observations_);
141 CHECK_LT(tree_id, num_trees_);
142 return tree_preds_[tree_id][sample_id];
143 }
144
145 inline void SetPred(data_size_t sample_id, int tree_id, double value) {
146 CHECK_LT(sample_id, num_observations_);
147 CHECK_LT(tree_id, num_trees_);
148 tree_preds_[tree_id][sample_id] = value;
149 }
150
151 inline int NumTrees() {return num_trees_;}
152
153 inline int NumObservations() {return num_observations_;}
154
155 inline void AssignAllSamplesToConstantPrediction(int tree_id, double value) {
156 for (data_size_t i = 0; i < num_observations_; i++) {
157 tree_preds_[tree_id][i] = value;
158 }
159 }
160
161 private:
162 std::vector<std::vector<double>> tree_preds_;
163 int num_trees_;
164 data_size_t num_observations_;
165};
166
169 public:
170 SampleNodeMapper(int num_trees, data_size_t num_observations) {
171 num_trees_ = num_trees;
172 num_observations_ = num_observations;
173 // Initialize the vector of vectors of leaf indices for each tree
174 tree_observation_indices_.resize(num_trees_);
175 for (int j = 0; j < num_trees_; j++) {
176 tree_observation_indices_[j].resize(num_observations_);
177 }
178 }
179
181 num_trees_ = other.NumTrees();
182 num_observations_ = other.NumObservations();
183 // Initialize the vector of vectors of leaf indices for each tree
184 tree_observation_indices_.resize(num_trees_);
185 for (int j = 0; j < num_trees_; j++) {
186 tree_observation_indices_[j].resize(num_observations_);
187 for (int i = 0; i < num_observations_; i++) {
188 tree_observation_indices_[j][i] = other.GetNodeId(i, j);
189 }
190 }
191 }
192
193 void AddSplit(Eigen::MatrixXd& covariates, TreeSplit& split, int32_t split_feature, int32_t tree_id, int32_t split_node_id, int32_t left_node_id, int32_t right_node_id) {
194 CHECK_EQ(num_observations_, covariates.rows());
195 // Eigen::MatrixXd X = covariates.GetData();
196 for (int i = 0; i < num_observations_; i++) {
197 if (tree_observation_indices_[tree_id][i] == split_node_id) {
198 auto fvalue = covariates(i, split_feature);
199 if (split.SplitTrue(fvalue)) {
200 tree_observation_indices_[tree_id][i] = left_node_id;
201 } else {
202 tree_observation_indices_[tree_id][i] = right_node_id;
203 }
204 }
205 }
206 }
207
208 inline data_size_t GetNodeId(data_size_t sample_id, int tree_id) {
209 CHECK_LT(sample_id, num_observations_);
210 CHECK_LT(tree_id, num_trees_);
211 return tree_observation_indices_[tree_id][sample_id];
212 }
213
214 inline void SetNodeId(data_size_t sample_id, int tree_id, int node_id) {
215 CHECK_LT(sample_id, num_observations_);
216 CHECK_LT(tree_id, num_trees_);
217 tree_observation_indices_[tree_id][sample_id] = node_id;
218 }
219
220 inline int NumTrees() {return num_trees_;}
221
222 inline int NumObservations() {return num_observations_;}
223
224 inline void AssignAllSamplesToRoot(int tree_id) {
225 for (data_size_t i = 0; i < num_observations_; i++) {
226 tree_observation_indices_[tree_id][i] = 0;
227 }
228 }
229
230 private:
231 std::vector<std::vector<int>> tree_observation_indices_;
232 int num_trees_;
233 data_size_t num_observations_;
234};
235
238 public:
239 FeatureUnsortedPartition(data_size_t n);
240
243
245 void PartitionNode(Eigen::MatrixXd& covariates, int node_id, int left_node_id, int right_node_id, int feature_split, TreeSplit& split);
246
248 void PartitionNode(Eigen::MatrixXd& covariates, int node_id, int left_node_id, int right_node_id, int feature_split, double split_value);
249
251 void PartitionNode(Eigen::MatrixXd& covariates, int node_id, int left_node_id, int right_node_id, int feature_split, std::vector<std::uint32_t> const& category_list);
252
254 void PruneNodeToLeaf(int node_id);
255
257 bool IsLeaf(int node_id);
258
260 bool IsValidNode(int node_id);
261
263 bool LeftNodeIsLeaf(int node_id);
264
266 bool RightNodeIsLeaf(int node_id);
267
269 data_size_t NodeBegin(int node_id);
270
272 data_size_t NodeEnd(int node_id);
273
275 data_size_t NodeSize(int node_id);
276
278 int Parent(int node_id);
279
281 int LeftNode(int node_id);
282
284 int RightNode(int node_id);
285
287 std::vector<data_size_t> indices_;
288
290 std::vector<data_size_t> NodeIndices(int node_id);
291
293 void UpdateObservationMapping(int node_id, int tree_id, SampleNodeMapper* sample_node_mapper);
294
295 private:
296 // Vectors tracking indices in each node
297 std::vector<data_size_t> node_begin_;
298 std::vector<data_size_t> node_length_;
299 std::vector<int32_t> parent_nodes_;
300 std::vector<int32_t> left_nodes_;
301 std::vector<int32_t> right_nodes_;
302 int num_nodes_, num_deleted_nodes_;
303 std::vector<int> deleted_nodes_;
304
305 // Private helper functions
306 void ExpandNodeTrackingVectors(int node_id, int left_node_id, int right_node_id, data_size_t node_start_idx, data_size_t num_left, data_size_t num_right);
307 void ConvertLeafParentToLeaf(int node_id);
308};
309
312 public:
313 UnsortedNodeSampleTracker(data_size_t n, int num_trees) {
314 feature_partitions_.resize(num_trees);
315 num_trees_ = num_trees;
316 for (int i = 0; i < num_trees; i++) {
317 feature_partitions_[i].reset(new FeatureUnsortedPartition(n));
318 }
319 }
320
323
325 void PartitionTreeNode(Eigen::MatrixXd& covariates, int tree_id, int node_id, int left_node_id, int right_node_id, int feature_split, TreeSplit& split) {
326 return feature_partitions_[tree_id]->PartitionNode(covariates, node_id, left_node_id, right_node_id, feature_split, split);
327 }
328
330 void PartitionTreeNode(Eigen::MatrixXd& covariates, int tree_id, int node_id, int left_node_id, int right_node_id, int feature_split, double split_value) {
331 return feature_partitions_[tree_id]->PartitionNode(covariates, node_id, left_node_id, right_node_id, feature_split, split_value);
332 }
333
335 void PartitionTreeNode(Eigen::MatrixXd& covariates, int tree_id, int node_id, int left_node_id, int right_node_id, int feature_split, std::vector<std::uint32_t> const& category_list) {
336 return feature_partitions_[tree_id]->PartitionNode(covariates, node_id, left_node_id, right_node_id, feature_split, category_list);
337 }
338
340 void ResetTreeToRoot(int tree_id, data_size_t n) {
341 feature_partitions_[tree_id].reset(new FeatureUnsortedPartition(n));;
342 }
343
345 void PruneTreeNodeToLeaf(int tree_id, int node_id) {
346 return feature_partitions_[tree_id]->PruneNodeToLeaf(node_id);
347 }
348
350 bool IsLeaf(int tree_id, int node_id) {
351 return feature_partitions_[tree_id]->IsLeaf(node_id);
352 }
353
355 bool IsValidNode(int tree_id, int node_id) {
356 return feature_partitions_[tree_id]->IsValidNode(node_id);
357 }
358
360 bool LeftNodeIsLeaf(int tree_id, int node_id) {
361 return feature_partitions_[tree_id]->LeftNodeIsLeaf(node_id);
362 }
363
365 bool RightNodeIsLeaf(int tree_id, int node_id) {
366 return feature_partitions_[tree_id]->RightNodeIsLeaf(node_id);
367 }
368
370 data_size_t NodeBegin(int tree_id, int node_id) {
371 return feature_partitions_[tree_id]->NodeBegin(node_id);
372 }
373
375 data_size_t NodeEnd(int tree_id, int node_id) {
376 return feature_partitions_[tree_id]->NodeEnd(node_id);
377 }
378
379 std::vector<data_size_t>::iterator NodeBeginIterator(int tree_id, int node_id) {
380 data_size_t node_begin = feature_partitions_[tree_id]->NodeBegin(node_id);
381 auto begin_iter = feature_partitions_[tree_id]->indices_.begin();
382 return begin_iter + node_begin;
383 }
384
385 std::vector<data_size_t>::iterator NodeEndIterator(int tree_id, int node_id) {
386 int node_end = feature_partitions_[tree_id]->NodeEnd(node_id);
387 auto begin_iter = feature_partitions_[tree_id]->indices_.begin();
388 return begin_iter + node_end;
389 }
390
392 data_size_t NodeSize(int tree_id, int node_id) {
393 return feature_partitions_[tree_id]->NodeSize(node_id);
394 }
395
397 int Parent(int tree_id, int node_id) {
398 return feature_partitions_[tree_id]->Parent(node_id);
399 }
400
402 int LeftNode(int tree_id, int node_id) {
403 return feature_partitions_[tree_id]->LeftNode(node_id);
404 }
405
407 int RightNode(int tree_id, int node_id) {
408 return feature_partitions_[tree_id]->RightNode(node_id);
409 }
410
412 std::vector<data_size_t> TreeNodeIndices(int tree_id, int node_id) {
413 return feature_partitions_[tree_id]->NodeIndices(node_id);
414 }
415
417 void UpdateObservationMapping(int node_id, int tree_id, SampleNodeMapper* sample_node_mapper) {
418 feature_partitions_[tree_id]->UpdateObservationMapping(node_id, tree_id, sample_node_mapper);
419 }
420
422 void UpdateObservationMapping(Tree* tree, int tree_id, SampleNodeMapper* sample_node_mapper) {
423 std::vector<int> leaves = tree->GetLeaves();
424 int leaf;
425 for (int i = 0; i < leaves.size(); i++) {
426 leaf = leaves[i];
427 UpdateObservationMapping(leaf, tree_id, sample_node_mapper);
428 }
429 }
430
432 int NumTrees() { return num_trees_; }
433
435 FeatureUnsortedPartition* GetFeaturePartition(int i) { return feature_partitions_[i].get(); }
436
437 private:
438 // Vectors of feature partitions
439 std::vector<std::unique_ptr<FeatureUnsortedPartition>> feature_partitions_;
440 int num_trees_;
441};
442
445 public:
446 NodeOffsetSize(data_size_t node_offset, data_size_t node_size) : node_begin_{node_offset}, node_size_{node_size}, presorted_{false} {
447 node_end_ = node_begin_ + node_size_;
448 }
449
450 ~NodeOffsetSize() {}
451
452 void SetSorted() {presorted_ = true;}
453
454 bool IsSorted() {return presorted_;}
455
456 data_size_t Begin() {return node_begin_;}
457
458 data_size_t End() {return node_end_;}
459
460 data_size_t Size() {return node_size_;}
461
462 private:
463 data_size_t node_begin_;
464 data_size_t node_size_;
465 data_size_t node_end_;
466 bool presorted_;
467};
468
471
483 public:
484 FeaturePresortRoot(Eigen::MatrixXd& covariates, int32_t feature_index, FeatureType feature_type) {
485 feature_index_ = feature_index;
486 ArgsortRoot(covariates);
487 }
488
490
491 void ArgsortRoot(Eigen::MatrixXd& covariates) {
492 data_size_t num_obs = covariates.rows();
493
494 // Make a vector of indices from 0 to num_obs - 1
495 if (feature_sort_indices_.size() != num_obs){
496 feature_sort_indices_.resize(num_obs, 0);
497 }
498 std::iota(feature_sort_indices_.begin(), feature_sort_indices_.end(), 0);
499
500 // Define a custom comparator to be used with stable_sort:
501 // For every two indices l and r store as elements of `data_sort_indices_`,
502 // compare them for sorting purposes by indexing the covariate's raw data with both l and r
503 auto comp_op = [&](size_t const &l, size_t const &r) { return std::less<double>{}(covariates(l, feature_index_), covariates(r, feature_index_)); };
504 std::stable_sort(feature_sort_indices_.begin(), feature_sort_indices_.end(), comp_op);
505 }
506
507 private:
508 std::vector<data_size_t> feature_sort_indices_;
509 int32_t feature_index_;
510};
511
514 public:
515 FeaturePresortRootContainer(Eigen::MatrixXd& covariates, std::vector<FeatureType>& feature_types) {
516 num_features_ = covariates.cols();
517 feature_presort_.resize(num_features_);
518 for (int i = 0; i < num_features_; i++) {
519 feature_presort_[i].reset(new FeaturePresortRoot(covariates, i, feature_types[i]));
520 }
521 }
522
524
525 FeaturePresortRoot* GetFeaturePresort(int feature_num) {return feature_presort_[feature_num].get(); }
526
527 private:
528 std::vector<std::unique_ptr<FeaturePresortRoot>> feature_presort_;
529 int num_features_;
530};
531
543 public:
544 FeaturePresortPartition(FeaturePresortRoot* feature_presort_root, Eigen::MatrixXd& covariates, int32_t feature_index, FeatureType feature_type) {
545 // Unpack all feature details
546 feature_index_ = feature_index;
547 feature_type_ = feature_type;
548 num_obs_ = covariates.rows();
549 feature_sort_indices_ = feature_presort_root->feature_sort_indices_;
550
551 // Initialize new tree to root
552 data_size_t node_offset = 0;
553 node_offset_sizes_.emplace_back(node_offset, num_obs_);
554 }
555
557
559 void SplitFeature(Eigen::MatrixXd& covariates, int32_t node_id, int32_t feature_index, TreeSplit& split);
560
562 void SplitFeatureNumeric(Eigen::MatrixXd& covariates, int32_t node_id, int32_t feature_index, double split_value);
563
565 void SplitFeatureCategorical(Eigen::MatrixXd& covariates, int32_t node_id, int32_t feature_index, std::vector<std::uint32_t> const& category_list);
566
568 data_size_t NodeBegin(int32_t node_id) {return node_offset_sizes_[node_id].Begin();}
569
571 data_size_t NodeEnd(int32_t node_id) {return node_offset_sizes_[node_id].End();}
572
574 data_size_t NodeSize(int32_t node_id) {return node_offset_sizes_[node_id].Size();}
575
577 std::vector<data_size_t> NodeIndices(int node_id);
578
580 data_size_t SortIndex(data_size_t j) {return feature_sort_indices_[j];}
581
583 FeatureType GetFeatureType() {return feature_type_;}
584
586 void UpdateObservationMapping(int node_id, int tree_id, SampleNodeMapper* sample_node_mapper);
587
589 std::vector<data_size_t> feature_sort_indices_;
590 private:
592 void AddLeftRightNodes(data_size_t left_node_begin, data_size_t left_node_size, data_size_t right_node_begin, data_size_t right_node_size);
593
595 std::vector<NodeOffsetSize> node_offset_sizes_;
596 int32_t feature_index_;
597 FeatureType feature_type_;
598 data_size_t num_obs_;
599};
600
603 public:
604 SortedNodeSampleTracker(FeaturePresortRootContainer* feature_presort_root_container, Eigen::MatrixXd& covariates, std::vector<FeatureType>& feature_types) {
605 num_features_ = covariates.cols();
606 feature_partitions_.resize(num_features_);
607 FeaturePresortRoot* feature_presort_root;
608 for (int i = 0; i < num_features_; i++) {
609 feature_presort_root = feature_presort_root_container->GetFeaturePresort(i);
610 feature_partitions_[i].reset(new FeaturePresortPartition(feature_presort_root, covariates, i, feature_types[i]));
611 }
612 }
613
615 void PartitionNode(Eigen::MatrixXd& covariates, int node_id, int feature_split, TreeSplit& split, int num_threads = -1) {
616 StochTree::ParallelFor(0, num_features_, num_threads, [&](int i) {
617 feature_partitions_[i]->SplitFeature(covariates, node_id, feature_split, split);
618 });
619 }
620
622 void PartitionNode(Eigen::MatrixXd& covariates, int node_id, int feature_split, double split_value, int num_threads = -1) {
623 StochTree::ParallelFor(0, num_features_, num_threads, [&](int i) {
624 feature_partitions_[i]->SplitFeatureNumeric(covariates, node_id, feature_split, split_value);
625 });
626 }
627
629 void PartitionNode(Eigen::MatrixXd& covariates, int node_id, int feature_split, std::vector<std::uint32_t> const& category_list, int num_threads = -1) {
630 StochTree::ParallelFor(0, num_features_, num_threads, [&](int i) {
631 feature_partitions_[i]->SplitFeatureCategorical(covariates, node_id, feature_split, category_list);
632 });
633 }
634
636 data_size_t NodeBegin(int node_id, int feature_index) {
637 return feature_partitions_[feature_index]->NodeBegin(node_id);
638 }
639
641 data_size_t NodeEnd(int node_id, int feature_index) {
642 return feature_partitions_[feature_index]->NodeEnd(node_id);
643 }
644
646 data_size_t NodeSize(int node_id, int feature_index) {
647 return feature_partitions_[feature_index]->NodeSize(node_id);
648 }
649
650 std::vector<data_size_t>::iterator NodeBeginIterator(int node_id, int feature_index) {
651 data_size_t node_begin = NodeBegin(node_id, feature_index);
652 auto begin_iter = feature_partitions_[feature_index]->feature_sort_indices_.begin();
653 return begin_iter + node_begin;
654 }
655
656 std::vector<data_size_t>::iterator NodeEndIterator(int node_id, int feature_index) {
657 data_size_t node_end = NodeEnd(node_id, feature_index);
658 auto begin_iter = feature_partitions_[feature_index]->feature_sort_indices_.begin();
659 return begin_iter + node_end;
660 }
661
663 std::vector<data_size_t> NodeIndices(int node_id, int feature_index) {
664 return feature_partitions_[feature_index]->NodeIndices(node_id);
665 }
666
668 data_size_t SortIndex(data_size_t j, int feature_index) {return feature_partitions_[feature_index]->SortIndex(j); }
669
671 void UpdateObservationMapping(int node_id, int tree_id, SampleNodeMapper* sample_node_mapper, int feature_index = 0) {
672 feature_partitions_[feature_index]->UpdateObservationMapping(node_id, tree_id, sample_node_mapper);
673 }
674
675 private:
676 std::vector<std::unique_ptr<FeaturePresortPartition>> feature_partitions_;
677 int num_features_;
678};
679
680} // namespace StochTree
681
682#endif // STOCHTREE_PARTITION_TRACKER_H_
Internal wrapper around Eigen::VectorXd interface for univariate floating point data....
Definition data.h:193
Data structure that tracks pre-sorted feature values through a tree's split lifecycle.
Definition partition_tracker.h:542
void SplitFeature(Eigen::MatrixXd &covariates, int32_t node_id, int32_t feature_index, TreeSplit &split)
Split numeric / ordered categorical feature and update sort indices.
data_size_t NodeEnd(int32_t node_id)
End position of node indexed by node_id.
Definition partition_tracker.h:571
std::vector< data_size_t > NodeIndices(int node_id)
Data indices for a given node.
std::vector< data_size_t > feature_sort_indices_
Feature sort indices.
Definition partition_tracker.h:589
data_size_t NodeSize(int32_t node_id)
Size (in observations) of node indexed by node_id.
Definition partition_tracker.h:574
FeatureType GetFeatureType()
Feature type.
Definition partition_tracker.h:583
data_size_t SortIndex(data_size_t j)
Feature sort index j.
Definition partition_tracker.h:580
data_size_t NodeBegin(int32_t node_id)
Start position of node indexed by node_id.
Definition partition_tracker.h:568
void UpdateObservationMapping(int node_id, int tree_id, SampleNodeMapper *sample_node_mapper)
Update SampleNodeMapper for all the observations in node_id.
void SplitFeatureNumeric(Eigen::MatrixXd &covariates, int32_t node_id, int32_t feature_index, double split_value)
Split numeric / ordered categorical feature and update sort indices.
void SplitFeatureCategorical(Eigen::MatrixXd &covariates, int32_t node_id, int32_t feature_index, std::vector< std::uint32_t > const &category_list)
Split unordered categorical feature and update sort indices.
Container class for FeaturePresortRoot objects stored for every feature in a dataset.
Definition partition_tracker.h:513
Data structure for presorting a feature by its values.
Definition partition_tracker.h:481
Mapping nodes to the indices they contain.
Definition partition_tracker.h:237
void PartitionNode(Eigen::MatrixXd &covariates, int node_id, int left_node_id, int right_node_id, int feature_split, double split_value)
Partition a node based on a new split rule.
void PartitionNode(Eigen::MatrixXd &covariates, int node_id, int left_node_id, int right_node_id, int feature_split, TreeSplit &split)
Partition a node based on a new split rule.
int RightNode(int node_id)
Right child of node_id.
int Parent(int node_id)
Parent node_id.
data_size_t NodeEnd(int node_id)
One past the last index of data points contained in node_id.
std::vector< data_size_t > indices_
Data indices.
Definition partition_tracker.h:287
void PartitionNode(Eigen::MatrixXd &covariates, int node_id, int left_node_id, int right_node_id, int feature_split, std::vector< std::uint32_t > const &category_list)
Partition a node based on a new split rule.
bool RightNodeIsLeaf(int node_id)
Whether node_id's right child is a leaf.
void UpdateObservationMapping(int node_id, int tree_id, SampleNodeMapper *sample_node_mapper)
Update SampleNodeMapper for all the observations in node_id.
data_size_t NodeSize(int node_id)
Number of data points contained in node_id.
std::vector< data_size_t > NodeIndices(int node_id)
Data indices for a given node.
void ReconstituteFromTree(Tree &tree, ForestDataset &dataset)
Reconstitute a tree partition tracker from root based on a tree.
bool IsLeaf(int node_id)
Whether node_id is a leaf.
void PruneNodeToLeaf(int node_id)
Convert a (currently split) node to a leaf.
bool LeftNodeIsLeaf(int node_id)
Whether node_id's left child is a leaf.
int LeftNode(int node_id)
Left child of node_id.
bool IsValidNode(int node_id)
Whether node_id is a valid node.
data_size_t NodeBegin(int node_id)
First index of data points contained in node_id.
API for loading and accessing data used to sample tree ensembles The covariates / bases / weights use...
Definition data.h:271
"Superclass" wrapper around tracking data structures for forest sampling algorithms
Definition partition_tracker.h:47
ForestTracker(Eigen::MatrixXd &covariates, std::vector< FeatureType > &feature_types, int num_trees, int num_observations)
Construct a new ForestTracker object.
Tracking cutpoints available at a given node.
Definition partition_tracker.h:444
Class storing sample-node map for each tree in an ensemble.
Definition partition_tracker.h:168
Class storing sample-prediction map for each tree in an ensemble.
Definition partition_tracker.h:127
Data structure for tracking observations through a tree partition with each feature pre-sorted.
Definition partition_tracker.h:602
void PartitionNode(Eigen::MatrixXd &covariates, int node_id, int feature_split, TreeSplit &split, int num_threads=-1)
Partition a node based on a new split rule.
Definition partition_tracker.h:615
data_size_t NodeSize(int node_id, int feature_index)
One past the last index of data points contained in node_id.
Definition partition_tracker.h:646
void UpdateObservationMapping(int node_id, int tree_id, SampleNodeMapper *sample_node_mapper, int feature_index=0)
Update SampleNodeMapper for all the observations in node_id.
Definition partition_tracker.h:671
void PartitionNode(Eigen::MatrixXd &covariates, int node_id, int feature_split, double split_value, int num_threads=-1)
Partition a node based on a new split rule.
Definition partition_tracker.h:622
std::vector< data_size_t > NodeIndices(int node_id, int feature_index)
Data indices for a given node.
Definition partition_tracker.h:663
data_size_t SortIndex(data_size_t j, int feature_index)
Feature sort index j for feature_index.
Definition partition_tracker.h:668
data_size_t NodeBegin(int node_id, int feature_index)
First index of data points contained in node_id.
Definition partition_tracker.h:636
void PartitionNode(Eigen::MatrixXd &covariates, int node_id, int feature_split, std::vector< std::uint32_t > const &category_list, int num_threads=-1)
Partition a node based on a new split rule.
Definition partition_tracker.h:629
data_size_t NodeEnd(int node_id, int feature_index)
One past the last index of data points contained in node_id.
Definition partition_tracker.h:641
Class storing a "forest," or an ensemble of decision trees.
Definition ensemble.h:31
Representation of arbitrary tree split rules, including numeric split rules (X[,i] <= c) and categori...
Definition tree.h:958
bool SplitTrue(double fvalue)
Whether a given covariate value is True or False on the rule defined by a TreeSplit object.
Definition tree.h:990
Decision tree data structure.
Definition tree.h:66
std::vector< std::int32_t > const & GetLeaves() const
Get indices of all leaf nodes.
Definition tree.h:564
Mapping nodes to the indices they contain.
Definition partition_tracker.h:311
void PartitionTreeNode(Eigen::MatrixXd &covariates, int tree_id, int node_id, int left_node_id, int right_node_id, int feature_split, std::vector< std::uint32_t > const &category_list)
Partition a node based on a new split rule.
Definition partition_tracker.h:335
void UpdateObservationMapping(Tree *tree, int tree_id, SampleNodeMapper *sample_node_mapper)
Update SampleNodeMapper for all the observations in tree.
Definition partition_tracker.h:422
int RightNode(int tree_id, int node_id)
Right child of node_id.
Definition partition_tracker.h:407
bool IsLeaf(int tree_id, int node_id)
Whether node_id is a leaf.
Definition partition_tracker.h:350
bool RightNodeIsLeaf(int tree_id, int node_id)
Whether node_id's right child is a leaf.
Definition partition_tracker.h:365
bool IsValidNode(int tree_id, int node_id)
Whether node_id is a valid node.
Definition partition_tracker.h:355
void UpdateObservationMapping(int node_id, int tree_id, SampleNodeMapper *sample_node_mapper)
Update SampleNodeMapper for all the observations in node_id.
Definition partition_tracker.h:417
data_size_t NodeBegin(int tree_id, int node_id)
First index of data points contained in node_id.
Definition partition_tracker.h:370
int NumTrees()
Number of trees.
Definition partition_tracker.h:432
int LeftNode(int tree_id, int node_id)
Left child of node_id.
Definition partition_tracker.h:402
void ResetTreeToRoot(int tree_id, data_size_t n)
Convert a tree to root.
Definition partition_tracker.h:340
void PartitionTreeNode(Eigen::MatrixXd &covariates, int tree_id, int node_id, int left_node_id, int right_node_id, int feature_split, TreeSplit &split)
Partition a node based on a new split rule.
Definition partition_tracker.h:325
data_size_t NodeSize(int tree_id, int node_id)
One past the last index of data points contained in node_id.
Definition partition_tracker.h:392
bool LeftNodeIsLeaf(int tree_id, int node_id)
Whether node_id's left child is a leaf.
Definition partition_tracker.h:360
int Parent(int tree_id, int node_id)
Parent node_id.
Definition partition_tracker.h:397
void ReconstituteFromForest(TreeEnsemble &forest, ForestDataset &dataset)
Reconstruct the node sample tracker based on the splits in a forest.
data_size_t NodeEnd(int tree_id, int node_id)
One past the last index of data points contained in node_id.
Definition partition_tracker.h:375
void PartitionTreeNode(Eigen::MatrixXd &covariates, int tree_id, int node_id, int left_node_id, int right_node_id, int feature_split, double split_value)
Partition a node based on a new split rule.
Definition partition_tracker.h:330
FeatureUnsortedPartition * GetFeaturePartition(int i)
Return a pointer to the feature partition tracking tree i.
Definition partition_tracker.h:435
std::vector< data_size_t > TreeNodeIndices(int tree_id, int node_id)
Data indices for a given node.
Definition partition_tracker.h:412
void PruneTreeNodeToLeaf(int tree_id, int node_id)
Convert a (currently split) node to a leaf.
Definition partition_tracker.h:345
Definition category_tracker.h:36