StochTree 0.4.1.9000
Loading...
Searching...
No Matches
partition_tracker.h
1
25#ifndef STOCHTREE_PARTITION_TRACKER_H_
26#define STOCHTREE_PARTITION_TRACKER_H_
27
28#include <stochtree/data.h>
29#include <stochtree/ensemble.h>
30#include <stochtree/log.h>
31#include <stochtree/openmp_utils.h>
32#include <stochtree/tree.h>
33
34#include <numeric>
35
36namespace StochTree {
37
39class SampleNodeMapper;
40class SamplePredMapper;
41class UnsortedNodeSampleTracker;
42class SortedNodeSampleTracker;
43class FeaturePresortRootContainer;
44
47 public:
56 ForestTracker(Eigen::MatrixXd& covariates, std::vector<FeatureType>& feature_types, int num_trees, int num_observations);
58 void ReconstituteFromForest(TreeEnsemble& forest, ForestDataset& dataset, ColumnVector& residual, bool is_mean_model);
59 void AssignAllSamplesToRoot();
60 void AssignAllSamplesToRoot(int32_t tree_num);
61 void AssignAllSamplesToConstantPrediction(double value);
62 void AssignAllSamplesToConstantPrediction(int32_t tree_num, double value);
63 void UpdatePredictions(TreeEnsemble* ensemble, ForestDataset& dataset);
64 void UpdateSampleTrackers(TreeEnsemble& forest, ForestDataset& dataset);
65 void UpdateSampleTrackersResidual(TreeEnsemble& forest, ForestDataset& dataset, ColumnVector& residual, bool is_mean_model);
66 void ResetRoot(Eigen::MatrixXd& covariates, std::vector<FeatureType>& feature_types, int32_t tree_num);
67 void AddSplit(Eigen::MatrixXd& covariates, TreeSplit& split, int32_t split_feature, int32_t tree_id, int32_t split_node_id, int32_t left_node_id, int32_t right_node_id, bool keep_sorted = false, int num_threads = -1);
68 void RemoveSplit(Eigen::MatrixXd& covariates, Tree* tree, int32_t tree_id, int32_t split_node_id, int32_t left_node_id, int32_t right_node_id, bool keep_sorted = false);
69 double GetSamplePrediction(data_size_t sample_id);
70 double GetTreeSamplePrediction(data_size_t sample_id, int tree_id);
71 void UpdateVarWeightsFromInternalPredictions(ForestDataset& dataset);
72 void SetSamplePrediction(data_size_t sample_id, double value);
73 void SetTreeSamplePrediction(data_size_t sample_id, int tree_id, double value);
74 void SyncPredictions();
75 data_size_t GetNodeId(int observation_num, int tree_num);
76 data_size_t UnsortedNodeBegin(int tree_id, int node_id);
77 data_size_t UnsortedNodeEnd(int tree_id, int node_id);
78 data_size_t UnsortedNodeSize(int tree_id, int node_id);
79 data_size_t SortedNodeBegin(int node_id, int feature_id);
80 data_size_t SortedNodeEnd(int node_id, int feature_id);
81 data_size_t SortedNodeSize(int node_id, int feature_id);
82 std::vector<data_size_t>::iterator UnsortedNodeBeginIterator(int tree_id, int node_id);
83 std::vector<data_size_t>::iterator UnsortedNodeEndIterator(int tree_id, int node_id);
84 std::vector<data_size_t>::iterator SortedNodeBeginIterator(int node_id, int feature_id);
85 std::vector<data_size_t>::iterator SortedNodeEndIterator(int node_id, int feature_id);
86 SamplePredMapper* GetSamplePredMapper() {return sample_pred_mapper_.get();}
87 SampleNodeMapper* GetSampleNodeMapper() {return sample_node_mapper_.get();}
88 UnsortedNodeSampleTracker* GetUnsortedNodeSampleTracker() {return unsorted_node_sample_tracker_.get();}
89 SortedNodeSampleTracker* GetSortedNodeSampleTracker() {return sorted_node_sample_tracker_.get();}
90 int GetNumObservations() {return num_observations_;}
91 int GetNumTrees() {return num_trees_;}
92 int GetNumFeatures() {return num_features_;}
93 bool Initialized() {return initialized_;}
94 private:
96 std::vector<double> sum_predictions_;
98 std::unique_ptr<SamplePredMapper> sample_pred_mapper_;
100 std::unique_ptr<SampleNodeMapper> sample_node_mapper_;
104 std::unique_ptr<UnsortedNodeSampleTracker> unsorted_node_sample_tracker_;
108 std::unique_ptr<FeaturePresortRootContainer> presort_container_;
109 std::unique_ptr<SortedNodeSampleTracker> sorted_node_sample_tracker_;
110 std::vector<FeatureType> feature_types_;
111 int num_trees_;
112 int num_observations_;
113 int num_features_;
114 bool initialized_{false};
115
116 void UpdatePredictionsInternal(TreeEnsemble* ensemble, Eigen::MatrixXd& covariates, Eigen::MatrixXd& basis);
117 void UpdatePredictionsInternal(TreeEnsemble* ensemble, Eigen::MatrixXd& covariates);
118 void UpdateSampleTrackersInternal(TreeEnsemble& forest, Eigen::MatrixXd& covariates, Eigen::MatrixXd& basis);
119 void UpdateSampleTrackersInternal(TreeEnsemble& forest, Eigen::MatrixXd& covariates);
120 void UpdateSampleTrackersResidualInternalBasis(TreeEnsemble& forest, ForestDataset& dataset, ColumnVector& residual, bool is_mean_model);
121 void UpdateSampleTrackersResidualInternalNoBasis(TreeEnsemble& forest, ForestDataset& dataset, ColumnVector& residual, bool is_mean_model);
122};
123
126 public:
127 SamplePredMapper(int num_trees, data_size_t num_observations) {
128 num_trees_ = num_trees;
129 num_observations_ = num_observations;
130 // Initialize the vector of vectors of leaf indices for each tree
131 tree_preds_.resize(num_trees_);
132 for (int j = 0; j < num_trees_; j++) {
133 tree_preds_[j].resize(num_observations_);
134 }
135 }
136
137 inline double GetPred(data_size_t sample_id, int tree_id) {
138 CHECK_LT(sample_id, num_observations_);
139 CHECK_LT(tree_id, num_trees_);
140 return tree_preds_[tree_id][sample_id];
141 }
142
143 inline void SetPred(data_size_t sample_id, int tree_id, double value) {
144 CHECK_LT(sample_id, num_observations_);
145 CHECK_LT(tree_id, num_trees_);
146 tree_preds_[tree_id][sample_id] = value;
147 }
148
149 inline int NumTrees() {return num_trees_;}
150
151 inline int NumObservations() {return num_observations_;}
152
153 inline void AssignAllSamplesToConstantPrediction(int tree_id, double value) {
154 for (data_size_t i = 0; i < num_observations_; i++) {
155 tree_preds_[tree_id][i] = value;
156 }
157 }
158
159 private:
160 std::vector<std::vector<double>> tree_preds_;
161 int num_trees_;
162 data_size_t num_observations_;
163};
164
167 public:
168 SampleNodeMapper(int num_trees, data_size_t num_observations) {
169 num_trees_ = num_trees;
170 num_observations_ = num_observations;
171 // Initialize the vector of vectors of leaf indices for each tree
172 tree_observation_indices_.resize(num_trees_);
173 for (int j = 0; j < num_trees_; j++) {
174 tree_observation_indices_[j].resize(num_observations_);
175 }
176 }
177
179 num_trees_ = other.NumTrees();
180 num_observations_ = other.NumObservations();
181 // Initialize the vector of vectors of leaf indices for each tree
182 tree_observation_indices_.resize(num_trees_);
183 for (int j = 0; j < num_trees_; j++) {
184 tree_observation_indices_[j].resize(num_observations_);
185 for (int i = 0; i < num_observations_; i++) {
186 tree_observation_indices_[j][i] = other.GetNodeId(i, j);
187 }
188 }
189 }
190
191 void AddSplit(Eigen::MatrixXd& covariates, TreeSplit& split, int32_t split_feature, int32_t tree_id, int32_t split_node_id, int32_t left_node_id, int32_t right_node_id) {
192 CHECK_EQ(num_observations_, covariates.rows());
193 // Eigen::MatrixXd X = covariates.GetData();
194 for (int i = 0; i < num_observations_; i++) {
195 if (tree_observation_indices_[tree_id][i] == split_node_id) {
196 auto fvalue = covariates(i, split_feature);
197 if (split.SplitTrue(fvalue)) {
198 tree_observation_indices_[tree_id][i] = left_node_id;
199 } else {
200 tree_observation_indices_[tree_id][i] = right_node_id;
201 }
202 }
203 }
204 }
205
206 inline data_size_t GetNodeId(data_size_t sample_id, int tree_id) {
207 CHECK_LT(sample_id, num_observations_);
208 CHECK_LT(tree_id, num_trees_);
209 return tree_observation_indices_[tree_id][sample_id];
210 }
211
212 inline void SetNodeId(data_size_t sample_id, int tree_id, int node_id) {
213 CHECK_LT(sample_id, num_observations_);
214 CHECK_LT(tree_id, num_trees_);
215 tree_observation_indices_[tree_id][sample_id] = node_id;
216 }
217
218 inline int NumTrees() {return num_trees_;}
219
220 inline int NumObservations() {return num_observations_;}
221
222 inline void AssignAllSamplesToRoot(int tree_id) {
223 for (data_size_t i = 0; i < num_observations_; i++) {
224 tree_observation_indices_[tree_id][i] = 0;
225 }
226 }
227
228 private:
229 std::vector<std::vector<int>> tree_observation_indices_;
230 int num_trees_;
231 data_size_t num_observations_;
232};
233
236 public:
237 FeatureUnsortedPartition(data_size_t n);
238
241
243 void PartitionNode(Eigen::MatrixXd& covariates, int node_id, int left_node_id, int right_node_id, int feature_split, TreeSplit& split);
244
246 void PartitionNode(Eigen::MatrixXd& covariates, int node_id, int left_node_id, int right_node_id, int feature_split, double split_value);
247
249 void PartitionNode(Eigen::MatrixXd& covariates, int node_id, int left_node_id, int right_node_id, int feature_split, std::vector<std::uint32_t> const& category_list);
250
252 void PruneNodeToLeaf(int node_id);
253
255 bool IsLeaf(int node_id);
256
258 bool IsValidNode(int node_id);
259
261 bool LeftNodeIsLeaf(int node_id);
262
264 bool RightNodeIsLeaf(int node_id);
265
267 data_size_t NodeBegin(int node_id);
268
270 data_size_t NodeEnd(int node_id);
271
273 data_size_t NodeSize(int node_id);
274
276 int Parent(int node_id);
277
279 int LeftNode(int node_id);
280
282 int RightNode(int node_id);
283
285 std::vector<data_size_t> indices_;
286
288 std::vector<data_size_t> NodeIndices(int node_id);
289
291 void UpdateObservationMapping(int node_id, int tree_id, SampleNodeMapper* sample_node_mapper);
292
293 private:
294 // Vectors tracking indices in each node
295 std::vector<data_size_t> node_begin_;
296 std::vector<data_size_t> node_length_;
297 std::vector<int32_t> parent_nodes_;
298 std::vector<int32_t> left_nodes_;
299 std::vector<int32_t> right_nodes_;
300 int num_nodes_, num_deleted_nodes_;
301 std::vector<int> deleted_nodes_;
302
303 // Private helper functions
304 void ExpandNodeTrackingVectors(int node_id, int left_node_id, int right_node_id, data_size_t node_start_idx, data_size_t num_left, data_size_t num_right);
305 void ConvertLeafParentToLeaf(int node_id);
306};
307
310 public:
311 UnsortedNodeSampleTracker(data_size_t n, int num_trees) {
312 feature_partitions_.resize(num_trees);
313 num_trees_ = num_trees;
314 for (int i = 0; i < num_trees; i++) {
315 feature_partitions_[i].reset(new FeatureUnsortedPartition(n));
316 }
317 }
318
321
323 void PartitionTreeNode(Eigen::MatrixXd& covariates, int tree_id, int node_id, int left_node_id, int right_node_id, int feature_split, TreeSplit& split) {
324 return feature_partitions_[tree_id]->PartitionNode(covariates, node_id, left_node_id, right_node_id, feature_split, split);
325 }
326
328 void PartitionTreeNode(Eigen::MatrixXd& covariates, int tree_id, int node_id, int left_node_id, int right_node_id, int feature_split, double split_value) {
329 return feature_partitions_[tree_id]->PartitionNode(covariates, node_id, left_node_id, right_node_id, feature_split, split_value);
330 }
331
333 void PartitionTreeNode(Eigen::MatrixXd& covariates, int tree_id, int node_id, int left_node_id, int right_node_id, int feature_split, std::vector<std::uint32_t> const& category_list) {
334 return feature_partitions_[tree_id]->PartitionNode(covariates, node_id, left_node_id, right_node_id, feature_split, category_list);
335 }
336
338 void ResetTreeToRoot(int tree_id, data_size_t n) {
339 feature_partitions_[tree_id].reset(new FeatureUnsortedPartition(n));;
340 }
341
343 void PruneTreeNodeToLeaf(int tree_id, int node_id) {
344 return feature_partitions_[tree_id]->PruneNodeToLeaf(node_id);
345 }
346
348 bool IsLeaf(int tree_id, int node_id) {
349 return feature_partitions_[tree_id]->IsLeaf(node_id);
350 }
351
353 bool IsValidNode(int tree_id, int node_id) {
354 return feature_partitions_[tree_id]->IsValidNode(node_id);
355 }
356
358 bool LeftNodeIsLeaf(int tree_id, int node_id) {
359 return feature_partitions_[tree_id]->LeftNodeIsLeaf(node_id);
360 }
361
363 bool RightNodeIsLeaf(int tree_id, int node_id) {
364 return feature_partitions_[tree_id]->RightNodeIsLeaf(node_id);
365 }
366
368 data_size_t NodeBegin(int tree_id, int node_id) {
369 return feature_partitions_[tree_id]->NodeBegin(node_id);
370 }
371
373 data_size_t NodeEnd(int tree_id, int node_id) {
374 return feature_partitions_[tree_id]->NodeEnd(node_id);
375 }
376
377 std::vector<data_size_t>::iterator NodeBeginIterator(int tree_id, int node_id) {
378 data_size_t node_begin = feature_partitions_[tree_id]->NodeBegin(node_id);
379 auto begin_iter = feature_partitions_[tree_id]->indices_.begin();
380 return begin_iter + node_begin;
381 }
382
383 std::vector<data_size_t>::iterator NodeEndIterator(int tree_id, int node_id) {
384 int node_end = feature_partitions_[tree_id]->NodeEnd(node_id);
385 auto begin_iter = feature_partitions_[tree_id]->indices_.begin();
386 return begin_iter + node_end;
387 }
388
390 data_size_t NodeSize(int tree_id, int node_id) {
391 return feature_partitions_[tree_id]->NodeSize(node_id);
392 }
393
395 int Parent(int tree_id, int node_id) {
396 return feature_partitions_[tree_id]->Parent(node_id);
397 }
398
400 int LeftNode(int tree_id, int node_id) {
401 return feature_partitions_[tree_id]->LeftNode(node_id);
402 }
403
405 int RightNode(int tree_id, int node_id) {
406 return feature_partitions_[tree_id]->RightNode(node_id);
407 }
408
410 std::vector<data_size_t> TreeNodeIndices(int tree_id, int node_id) {
411 return feature_partitions_[tree_id]->NodeIndices(node_id);
412 }
413
415 void UpdateObservationMapping(int node_id, int tree_id, SampleNodeMapper* sample_node_mapper) {
416 feature_partitions_[tree_id]->UpdateObservationMapping(node_id, tree_id, sample_node_mapper);
417 }
418
420 void UpdateObservationMapping(Tree* tree, int tree_id, SampleNodeMapper* sample_node_mapper) {
421 std::vector<int> leaves = tree->GetLeaves();
422 int leaf;
423 for (int i = 0; i < leaves.size(); i++) {
424 leaf = leaves[i];
425 UpdateObservationMapping(leaf, tree_id, sample_node_mapper);
426 }
427 }
428
430 int NumTrees() { return num_trees_; }
431
433 FeatureUnsortedPartition* GetFeaturePartition(int i) { return feature_partitions_[i].get(); }
434
435 private:
436 // Vectors of feature partitions
437 std::vector<std::unique_ptr<FeatureUnsortedPartition>> feature_partitions_;
438 int num_trees_;
439};
440
443 public:
444 NodeOffsetSize(data_size_t node_offset, data_size_t node_size) : node_begin_{node_offset}, node_size_{node_size}, presorted_{false} {
445 node_end_ = node_begin_ + node_size_;
446 }
447
448 ~NodeOffsetSize() {}
449
450 void SetSorted() {presorted_ = true;}
451
452 bool IsSorted() {return presorted_;}
453
454 data_size_t Begin() {return node_begin_;}
455
456 data_size_t End() {return node_end_;}
457
458 data_size_t Size() {return node_size_;}
459
460 private:
461 data_size_t node_begin_;
462 data_size_t node_size_;
463 data_size_t node_end_;
464 bool presorted_;
465};
466
469
481 public:
482 FeaturePresortRoot(Eigen::MatrixXd& covariates, int32_t feature_index, FeatureType feature_type) {
483 feature_index_ = feature_index;
484 ArgsortRoot(covariates);
485 }
486
488
489 void ArgsortRoot(Eigen::MatrixXd& covariates) {
490 data_size_t num_obs = covariates.rows();
491
492 // Make a vector of indices from 0 to num_obs - 1
493 if (feature_sort_indices_.size() != num_obs){
494 feature_sort_indices_.resize(num_obs, 0);
495 }
496 std::iota(feature_sort_indices_.begin(), feature_sort_indices_.end(), 0);
497
498 // Define a custom comparator to be used with stable_sort:
499 // For every two indices l and r store as elements of `data_sort_indices_`,
500 // compare them for sorting purposes by indexing the covariate's raw data with both l and r
501 auto comp_op = [&](size_t const &l, size_t const &r) { return std::less<double>{}(covariates(l, feature_index_), covariates(r, feature_index_)); };
502 std::stable_sort(feature_sort_indices_.begin(), feature_sort_indices_.end(), comp_op);
503 }
504
505 private:
506 std::vector<data_size_t> feature_sort_indices_;
507 int32_t feature_index_;
508};
509
512 public:
513 FeaturePresortRootContainer(Eigen::MatrixXd& covariates, std::vector<FeatureType>& feature_types) {
514 num_features_ = covariates.cols();
515 feature_presort_.resize(num_features_);
516 for (int i = 0; i < num_features_; i++) {
517 feature_presort_[i].reset(new FeaturePresortRoot(covariates, i, feature_types[i]));
518 }
519 }
520
522
523 FeaturePresortRoot* GetFeaturePresort(int feature_num) {return feature_presort_[feature_num].get(); }
524
525 private:
526 std::vector<std::unique_ptr<FeaturePresortRoot>> feature_presort_;
527 int num_features_;
528};
529
541 public:
542 FeaturePresortPartition(FeaturePresortRoot* feature_presort_root, Eigen::MatrixXd& covariates, int32_t feature_index, FeatureType feature_type) {
543 // Unpack all feature details
544 feature_index_ = feature_index;
545 feature_type_ = feature_type;
546 num_obs_ = covariates.rows();
547 feature_sort_indices_ = feature_presort_root->feature_sort_indices_;
548
549 // Initialize new tree to root
550 data_size_t node_offset = 0;
551 node_offset_sizes_.emplace_back(node_offset, num_obs_);
552 }
553
555
557 void SplitFeature(Eigen::MatrixXd& covariates, int32_t node_id, int32_t feature_index, TreeSplit& split);
558
560 void SplitFeatureNumeric(Eigen::MatrixXd& covariates, int32_t node_id, int32_t feature_index, double split_value);
561
563 void SplitFeatureCategorical(Eigen::MatrixXd& covariates, int32_t node_id, int32_t feature_index, std::vector<std::uint32_t> const& category_list);
564
566 data_size_t NodeBegin(int32_t node_id) {return node_offset_sizes_[node_id].Begin();}
567
569 data_size_t NodeEnd(int32_t node_id) {return node_offset_sizes_[node_id].End();}
570
572 data_size_t NodeSize(int32_t node_id) {return node_offset_sizes_[node_id].Size();}
573
575 std::vector<data_size_t> NodeIndices(int node_id);
576
578 data_size_t SortIndex(data_size_t j) {return feature_sort_indices_[j];}
579
581 FeatureType GetFeatureType() {return feature_type_;}
582
584 void UpdateObservationMapping(int node_id, int tree_id, SampleNodeMapper* sample_node_mapper);
585
587 std::vector<data_size_t> feature_sort_indices_;
588 private:
590 void AddLeftRightNodes(data_size_t left_node_begin, data_size_t left_node_size, data_size_t right_node_begin, data_size_t right_node_size);
591
593 std::vector<NodeOffsetSize> node_offset_sizes_;
594 int32_t feature_index_;
595 FeatureType feature_type_;
596 data_size_t num_obs_;
597};
598
601 public:
602 SortedNodeSampleTracker(FeaturePresortRootContainer* feature_presort_root_container, Eigen::MatrixXd& covariates, std::vector<FeatureType>& feature_types) {
603 num_features_ = covariates.cols();
604 feature_partitions_.resize(num_features_);
605 FeaturePresortRoot* feature_presort_root;
606 for (int i = 0; i < num_features_; i++) {
607 feature_presort_root = feature_presort_root_container->GetFeaturePresort(i);
608 feature_partitions_[i].reset(new FeaturePresortPartition(feature_presort_root, covariates, i, feature_types[i]));
609 }
610 }
611
613 void PartitionNode(Eigen::MatrixXd& covariates, int node_id, int feature_split, TreeSplit& split, int num_threads = -1) {
614 StochTree::ParallelFor(0, num_features_, num_threads, [&](int i) {
615 feature_partitions_[i]->SplitFeature(covariates, node_id, feature_split, split);
616 });
617 }
618
620 void PartitionNode(Eigen::MatrixXd& covariates, int node_id, int feature_split, double split_value, int num_threads = -1) {
621 StochTree::ParallelFor(0, num_features_, num_threads, [&](int i) {
622 feature_partitions_[i]->SplitFeatureNumeric(covariates, node_id, feature_split, split_value);
623 });
624 }
625
627 void PartitionNode(Eigen::MatrixXd& covariates, int node_id, int feature_split, std::vector<std::uint32_t> const& category_list, int num_threads = -1) {
628 StochTree::ParallelFor(0, num_features_, num_threads, [&](int i) {
629 feature_partitions_[i]->SplitFeatureCategorical(covariates, node_id, feature_split, category_list);
630 });
631 }
632
634 data_size_t NodeBegin(int node_id, int feature_index) {
635 return feature_partitions_[feature_index]->NodeBegin(node_id);
636 }
637
639 data_size_t NodeEnd(int node_id, int feature_index) {
640 return feature_partitions_[feature_index]->NodeEnd(node_id);
641 }
642
644 data_size_t NodeSize(int node_id, int feature_index) {
645 return feature_partitions_[feature_index]->NodeSize(node_id);
646 }
647
648 std::vector<data_size_t>::iterator NodeBeginIterator(int node_id, int feature_index) {
649 data_size_t node_begin = NodeBegin(node_id, feature_index);
650 auto begin_iter = feature_partitions_[feature_index]->feature_sort_indices_.begin();
651 return begin_iter + node_begin;
652 }
653
654 std::vector<data_size_t>::iterator NodeEndIterator(int node_id, int feature_index) {
655 data_size_t node_end = NodeEnd(node_id, feature_index);
656 auto begin_iter = feature_partitions_[feature_index]->feature_sort_indices_.begin();
657 return begin_iter + node_end;
658 }
659
661 std::vector<data_size_t> NodeIndices(int node_id, int feature_index) {
662 return feature_partitions_[feature_index]->NodeIndices(node_id);
663 }
664
666 data_size_t SortIndex(data_size_t j, int feature_index) {return feature_partitions_[feature_index]->SortIndex(j); }
667
669 void UpdateObservationMapping(int node_id, int tree_id, SampleNodeMapper* sample_node_mapper, int feature_index = 0) {
670 feature_partitions_[feature_index]->UpdateObservationMapping(node_id, tree_id, sample_node_mapper);
671 }
672
673 private:
674 std::vector<std::unique_ptr<FeaturePresortPartition>> feature_partitions_;
675 int num_features_;
676};
677
678} // namespace StochTree
679
680#endif // STOCHTREE_PARTITION_TRACKER_H_
Internal wrapper around Eigen::VectorXd interface for univariate floating point data....
Definition data.h:193
Data structure that tracks pre-sorted feature values through a tree's split lifecycle.
Definition partition_tracker.h:540
void SplitFeature(Eigen::MatrixXd &covariates, int32_t node_id, int32_t feature_index, TreeSplit &split)
Split numeric / ordered categorical feature and update sort indices.
data_size_t NodeEnd(int32_t node_id)
End position of node indexed by node_id.
Definition partition_tracker.h:569
std::vector< data_size_t > NodeIndices(int node_id)
Data indices for a given node.
std::vector< data_size_t > feature_sort_indices_
Feature sort indices.
Definition partition_tracker.h:587
data_size_t NodeSize(int32_t node_id)
Size (in observations) of node indexed by node_id.
Definition partition_tracker.h:572
FeatureType GetFeatureType()
Feature type.
Definition partition_tracker.h:581
data_size_t SortIndex(data_size_t j)
Feature sort index j.
Definition partition_tracker.h:578
data_size_t NodeBegin(int32_t node_id)
Start position of node indexed by node_id.
Definition partition_tracker.h:566
void UpdateObservationMapping(int node_id, int tree_id, SampleNodeMapper *sample_node_mapper)
Update SampleNodeMapper for all the observations in node_id.
void SplitFeatureNumeric(Eigen::MatrixXd &covariates, int32_t node_id, int32_t feature_index, double split_value)
Split numeric / ordered categorical feature and update sort indices.
void SplitFeatureCategorical(Eigen::MatrixXd &covariates, int32_t node_id, int32_t feature_index, std::vector< std::uint32_t > const &category_list)
Split unordered categorical feature and update sort indices.
Container class for FeaturePresortRoot objects stored for every feature in a dataset.
Definition partition_tracker.h:511
Data structure for presorting a feature by its values.
Definition partition_tracker.h:479
Mapping nodes to the indices they contain.
Definition partition_tracker.h:235
void PartitionNode(Eigen::MatrixXd &covariates, int node_id, int left_node_id, int right_node_id, int feature_split, double split_value)
Partition a node based on a new split rule.
void PartitionNode(Eigen::MatrixXd &covariates, int node_id, int left_node_id, int right_node_id, int feature_split, TreeSplit &split)
Partition a node based on a new split rule.
int RightNode(int node_id)
Right child of node_id.
int Parent(int node_id)
Parent node_id.
data_size_t NodeEnd(int node_id)
One past the last index of data points contained in node_id.
std::vector< data_size_t > indices_
Data indices.
Definition partition_tracker.h:285
void PartitionNode(Eigen::MatrixXd &covariates, int node_id, int left_node_id, int right_node_id, int feature_split, std::vector< std::uint32_t > const &category_list)
Partition a node based on a new split rule.
bool RightNodeIsLeaf(int node_id)
Whether node_id's right child is a leaf.
void UpdateObservationMapping(int node_id, int tree_id, SampleNodeMapper *sample_node_mapper)
Update SampleNodeMapper for all the observations in node_id.
data_size_t NodeSize(int node_id)
Number of data points contained in node_id.
std::vector< data_size_t > NodeIndices(int node_id)
Data indices for a given node.
void ReconstituteFromTree(Tree &tree, ForestDataset &dataset)
Reconstitute a tree partition tracker from root based on a tree.
bool IsLeaf(int node_id)
Whether node_id is a leaf.
void PruneNodeToLeaf(int node_id)
Convert a (currently split) node to a leaf.
bool LeftNodeIsLeaf(int node_id)
Whether node_id's left child is a leaf.
int LeftNode(int node_id)
Left child of node_id.
bool IsValidNode(int node_id)
Whether node_id is a valid node.
data_size_t NodeBegin(int node_id)
First index of data points contained in node_id.
API for loading and accessing data used to sample tree ensembles The covariates / bases / weights use...
Definition data.h:271
"Superclass" wrapper around tracking data structures for forest sampling algorithms
Definition partition_tracker.h:46
ForestTracker(Eigen::MatrixXd &covariates, std::vector< FeatureType > &feature_types, int num_trees, int num_observations)
Construct a new ForestTracker object.
Tracking cutpoints available at a given node.
Definition partition_tracker.h:442
Class storing sample-node map for each tree in an ensemble.
Definition partition_tracker.h:166
Class storing sample-prediction map for each tree in an ensemble.
Definition partition_tracker.h:125
Data structure for tracking observations through a tree partition with each feature pre-sorted.
Definition partition_tracker.h:600
void PartitionNode(Eigen::MatrixXd &covariates, int node_id, int feature_split, TreeSplit &split, int num_threads=-1)
Partition a node based on a new split rule.
Definition partition_tracker.h:613
data_size_t NodeSize(int node_id, int feature_index)
One past the last index of data points contained in node_id.
Definition partition_tracker.h:644
void UpdateObservationMapping(int node_id, int tree_id, SampleNodeMapper *sample_node_mapper, int feature_index=0)
Update SampleNodeMapper for all the observations in node_id.
Definition partition_tracker.h:669
void PartitionNode(Eigen::MatrixXd &covariates, int node_id, int feature_split, double split_value, int num_threads=-1)
Partition a node based on a new split rule.
Definition partition_tracker.h:620
std::vector< data_size_t > NodeIndices(int node_id, int feature_index)
Data indices for a given node.
Definition partition_tracker.h:661
data_size_t SortIndex(data_size_t j, int feature_index)
Feature sort index j for feature_index.
Definition partition_tracker.h:666
data_size_t NodeBegin(int node_id, int feature_index)
First index of data points contained in node_id.
Definition partition_tracker.h:634
void PartitionNode(Eigen::MatrixXd &covariates, int node_id, int feature_split, std::vector< std::uint32_t > const &category_list, int num_threads=-1)
Partition a node based on a new split rule.
Definition partition_tracker.h:627
data_size_t NodeEnd(int node_id, int feature_index)
One past the last index of data points contained in node_id.
Definition partition_tracker.h:639
Class storing a "forest," or an ensemble of decision trees.
Definition ensemble.h:31
Representation of arbitrary tree split rules, including numeric split rules (X[,i] <= c) and categori...
Definition tree.h:958
bool SplitTrue(double fvalue)
Whether a given covariate value is True or False on the rule defined by a TreeSplit object.
Definition tree.h:990
Decision tree data structure.
Definition tree.h:66
std::vector< std::int32_t > const & GetLeaves() const
Get indices of all leaf nodes.
Definition tree.h:564
Mapping nodes to the indices they contain.
Definition partition_tracker.h:309
void PartitionTreeNode(Eigen::MatrixXd &covariates, int tree_id, int node_id, int left_node_id, int right_node_id, int feature_split, std::vector< std::uint32_t > const &category_list)
Partition a node based on a new split rule.
Definition partition_tracker.h:333
void UpdateObservationMapping(Tree *tree, int tree_id, SampleNodeMapper *sample_node_mapper)
Update SampleNodeMapper for all the observations in tree.
Definition partition_tracker.h:420
int RightNode(int tree_id, int node_id)
Right child of node_id.
Definition partition_tracker.h:405
bool IsLeaf(int tree_id, int node_id)
Whether node_id is a leaf.
Definition partition_tracker.h:348
bool RightNodeIsLeaf(int tree_id, int node_id)
Whether node_id's right child is a leaf.
Definition partition_tracker.h:363
bool IsValidNode(int tree_id, int node_id)
Whether node_id is a valid node.
Definition partition_tracker.h:353
void UpdateObservationMapping(int node_id, int tree_id, SampleNodeMapper *sample_node_mapper)
Update SampleNodeMapper for all the observations in node_id.
Definition partition_tracker.h:415
data_size_t NodeBegin(int tree_id, int node_id)
First index of data points contained in node_id.
Definition partition_tracker.h:368
int NumTrees()
Number of trees.
Definition partition_tracker.h:430
int LeftNode(int tree_id, int node_id)
Left child of node_id.
Definition partition_tracker.h:400
void ResetTreeToRoot(int tree_id, data_size_t n)
Convert a tree to root.
Definition partition_tracker.h:338
void PartitionTreeNode(Eigen::MatrixXd &covariates, int tree_id, int node_id, int left_node_id, int right_node_id, int feature_split, TreeSplit &split)
Partition a node based on a new split rule.
Definition partition_tracker.h:323
data_size_t NodeSize(int tree_id, int node_id)
One past the last index of data points contained in node_id.
Definition partition_tracker.h:390
bool LeftNodeIsLeaf(int tree_id, int node_id)
Whether node_id's left child is a leaf.
Definition partition_tracker.h:358
int Parent(int tree_id, int node_id)
Parent node_id.
Definition partition_tracker.h:395
void ReconstituteFromForest(TreeEnsemble &forest, ForestDataset &dataset)
Reconstruct the node sample tracker based on the splits in a forest.
data_size_t NodeEnd(int tree_id, int node_id)
One past the last index of data points contained in node_id.
Definition partition_tracker.h:373
void PartitionTreeNode(Eigen::MatrixXd &covariates, int tree_id, int node_id, int left_node_id, int right_node_id, int feature_split, double split_value)
Partition a node based on a new split rule.
Definition partition_tracker.h:328
FeatureUnsortedPartition * GetFeaturePartition(int i)
Return a pointer to the feature partition tracking tree i.
Definition partition_tracker.h:433
std::vector< data_size_t > TreeNodeIndices(int tree_id, int node_id)
Data indices for a given node.
Definition partition_tracker.h:410
void PruneTreeNodeToLeaf(int tree_id, int node_id)
Convert a (currently split) node to a leaf.
Definition partition_tracker.h:343
A collection of random number generation utilities.
Definition category_tracker.h:36