StochTree 0.0.1
|
Decision tree data structure. More...
#include <tree.h>
Public Member Functions | |
void | CloneFromTree (Tree *tree) |
Copy the structure and parameters of another tree. If the Tree object calling this method already has a non-root tree structure / parameters, this will be erased and replaced with a copy of tree . | |
void | Reset () |
Reset tree to empty vectors and default values of boolean / integer variables. | |
void | Init (int output_dimension=1, bool is_log_scale=false) |
Initialize the tree with a single root node. | |
int | AllocNode () |
Allocate a new node and return the node's ID. | |
void | DeleteNode (std::int32_t nid) |
Deletes node indexed by node ID. | |
void | ExpandNode (std::int32_t nid, int split_index, double split_value, double left_value, double right_value) |
Expand a node based on a numeric split rule. | |
void | ExpandNode (std::int32_t nid, int split_index, std::vector< std::uint32_t > const &categorical_indices, double left_value, double right_value) |
Expand a node based on a categorical split rule. | |
void | ExpandNode (std::int32_t nid, int split_index, double split_value, std::vector< double > left_value_vector, std::vector< double > right_value_vector) |
Expand a node based on a numeric split rule. | |
void | ExpandNode (std::int32_t nid, int split_index, std::vector< std::uint32_t > const &categorical_indices, std::vector< double > left_value_vector, std::vector< double > right_value_vector) |
Expand a node based on a categorical split rule. | |
void | ExpandNode (std::int32_t nid, int split_index, TreeSplit &split, double left_value, double right_value) |
Expand a node based on a generic split rule. | |
void | ExpandNode (std::int32_t nid, int split_index, TreeSplit &split, std::vector< double > left_value_vector, std::vector< double > right_value_vector) |
Expand a node based on a generic split rule. | |
bool | IsRoot () |
Whether or not a tree is a "stump" consisting of a single root node. | |
json | to_json () |
Convert tree to JSON and return JSON in-memory. | |
void | from_json (const json &tree_json) |
Load from JSON. | |
void | CollapseToLeaf (std::int32_t nid, double value) |
Collapse an internal node to a leaf node, deleting its children from the tree. | |
void | CollapseToLeaf (std::int32_t nid, std::vector< double > value_vector) |
Collapse an internal node to a leaf node, deleting its children from the tree. | |
template<typename Func > | |
void | WalkTree (Func func) const |
Iterate through all nodes in this tree. | |
bool | HasVectorOutput () const |
Whether or not a tree has vector output. | |
std::int32_t | OutputDimension () const |
Dimension of tree output. | |
bool | IsLogScale () const |
Whether or not tree parameters should be exponentiated at prediction time. | |
std::int32_t | Parent (std::int32_t nid) const |
Index of the node's parent. | |
std::int32_t | LeftChild (std::int32_t nid) const |
Index of the node's left child. | |
std::int32_t | RightChild (std::int32_t nid) const |
Index of the node's right child. | |
std::int32_t | DefaultChild (std::int32_t nid) const |
Index of the node's "default" child (potentially used in the case of a missing feature at prediction time) | |
std::int32_t | SplitIndex (std::int32_t nid) const |
Feature index defining the node's split rule. | |
bool | IsLeaf (std::int32_t nid) const |
Whether the node is a leaf node. | |
bool | IsRoot (std::int32_t nid) const |
Whether the node is root. | |
bool | IsDeleted (std::int32_t nid) const |
Whether the node has been deleted. | |
double | LeafValue (std::int32_t nid) const |
Get parameter value of a node (typically though not necessarily a leaf node) | |
double | LeafValue (std::int32_t nid, std::int32_t dim_id) const |
Get parameter value of a node (typically though not necessarily a leaf node) at a given output dimension. | |
std::int32_t | MaxLeafDepth () const |
Get maximum depth of all of the leaf nodes. | |
std::vector< double > | LeafVector (std::int32_t nid) const |
Get vector-valued parameters of a node (typically leaf) | |
double | SumSquaredNodeValues (std::int32_t nid) const |
Sum of squared parameter values for a given node (typically though not necessarily a leaf node) | |
double | SumSquaredLeafValues () const |
Sum of squared values for all leaves in a tree. | |
bool | HasLeafVector (std::int32_t nid) const |
Tests whether the leaf node has a non-empty leaf vector. | |
double | Threshold (std::int32_t nid) const |
Get split threshold of the node. | |
std::vector< std::uint32_t > | CategoryList (std::int32_t nid) const |
Get list of all categories belonging to the left child node. Categories are integers ranging from 0 to (n-1), where n is the number of categories in that particular feature. This list is assumed to be in ascending order. | |
TreeNodeType | NodeType (std::int32_t nid) const |
Get the type of a node (i.e. numeric split, categorical split, leaf) | |
bool | IsNumericSplitNode (std::int32_t nid) const |
Whether the node is a numeric split node. | |
bool | IsCategoricalSplitNode (std::int32_t nid) const |
Whether the node is a numeric split node. | |
bool | HasCategoricalSplit () const |
Query whether this tree contains any categorical splits. | |
std::vector< std::int32_t > const & | GetInternalNodes () const |
Get indices of all internal nodes. | |
std::vector< std::int32_t > const & | GetLeaves () const |
Get indices of all leaf nodes. | |
std::vector< std::int32_t > const & | GetLeafParents () const |
Get indices of all leaf parent nodes. | |
std::vector< std::int32_t > | GetNodes () |
Get indices of all valid (non-deleted) nodes. | |
std::int32_t | GetDepth (std::int32_t nid) const |
Get the depth of a node. | |
std::int32_t | NumNodes () const noexcept |
Get the total number of nodes including deleted ones in this tree. | |
std::int32_t | NumDeletedNodes () const noexcept |
Get the total number of deleted nodes in this tree. | |
std::int32_t | NumValidNodes () const noexcept |
Get the total number of valid nodes in this tree. | |
void | SetLeftChild (std::int32_t nid, std::int32_t left_child) |
Identify left child node. | |
void | SetRightChild (std::int32_t nid, std::int32_t right_child) |
Identify right child node. | |
void | SetChildren (std::int32_t nid, std::int32_t left_child, std::int32_t right_child) |
Identify two child nodes of the node and the corresponding parent node of the child nodes. | |
void | SetParent (std::int32_t child_node, std::int32_t parent_node) |
Identify parent node. | |
void | SetParents (std::int32_t nid, std::int32_t left_child, std::int32_t right_child) |
Identify parent node of the left and right node ids. | |
void | SetNumericSplit (std::int32_t nid, std::int32_t split_index, double threshold) |
Create a numerical split. | |
void | SetCategoricalSplit (std::int32_t nid, std::int32_t split_index, std::vector< std::uint32_t > const &category_list) |
Create a categorical split. | |
void | SetLeaf (std::int32_t nid, double value) |
Set the leaf value of the node. | |
void | SetLeafVector (std::int32_t nid, std::vector< double > const &leaf_vector) |
Set the leaf vector of the node; useful for multi-output trees. | |
void | PredictLeafIndexInplace (ForestDataset *dataset, std::vector< int32_t > &output, int32_t offset, int32_t max_leaf) |
Obtain a 0-based leaf index for each observation in a ForestDataset. Internally, trees are stored as vectors of node information, and the leaves_ vector gives us node IDs for every leaf in the tree. Here, we would like to know, for every observation in a dataset, which leaf number it is mapped to. Since the leaf numbers themselves do not carry any information, we renumber them from 0 to leaves_.size()-1 . | |
void | PredictLeafIndexInplace (Eigen::MatrixXd &covariates, std::vector< int32_t > &output, int32_t offset, int32_t max_leaf) |
Obtain a 0-based leaf index for each observation in a ForestDataset. Internally, trees are stored as vectors of node information, and the leaves_ vector gives us node IDs for every leaf in the tree. Here, we would like to know, for every observation in a dataset, which leaf number it is mapped to. Since the leaf numbers themselves do not carry any information, we renumber them from 0 to leaves_.size()-1 . | |
void | PredictLeafIndexInplace (Eigen::Map< Eigen::Matrix< double, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor > > &covariates, std::vector< int32_t > &output, int32_t offset, int32_t max_leaf) |
Obtain a 0-based leaf index for each observation in a ForestDataset. Internally, trees are stored as vectors of node information, and the leaves_ vector gives us node IDs for every leaf in the tree. Here, we would like to know, for every observation in a dataset, which leaf number it is mapped to. Since the leaf numbers themselves do not carry any information, we renumber them from 0 to leaves_.size()-1 . | |
Decision tree data structure.
void StochTree::Tree::CloneFromTree | ( | Tree * | tree | ) |
void StochTree::Tree::from_json | ( | const json & | tree_json | ) |
Load from JSON.
tree_json | In-memory json object (of type nlohmann::json ) |
|
inline |
Collapse an internal node to a leaf node, deleting its children from the tree.
nid | Node id of the new leaf node |
value_vector | New leaf value |
|
inline |
Collapse an internal node to a leaf node, deleting its children from the tree.
nid | Node id of the new leaf node |
value_vector | New leaf vector value |
|
inline |
Iterate through all nodes in this tree.
Func | Function object type, must map std::int32_t to bool . |
func | Function that accepts a node index and returns False when iteration through a given branch of the tree should stop and True otherwise. |
|
inline |
Whether or not a tree has vector output.
Getters
|
inline |
Index of the node's parent.
nid | ID of node being queried |
|
inline |
Index of the node's left child.
nid | ID of node being queried |
|
inline |
Index of the node's right child.
nid | ID of node being queried |
|
inline |
Index of the node's "default" child (potentially used in the case of a missing feature at prediction time)
nid | ID of node being queried |
|
inline |
Feature index defining the node's split rule.
nid | ID of node being queried |
|
inline |
Whether the node is a leaf node.
nid | ID of node being queried |
|
inline |
Whether the node is root.
nid | ID of node being queried |
|
inline |
Whether the node has been deleted.
nid | ID of node being queried |
|
inline |
Get parameter value of a node (typically though not necessarily a leaf node)
nid | ID of node being queried |
|
inline |
Get parameter value of a node (typically though not necessarily a leaf node) at a given output dimension.
nid | ID of node being queried |
dim_id | Output dimension being queried |
|
inline |
Get vector-valued parameters of a node (typically leaf)
nid | ID of node being queried |
|
inline |
Sum of squared parameter values for a given node (typically though not necessarily a leaf node)
nid | ID of node being queried |
|
inline |
Tests whether the leaf node has a non-empty leaf vector.
nid | ID of node being queried |
|
inline |
Get split threshold of the node.
nid | ID of node being queried |
|
inline |
Get list of all categories belonging to the left child node. Categories are integers ranging from 0 to (n-1), where n is the number of categories in that particular feature. This list is assumed to be in ascending order.
nid | ID of node being queried |
|
inline |
Get the type of a node (i.e. numeric split, categorical split, leaf)
nid | ID of node being queried |
|
inline |
Whether the node is a numeric split node.
nid | ID of node being queried |
|
inline |
Whether the node is a numeric split node.
nid | ID of node being queried |
|
inline |
Get the depth of a node.
nid | node id |
|
inline |
Identify left child node.
Setters
nid | ID of node being modified |
left_child | ID of the left child node |
|
inline |
Identify right child node.
nid | ID of node being modified |
right_child | ID of the right child node |
|
inline |
Identify two child nodes of the node and the corresponding parent node of the child nodes.
nid | ID of node being modified |
left_child | ID of the left child node |
right_child | ID of the right child node |
|
inline |
Identify parent node.
child_node | ID of child node |
parent_node | ID of the parent node |
|
inline |
Identify parent node of the left and right node ids.
nid | ID of parent node |
left_child | ID of the left child node |
right_child | ID of the right child node |
void StochTree::Tree::SetNumericSplit | ( | std::int32_t | nid, |
std::int32_t | split_index, | ||
double | threshold | ||
) |
Create a numerical split.
nid | ID of node being updated |
split_index | Feature index to split |
threshold | Threshold value |
void StochTree::Tree::SetCategoricalSplit | ( | std::int32_t | nid, |
std::int32_t | split_index, | ||
std::vector< std::uint32_t > const & | category_list | ||
) |
Create a categorical split.
nid | ID of node being updated |
split_index | Feature index to split |
category_list | List of categories to belong to either the right child node or the left child node. Set categories_list_right_child parameter to indicate which node the category list should represent. |
void StochTree::Tree::SetLeaf | ( | std::int32_t | nid, |
double | value | ||
) |
Set the leaf value of the node.
nid | ID of node being updated |
value | Leaf value |
void StochTree::Tree::SetLeafVector | ( | std::int32_t | nid, |
std::vector< double > const & | leaf_vector | ||
) |
Set the leaf vector of the node; useful for multi-output trees.
nid | ID of node being updated |
leaf_vector | Leaf vector |
void StochTree::Tree::PredictLeafIndexInplace | ( | ForestDataset * | dataset, |
std::vector< int32_t > & | output, | ||
int32_t | offset, | ||
int32_t | max_leaf | ||
) |
Obtain a 0-based leaf index for each observation in a ForestDataset. Internally, trees are stored as vectors of node information, and the leaves_
vector gives us node IDs for every leaf in the tree. Here, we would like to know, for every observation in a dataset, which leaf number it is mapped to. Since the leaf numbers themselves do not carry any information, we renumber them from 0
to leaves_.size()-1
.
Note: this is a tree-level helper function for an ensemble-level function. It assumes the creation of:
dataset.NumObservations()
x ensemble.NumTrees()
, stored in "tree-major" orderstd::vector<int32_t> output(dataset->NumObservations())
and set the offset to 0. dataset | Dataset with which to predict leaf indices from the tree |
output | Pre-allocated output vector storing a matrix of column indices, with "rows" corresponding to observations in dataset and "columns" corresponding to trees in an ensemble |
offset | Bookkeeping index that determines where in output vector that column indices should be unpacked |
max_leaf | Largest leaf value mapped so far. (Leaf indices serve as sparse column indices, so it is important that leaf values be unique to each tree.) |
void StochTree::Tree::PredictLeafIndexInplace | ( | Eigen::MatrixXd & | covariates, |
std::vector< int32_t > & | output, | ||
int32_t | offset, | ||
int32_t | max_leaf | ||
) |
Obtain a 0-based leaf index for each observation in a ForestDataset. Internally, trees are stored as vectors of node information, and the leaves_
vector gives us node IDs for every leaf in the tree. Here, we would like to know, for every observation in a dataset, which leaf number it is mapped to. Since the leaf numbers themselves do not carry any information, we renumber them from 0
to leaves_.size()-1
.
Note: this is a tree-level helper function for an ensemble-level function. It assumes the creation of:
dataset.NumObservations()
x ensemble.NumTrees()
, stored in "tree-major" orderstd::vector<int32_t> output(dataset->NumObservations())
and set the offset to 0. covariates | Eigen matrix with which to predict leaf indices |
output | Pre-allocated output vector storing a matrix of column indices, with "rows" corresponding to observations in covariates and "columns" corresponding to trees in an ensemble |
offset | Bookkeeping index that determines where in output vector that column indices should be unpacked |
max_leaf | Largest leaf value mapped so far. (Leaf indices serve as sparse column indices, so it is important that leaf values be unique to each tree.) |
void StochTree::Tree::PredictLeafIndexInplace | ( | Eigen::Map< Eigen::Matrix< double, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor > > & | covariates, |
std::vector< int32_t > & | output, | ||
int32_t | offset, | ||
int32_t | max_leaf | ||
) |
Obtain a 0-based leaf index for each observation in a ForestDataset. Internally, trees are stored as vectors of node information, and the leaves_
vector gives us node IDs for every leaf in the tree. Here, we would like to know, for every observation in a dataset, which leaf number it is mapped to. Since the leaf numbers themselves do not carry any information, we renumber them from 0
to leaves_.size()-1
.
Note: this is a tree-level helper function for an ensemble-level function. It assumes the creation of:
dataset.NumObservations()
x ensemble.NumTrees()
, stored in "tree-major" orderstd::vector<int32_t> output(dataset->NumObservations())
and set the offset to 0. covariates | Eigen matrix with which to predict leaf indices |
output | Pre-allocated output vector storing a matrix of column indices, with "rows" corresponding to observations in covariates and "columns" corresponding to trees in an ensemble |
offset | Bookkeeping index that determines where in output vector that column indices should be unpacked |
max_leaf | Largest leaf value mapped so far. (Leaf indices serve as sparse column indices, so it is important that leaf values be unique to each tree.) |