5#ifndef STOCHTREE_DATA_H_
6#define STOCHTREE_DATA_H_
9#include <stochtree/io.h>
10#include <stochtree/log.h>
11#include <stochtree/meta.h>
36 std::vector<int32_t>& column_indices, Eigen::MatrixXd& data,
37 data_size_t num_rows) {
38 std::vector<std::pair<int, double>> oneline_features;
39 auto& ref_text_data = *text_data;
42 for (data_size_t i = 0; i < num_rows; ++i) {
44 oneline_features.clear();
45 parser->ParseOneLine(ref_text_data[i].c_str(), &oneline_features);
48 ref_text_data[i].clear();
51 int feature_counter = 0;
52 for (
auto& inner_data : oneline_features) {
53 int feature_idx = inner_data.first;
54 column_matched = (std::find(column_indices.begin(), column_indices.end(), feature_idx)
55 != column_indices.end());
57 data(i, feature_counter) = inner_data.second;
78 int32_t column_index, Eigen::VectorXd& data, data_size_t num_rows) {
79 std::vector<std::pair<int, double>> oneline_features;
80 auto& ref_text_data = *text_data;
82 for (data_size_t i = 0; i < num_rows; ++i) {
84 oneline_features.clear();
85 parser->ParseOneLine(ref_text_data[i].c_str(), &oneline_features);
88 ref_text_data[i].clear();
91 for (
auto& inner_data : oneline_features) {
92 int feature_idx = inner_data.first;
93 if (column_index == feature_idx){
94 data(i) = inner_data.second;
102static inline std::vector<std::string> LoadTextDataToMemory(
const char* filename,
int* num_global_data,
bool header) {
103 size_t file_load_progress_interval_bytes = size_t(10) * 1024 * 1024 * 1024;
104 TextReader<data_size_t> text_reader(filename, header, file_load_progress_interval_bytes);
106 *num_global_data = text_reader.ReadAllLines();
107 return std::move(text_reader.Lines());
110static inline void FeatureUnpack(std::vector<int32_t>& categorical_variables,
const char* var_id) {
111 std::string var_clean = Common::RemoveQuotationSymbol(Common::Trim(var_id));
113 bool success = Common::AtoiAndCheck(var_clean.c_str(), &out);
115 categorical_variables.push_back(out);
117 Log::Warning(
"Parsed variable index %s cannot be cast to an integer", var_clean.c_str());
121static inline std::vector<int> Str2FeatureVec(
const char* parameters) {
122 std::vector<int> feature_vec;
123 auto args = Common::Split(parameters,
",");
124 for (
auto arg : args) {
125 FeatureUnpack(feature_vec, Common::Trim(arg).c_str());
// Constructs a ColumnMatrix from an in-memory buffer (declaration only; the
// definition is not in this view).
//   data_ptr     - pointer to the first element of the source buffer
//   num_row      - number of rows in the source data
//   num_col      - number of columns in the source data
//   is_row_major - presumably true when the buffer is laid out row by row and
//                  false when column by column; confirm against the definition
144 ColumnMatrix(
double* data_ptr, data_size_t num_row,
int num_col,
bool is_row_major);
// Constructs a ColumnMatrix from a CSV file (declaration only; the definition
// is not in this view).
//   filename             - path of the CSV file to read
//   column_index_string  - presumably a comma-separated list of column indices
//                          to load; confirm against the definition
//   header               - defaults to true; presumably whether the file's
//                          first line is a header row
//   precise_float_parser - defaults to false; presumably selects a slower,
//                          more exact float parser (TODO confirm)
153 ColumnMatrix(std::string filename, std::string column_index_string,
bool header =
true,
bool precise_float_parser =
false);
// Returns the value stored at (row_num, col_num) in data_.
// This method performs no explicit bounds check of its own.
161 double GetElement(data_size_t row_num, int32_t col_num) {
return data_(row_num, col_num);}
// Overwrites the entry of data_ at (row_num, col_num) with value.
// This method performs no explicit bounds check of its own.
169 void SetElement(data_size_t row_num, int32_t col_num,
double value) {data_(row_num, col_num) = value;}
// Loads matrix data from an in-memory buffer (declaration only; the
// definition is not in this view).
//   data_ptr     - pointer to the first element of the source buffer
//   num_row      - number of rows in the source data
//   num_col      - number of columns in the source data
//   is_row_major - presumably selects row-by-row vs. column-by-column layout
//                  of the buffer; confirm against the definition
178 void LoadData(
double* data_ptr, data_size_t num_row,
int num_col,
bool is_row_major);
// Returns the number of rows of data_, as reported by data_.rows().
180 inline data_size_t
NumRows() {
return data_.rows();}
// Returns a mutable reference to data_ itself (no copy is made), so callers
// can read or modify the stored matrix directly.
184 inline Eigen::MatrixXd&
GetData() {
return data_;}
186 Eigen::MatrixXd data_;
// Constructs a ColumnVector from a CSV file (declaration only; the definition
// is not in this view).
//   filename             - path of the CSV file to read
//   column_index         - index of the single column to load
//   header               - defaults to true; presumably whether the file's
//                          first line is a header row
//   precise_float_parser - defaults to false; presumably selects a slower,
//                          more exact float parser (TODO confirm)
212 ColumnVector(std::string filename, int32_t column_index,
bool header =
true,
bool precise_float_parser =
false);
// Overwrites the entry of data_ at position row with value.
// This method performs no explicit bounds check of its own.
226 void SetElement(data_size_t row,
double value) {data_(row) = value;}
// Loads vector data from an in-memory buffer (declaration only; the
// definition is not in this view).
//   data_ptr - pointer to the first element of the source buffer
//   num_row  - number of elements to read from the buffer
233 void LoadData(
double* data_ptr, data_size_t num_row);
// Returns the number of elements of data_, as reported by data_.size().
259 inline data_size_t
NumRows() {
return data_.size();}
// Returns a mutable reference to data_ itself (no copy is made), so callers
// can read or modify the stored vector directly.
261 inline Eigen::VectorXd&
GetData() {
return data_;}
263 Eigen::VectorXd data_;
// Declaration only; the definition is not in this view.
//   data_ptr - pointer to the first element of the source buffer
//   num_row  - number of elements in the source buffer
//   op       - binary callable taking two doubles and returning a double
// NOTE(review): presumably op combines each stored value with the matching
// buffer value (e.g. add / subtract / overwrite); the argument order must be
// confirmed against the definition.
264 void UpdateData(
double* data_ptr, data_size_t num_row, std::function<
double(
double,
double)> op);
285 void AddCovariates(
double* data_ptr, data_size_t num_row,
int num_col,
bool is_row_major) {
286 covariates_ =
ColumnMatrix(data_ptr, num_row, num_col, is_row_major);
287 num_observations_ = num_row;
288 num_covariates_ = num_col;
289 has_covariates_ =
true;
299 void AddBasis(
double* data_ptr, data_size_t num_row,
int num_col,
bool is_row_major) {
300 basis_ =
ColumnMatrix(data_ptr, num_row, num_col, is_row_major);
301 num_basis_ = num_col;
312 has_var_weights_ =
true;
320 void AddCovariatesFromCSV(std::string filename, std::string column_index_string,
bool header =
true,
bool precise_float_parser =
false) {
321 covariates_ =
ColumnMatrix(filename, column_index_string, header, precise_float_parser);
322 num_observations_ = covariates_.
NumRows();
323 num_covariates_ = covariates_.
NumCols();
324 has_covariates_ =
true;
332 void AddBasisFromCSV(std::string filename, std::string column_index_string,
bool header =
true,
bool precise_float_parser =
false) {
333 basis_ =
ColumnMatrix(filename, column_index_string, header, precise_float_parser);
344 var_weights_ =
ColumnVector(filename, column_index, header, precise_float_parser);
345 has_var_weights_ =
true;
405 void UpdateBasis(
double* data_ptr, data_size_t num_row,
int num_col,
bool is_row_major) {
407 CHECK_EQ(num_col, num_basis_);
410 for (data_size_t i = 0; i < num_row; ++i) {
411 for (
int j = 0; j < num_col; ++j) {
414 temp_value =
static_cast<double>(*(data_ptr +
static_cast<data_size_t
>(num_col) * i + j));
417 temp_value =
static_cast<double>(*(data_ptr +
static_cast<data_size_t
>(num_row) * j + i));
431 CHECK(has_var_weights_);
434 for (data_size_t i = 0; i < num_row; ++i) {
435 if (exponentiate) temp_value = std::exp(
static_cast<double>(*(data_ptr + i)));
436 else temp_value =
static_cast<double>(*(data_ptr + i));
448 covariates_.
SetElement(row_id, col, new_value);
469 CHECK(has_var_weights_);
470 if (exponentiate) var_weights_.
SetElement(row_id, std::exp(new_value));
471 else var_weights_.
SetElement(row_id, new_value);
477 data_size_t num_observations_{0};
478 int num_covariates_{0};
480 bool has_covariates_{
false};
481 bool has_basis_{
false};
482 bool has_var_weights_{
false};
499 void AddBasis(
double* data_ptr, data_size_t num_row,
int num_col,
bool is_row_major) {
500 basis_ =
ColumnMatrix(data_ptr, num_row, num_col, is_row_major);
511 has_var_weights_ =
true;
520 group_labels_ = group_labels;
521 has_group_labels_ =
true;
// Returns the group label stored at index row of group_labels_.
// Uses unchecked indexing; this method does not validate row itself.
549 inline int32_t
GroupId(data_size_t row) {
return group_labels_[row];}
571 std::vector<int32_t> group_labels_;
572 bool has_basis_{
false};
573 bool has_var_weights_{
false};
574 bool has_group_labels_{
false};
Internal wrapper around Eigen::MatrixXd interface for multidimensional floating point data.
Definition data.h:133
data_size_t NumRows()
Number of rows in the object's internal Eigen::MatrixXd.
Definition data.h:180
int NumCols()
Number of columns in the object's internal Eigen::MatrixXd.
Definition data.h:182
ColumnMatrix(std::string filename, std::string column_index_string, bool header=true, bool precise_float_parser=false)
Construct a new ColumnMatrix object from CSV file.
Eigen::MatrixXd & GetData()
Return a reference to the object's internal Eigen::MatrixXd, for interfaces that require a raw matrix...
Definition data.h:184
double GetElement(data_size_t row_num, int32_t col_num)
Returns the value stored at (row, col) in the object's internal Eigen::MatrixXd.
Definition data.h:161
void LoadData(double *data_ptr, data_size_t num_row, int num_col, bool is_row_major)
Update the data in a ColumnMatrix object from an in-memory data buffer. This will erase the existing ...
ColumnMatrix(double *data_ptr, data_size_t num_row, int num_col, bool is_row_major)
Construct a new ColumnMatrix object from in-memory data buffer.
void SetElement(data_size_t row_num, int32_t col_num, double value)
Update an observation in the object's internal Eigen::MatrixXd to a new value.
Definition data.h:169
Internal wrapper around Eigen::VectorXd interface for univariate floating point data....
Definition data.h:194
void SetElement(data_size_t row, double value)
Update an observation in the object's internal Eigen::VectorXd to a new value.
Definition data.h:226
void LoadData(double *data_ptr, data_size_t num_row)
Update the data in a ColumnVector object from an in-memory data buffer. This will erase the existing ...
void OverwriteData(double *data_ptr, data_size_t num_row)
Update the data in a ColumnVector object from an in-memory data buffer, by substituting each value ob...
void SubtractFromData(double *data_ptr, data_size_t num_row)
Update the data in a ColumnVector object from an in-memory data buffer, by subtracting each value obt...
ColumnVector(double *data_ptr, data_size_t num_row)
Construct a new ColumnVector object from in-memory data buffer.
Eigen::VectorXd & GetData()
Return a reference to the object's internal Eigen::VectorXd, for interfaces that require a raw vector...
Definition data.h:261
data_size_t NumRows()
Number of rows in the object's internal Eigen::VectorXd.
Definition data.h:259
void AddToData(double *data_ptr, data_size_t num_row)
Update the data in a ColumnVector object from an in-memory data buffer, by adding each value obtained...
double GetElement(data_size_t row)
Returns the value stored at position row in the object's internal Eigen::VectorXd.
Definition data.h:219
ColumnVector(std::string filename, int32_t column_index, bool header=true, bool precise_float_parser=false)
Construct a new ColumnVector object from CSV file.
API for loading and accessing data used to sample tree ensembles. The covariates / bases / weights use...
Definition data.h:272
double BasisValue(data_size_t row, int col)
Returns a dataset's basis value stored at (row, col)
Definition data.h:372
void AddVarianceWeights(double *data_ptr, data_size_t num_row)
Copy / load variance weights from raw memory buffer (often pointer to data in a R vector or numpy arr...
Definition data.h:310
data_size_t NumObservations()
Number of observations (rows) in the dataset.
Definition data.h:354
void SetCovariateValue(data_size_t row_id, int col, double new_value)
Update an observation in the internal covariate matrix to a new value.
Definition data.h:447
bool HasCovariates()
Whether or not a ForestDataset has (yet) loaded covariate data.
Definition data.h:348
void SetVarWeightValue(data_size_t row_id, double new_value, bool exponentiate=true)
Update an observation in the internal variance weight vector to a new value.
Definition data.h:468
void AddVarianceWeightsFromCSV(std::string filename, int32_t column_index, bool header=true, bool precise_float_parser=false)
Copy / load variance / case weights from CSV file.
Definition data.h:343
void AddCovariates(double *data_ptr, data_size_t num_row, int num_col, bool is_row_major)
Copy / load covariates from raw memory buffer (often pointer to data in a R matrix or numpy array)
Definition data.h:285
Eigen::MatrixXd & GetCovariates()
Return a reference to the raw Eigen::MatrixXd storing the covariate data.
Definition data.h:384
Eigen::MatrixXd & GetBasis()
Return a reference to the raw Eigen::MatrixXd storing the basis data.
Definition data.h:390
int NumCovariates()
Number of covariate columns in the dataset.
Definition data.h:356
Eigen::VectorXd & GetVarWeights()
Return a reference to the raw Eigen::VectorXd storing the variance weights.
Definition data.h:396
ForestDataset()
Default constructor. No data is loaded at construction time.
Definition data.h:275
void AddBasisFromCSV(std::string filename, std::string column_index_string, bool header=true, bool precise_float_parser=false)
Copy / load basis matrix from CSV file.
Definition data.h:332
void UpdateBasis(double *data_ptr, data_size_t num_row, int num_col, bool is_row_major)
Update the data in the internal basis matrix to new values stored in a raw double array.
Definition data.h:405
bool HasBasis()
Whether or not a ForestDataset has (yet) loaded basis data.
Definition data.h:350
double VarWeightValue(data_size_t row)
Returns a dataset's variance weight stored at element row
Definition data.h:378
void AddCovariatesFromCSV(std::string filename, std::string column_index_string, bool header=true, bool precise_float_parser=false)
Copy / load covariates from CSV file.
Definition data.h:320
bool HasVarWeights()
Whether or not a ForestDataset has (yet) loaded variance weights.
Definition data.h:352
int NumBasis()
Number of bases in the dataset. This is 0 if the dataset has not been provided a basis matrix.
Definition data.h:358
void AddBasis(double *data_ptr, data_size_t num_row, int num_col, bool is_row_major)
Copy / load basis matrix from raw memory buffer (often pointer to data in a R matrix or numpy array)
Definition data.h:299
double CovariateValue(data_size_t row, int col)
Returns a dataset's covariate value stored at (row, col)
Definition data.h:365
void UpdateVarWeights(double *data_ptr, data_size_t num_row, bool exponentiate=true)
Update the data in the internal variance weight vector to new values stored in a raw double array.
Definition data.h:430
void SetBasisValue(data_size_t row_id, int col, double new_value)
Update an observation in the internal basis matrix to a new value.
Definition data.h:457
API for loading and accessing data used to sample (additive) random effects.
Definition data.h:486
bool HasGroupLabels()
Whether or not a RandomEffectsDataset has (yet) loaded group labels.
Definition data.h:530
RandomEffectsDataset()
Default constructor. No data is loaded at construction time.
Definition data.h:489
void AddVarianceWeights(double *data_ptr, data_size_t num_row)
Copy / load variance weights from raw memory buffer (often pointer to data in a R vector or numpy arr...
Definition data.h:509
bool HasVarWeights()
Whether or not a RandomEffectsDataset has (yet) loaded variance weights.
Definition data.h:528
void AddGroupLabels(std::vector< int32_t > &group_labels)
Copy / load group indices for random effects.
Definition data.h:519
void AddBasis(double *data_ptr, data_size_t num_row, int num_col, bool is_row_major)
Copy / load basis matrix from raw memory buffer (often pointer to data in a R matrix or numpy array)
Definition data.h:499
data_size_t NumObservations()
Number of observations (rows) in the dataset.
Definition data.h:524
std::vector< int32_t > & GetGroupLabels()
Return a reference to the raw std::vector storing the group labels.
Definition data.h:567
int32_t GroupId(data_size_t row)
Returns a dataset's group label stored at element row
Definition data.h:549
double BasisValue(data_size_t row, int col)
Returns a dataset's basis value stored at (row, col)
Definition data.h:537
double VarWeightValue(data_size_t row)
Returns a dataset's variance weight stored at element row
Definition data.h:543
Eigen::MatrixXd & GetBasis()
Return a reference to the raw Eigen::MatrixXd storing the basis data.
Definition data.h:555
Eigen::VectorXd & GetVarWeights()
Return a reference to the raw Eigen::VectorXd storing the variance weights.
Definition data.h:561
bool HasBasis()
Whether or not a RandomEffectsDataset has (yet) loaded basis data.
Definition data.h:526
static void ExtractMultipleFeaturesFromMemory(std::vector< std::string > *text_data, const Parser *parser, std::vector< int32_t > &column_indices, Eigen::MatrixXd &data, data_size_t num_rows)
Extract multiple features from the raw data loaded from a file into an Eigen::MatrixXd....
Definition data.h:35
static void ExtractSingleFeatureFromMemory(std::vector< std::string > *text_data, const Parser *parser, int32_t column_index, Eigen::VectorXd &data, data_size_t num_rows)
Extract a single feature from the raw data loaded from a file into an Eigen::VectorXd....
Definition data.h:77
Definition category_tracker.h:40