StochTree 0.0.1
Loading...
Searching...
No Matches
data.h
1
5#ifndef STOCHTREE_DATA_H_
6#define STOCHTREE_DATA_H_
7
8#include <Eigen/Dense>
9#include <stochtree/io.h>
10#include <stochtree/log.h>
11#include <stochtree/meta.h>
12#include <memory>
13
14namespace StochTree {
15
35static inline void ExtractMultipleFeaturesFromMemory(std::vector<std::string>* text_data, const Parser* parser,
36 std::vector<int32_t>& column_indices, Eigen::MatrixXd& data,
37 data_size_t num_rows) {
38 std::vector<std::pair<int, double>> oneline_features;
39 auto& ref_text_data = *text_data;
40 int feature_counter;
41 bool column_matched;
42 for (data_size_t i = 0; i < num_rows; ++i) {
43 // unpack the vector of textlines read from file into a vector of (int, double) tuples
44 oneline_features.clear();
45 parser->ParseOneLine(ref_text_data[i].c_str(), &oneline_features);
46
47 // free processed line:
48 ref_text_data[i].clear();
49
50 // unload the data from oneline_features vector into the dataset variables containers
51 int feature_counter = 0;
52 for (auto& inner_data : oneline_features) {
53 int feature_idx = inner_data.first;
54 column_matched = (std::find(column_indices.begin(), column_indices.end(), feature_idx)
55 != column_indices.end());
56 if (column_matched){
57 data(i, feature_counter) = inner_data.second;
58 feature_counter += 1;
59 }
60 }
61 }
62 // free text data after use
63 text_data->clear();
64}
65
77static inline void ExtractSingleFeatureFromMemory(std::vector<std::string>* text_data, const Parser* parser,
78 int32_t column_index, Eigen::VectorXd& data, data_size_t num_rows) {
79 std::vector<std::pair<int, double>> oneline_features;
80 auto& ref_text_data = *text_data;
81 bool column_matched;
82 for (data_size_t i = 0; i < num_rows; ++i) {
83 // unpack the vector of textlines read from file into a vector of (int, double) tuples
84 oneline_features.clear();
85 parser->ParseOneLine(ref_text_data[i].c_str(), &oneline_features);
86
87 // free processed line:
88 ref_text_data[i].clear();
89
90 // unload the data from oneline_features vector into the dataset variables containers
91 for (auto& inner_data : oneline_features) {
92 int feature_idx = inner_data.first;
93 if (column_index == feature_idx){
94 data(i) = inner_data.second;
95 }
96 }
97 }
98 // free text data after use
99 text_data->clear();
100}
101
102static inline std::vector<std::string> LoadTextDataToMemory(const char* filename, int* num_global_data, bool header) {
103 size_t file_load_progress_interval_bytes = size_t(10) * 1024 * 1024 * 1024;
104 TextReader<data_size_t> text_reader(filename, header, file_load_progress_interval_bytes);
105 // read all lines
106 *num_global_data = text_reader.ReadAllLines();
107 return std::move(text_reader.Lines());
108}
109
110static inline void FeatureUnpack(std::vector<int32_t>& categorical_variables, const char* var_id) {
111 std::string var_clean = Common::RemoveQuotationSymbol(Common::Trim(var_id));
112 int out;
113 bool success = Common::AtoiAndCheck(var_clean.c_str(), &out);
114 if (success) {
115 categorical_variables.push_back(out);
116 } else {
117 Log::Warning("Parsed variable index %s cannot be cast to an integer", var_clean.c_str());
118 }
119}
120
121static inline std::vector<int> Str2FeatureVec(const char* parameters) {
122 std::vector<int> feature_vec;
123 auto args = Common::Split(parameters, ",");
124 for (auto arg : args) {
125 FeatureUnpack(feature_vec, Common::Trim(arg).c_str());
126 }
127 return feature_vec;
128}
129
134 public:
135 ColumnMatrix() {}
144 ColumnMatrix(double* data_ptr, data_size_t num_row, int num_col, bool is_row_major);
153 ColumnMatrix(std::string filename, std::string column_index_string, bool header = true, bool precise_float_parser = false);
154 ~ColumnMatrix() {}
161 double GetElement(data_size_t row_num, int32_t col_num) {return data_(row_num, col_num);}
169 void SetElement(data_size_t row_num, int32_t col_num, double value) {data_(row_num, col_num) = value;}
178 void LoadData(double* data_ptr, data_size_t num_row, int num_col, bool is_row_major);
180 inline data_size_t NumRows() {return data_.rows();}
182 inline int NumCols() {return data_.cols();}
184 inline Eigen::MatrixXd& GetData() {return data_;}
185 private:
186 Eigen::MatrixXd data_;
187};
188
195 public:
196 ColumnVector() {}
203 ColumnVector(double* data_ptr, data_size_t num_row);
212 ColumnVector(std::string filename, int32_t column_index, bool header = true, bool precise_float_parser = false);
213 ~ColumnVector() {}
219 double GetElement(data_size_t row) {return data_(row);}
226 void SetElement(data_size_t row, double value) {data_(row) = value;}
233 void LoadData(double* data_ptr, data_size_t num_row);
241 void AddToData(double* data_ptr, data_size_t num_row);
249 void SubtractFromData(double* data_ptr, data_size_t num_row);
257 void OverwriteData(double* data_ptr, data_size_t num_row);
259 inline data_size_t NumRows() {return data_.size();}
261 inline Eigen::VectorXd& GetData() {return data_;}
262 private:
263 Eigen::VectorXd data_;
264 void UpdateData(double* data_ptr, data_size_t num_row, std::function<double(double, double)> op);
265};
266
273 public:
276 ~ForestDataset() {}
285 void AddCovariates(double* data_ptr, data_size_t num_row, int num_col, bool is_row_major) {
286 covariates_ = ColumnMatrix(data_ptr, num_row, num_col, is_row_major);
287 num_observations_ = num_row;
288 num_covariates_ = num_col;
289 has_covariates_ = true;
290 }
299 void AddBasis(double* data_ptr, data_size_t num_row, int num_col, bool is_row_major) {
300 basis_ = ColumnMatrix(data_ptr, num_row, num_col, is_row_major);
301 num_basis_ = num_col;
302 has_basis_ = true;
303 }
310 void AddVarianceWeights(double* data_ptr, data_size_t num_row) {
311 var_weights_ = ColumnVector(data_ptr, num_row);
312 has_var_weights_ = true;
313 }
320 void AddCovariatesFromCSV(std::string filename, std::string column_index_string, bool header = true, bool precise_float_parser = false) {
321 covariates_ = ColumnMatrix(filename, column_index_string, header, precise_float_parser);
322 num_observations_ = covariates_.NumRows();
323 num_covariates_ = covariates_.NumCols();
324 has_covariates_ = true;
325 }
332 void AddBasisFromCSV(std::string filename, std::string column_index_string, bool header = true, bool precise_float_parser = false) {
333 basis_ = ColumnMatrix(filename, column_index_string, header, precise_float_parser);
334 num_basis_ = basis_.NumCols();
335 has_basis_ = true;
336 }
343 void AddVarianceWeightsFromCSV(std::string filename, int32_t column_index, bool header = true, bool precise_float_parser = false) {
344 var_weights_ = ColumnVector(filename, column_index, header, precise_float_parser);
345 has_var_weights_ = true;
346 }
348 inline bool HasCovariates() {return has_covariates_;}
350 inline bool HasBasis() {return has_basis_;}
352 inline bool HasVarWeights() {return has_var_weights_;}
354 inline data_size_t NumObservations() {return num_observations_;}
356 inline int NumCovariates() {return num_covariates_;}
358 inline int NumBasis() {return num_basis_;}
365 inline double CovariateValue(data_size_t row, int col) {return covariates_.GetElement(row, col);}
372 inline double BasisValue(data_size_t row, int col) {return basis_.GetElement(row, col);}
378 inline double VarWeightValue(data_size_t row) {return var_weights_.GetElement(row);}
384 inline Eigen::MatrixXd& GetCovariates() {return covariates_.GetData();}
390 inline Eigen::MatrixXd& GetBasis() {return basis_.GetData();}
396 inline Eigen::VectorXd& GetVarWeights() {return var_weights_.GetData();}
405 void UpdateBasis(double* data_ptr, data_size_t num_row, int num_col, bool is_row_major) {
406 CHECK(has_basis_);
407 CHECK_EQ(num_col, num_basis_);
408 // Copy data from R / Python process memory to Eigen matrix
409 double temp_value;
410 for (data_size_t i = 0; i < num_row; ++i) {
411 for (int j = 0; j < num_col; ++j) {
412 if (is_row_major){
413 // Numpy 2-d arrays are stored in "row major" order
414 temp_value = static_cast<double>(*(data_ptr + static_cast<data_size_t>(num_col) * i + j));
415 } else {
416 // R matrices are stored in "column major" order
417 temp_value = static_cast<double>(*(data_ptr + static_cast<data_size_t>(num_row) * j + i));
418 }
419 basis_.SetElement(i, j, temp_value);
420 }
421 }
422 }
430 void UpdateVarWeights(double* data_ptr, data_size_t num_row, bool exponentiate = true) {
431 CHECK(has_var_weights_);
432 // Copy data from R / Python process memory to Eigen vector
433 double temp_value;
434 for (data_size_t i = 0; i < num_row; ++i) {
435 if (exponentiate) temp_value = std::exp(static_cast<double>(*(data_ptr + i)));
436 else temp_value = static_cast<double>(*(data_ptr + i));
437 var_weights_.SetElement(i, temp_value);
438 }
439 }
447 void SetCovariateValue(data_size_t row_id, int col, double new_value) {
448 covariates_.SetElement(row_id, col, new_value);
449 }
457 void SetBasisValue(data_size_t row_id, int col, double new_value) {
458 CHECK(has_basis_);
459 basis_.SetElement(row_id, col, new_value);
460 }
468 void SetVarWeightValue(data_size_t row_id, double new_value, bool exponentiate = true) {
469 CHECK(has_var_weights_);
470 if (exponentiate) var_weights_.SetElement(row_id, std::exp(new_value));
471 else var_weights_.SetElement(row_id, new_value);
472 }
473 private:
474 ColumnMatrix covariates_;
475 ColumnMatrix basis_;
476 ColumnVector var_weights_;
477 data_size_t num_observations_{0};
478 int num_covariates_{0};
479 int num_basis_{0};
480 bool has_covariates_{false};
481 bool has_basis_{false};
482 bool has_var_weights_{false};
483};
484
487 public:
499 void AddBasis(double* data_ptr, data_size_t num_row, int num_col, bool is_row_major) {
500 basis_ = ColumnMatrix(data_ptr, num_row, num_col, is_row_major);
501 has_basis_ = true;
502 }
509 void AddVarianceWeights(double* data_ptr, data_size_t num_row) {
510 var_weights_ = ColumnVector(data_ptr, num_row);
511 has_var_weights_ = true;
512 }
519 void AddGroupLabels(std::vector<int32_t>& group_labels) {
520 group_labels_ = group_labels;
521 has_group_labels_ = true;
522 }
524 inline data_size_t NumObservations() {return basis_.NumRows();}
526 inline bool HasBasis() {return has_basis_;}
528 inline bool HasVarWeights() {return has_var_weights_;}
530 inline bool HasGroupLabels() {return has_group_labels_;}
537 inline double BasisValue(data_size_t row, int col) {return basis_.GetElement(row, col);}
543 inline double VarWeightValue(data_size_t row) {return var_weights_.GetElement(row);}
549 inline int32_t GroupId(data_size_t row) {return group_labels_[row];}
555 inline Eigen::MatrixXd& GetBasis() {return basis_.GetData();}
561 inline Eigen::VectorXd& GetVarWeights() {return var_weights_.GetData();}
567 inline std::vector<int32_t>& GetGroupLabels() {return group_labels_;}
568 private:
569 ColumnMatrix basis_;
570 ColumnVector var_weights_;
571 std::vector<int32_t> group_labels_;
572 bool has_basis_{false};
573 bool has_var_weights_{false};
574 bool has_group_labels_{false};
575};
576
// end of data_group
578
579} // namespace StochTree
580
581#endif // STOCHTREE_DATA_H_
Internal wrapper around Eigen::MatrixXd interface for multidimensional floating point data.
Definition data.h:133
data_size_t NumRows()
Number of rows in the object's internal Eigen::MatrixXd.
Definition data.h:180
int NumCols()
Number of columns in the object's internal Eigen::MatrixXd.
Definition data.h:182
ColumnMatrix(std::string filename, std::string column_index_string, bool header=true, bool precise_float_parser=false)
Construct a new ColumnMatrix object from CSV file.
Eigen::MatrixXd & GetData()
Return a reference to the object's internal Eigen::MatrixXd, for interfaces that require a raw matrix...
Definition data.h:184
double GetElement(data_size_t row_num, int32_t col_num)
Returns the value stored at (row, col) in the object's internal Eigen::MatrixXd.
Definition data.h:161
void LoadData(double *data_ptr, data_size_t num_row, int num_col, bool is_row_major)
Update the data in a ColumnMatrix object from an in-memory data buffer. This will erase the existing ...
ColumnMatrix(double *data_ptr, data_size_t num_row, int num_col, bool is_row_major)
Construct a new ColumnMatrix object from in-memory data buffer.
void SetElement(data_size_t row_num, int32_t col_num, double value)
Update an observation in the object's internal Eigen::MatrixXd to a new value.
Definition data.h:169
Internal wrapper around Eigen::VectorXd interface for univariate floating point data....
Definition data.h:194
void SetElement(data_size_t row, double value)
Update an observation in the object's internal Eigen::VectorXd to a new value.
Definition data.h:226
void LoadData(double *data_ptr, data_size_t num_row)
Update the data in a ColumnVector object from an in-memory data buffer. This will erase the existing ...
void OverwriteData(double *data_ptr, data_size_t num_row)
Update the data in a ColumnVector object from an in-memory data buffer, by substituting each value ob...
void SubtractFromData(double *data_ptr, data_size_t num_row)
Update the data in a ColumnVector object from an in-memory data buffer, by subtracting each value obt...
ColumnVector(double *data_ptr, data_size_t num_row)
Construct a new ColumnVector object from in-memory data buffer.
Eigen::VectorXd & GetData()
Return a reference to the object's internal Eigen::VectorXd, for interfaces that require a raw vector...
Definition data.h:261
data_size_t NumRows()
Number of rows in the object's internal Eigen::VectorXd.
Definition data.h:259
void AddToData(double *data_ptr, data_size_t num_row)
Update the data in a ColumnVector object from an in-memory data buffer, by adding each value obtained...
double GetElement(data_size_t row)
Returns the value stored at position row in the object's internal Eigen::VectorXd.
Definition data.h:219
ColumnVector(std::string filename, int32_t column_index, bool header=true, bool precise_float_parser=false)
Construct a new ColumnVector object from CSV file.
API for loading and accessing data used to sample tree ensembles The covariates / bases / weights use...
Definition data.h:272
double BasisValue(data_size_t row, int col)
Returns a dataset's basis value stored at (row, col)
Definition data.h:372
void AddVarianceWeights(double *data_ptr, data_size_t num_row)
Copy / load variance weights from raw memory buffer (often pointer to data in a R vector or numpy arr...
Definition data.h:310
data_size_t NumObservations()
Number of observations (rows) in the dataset.
Definition data.h:354
void SetCovariateValue(data_size_t row_id, int col, double new_value)
Update an observation in the internal covariate matrix to a new value.
Definition data.h:447
bool HasCovariates()
Whether or not a ForestDataset has (yet) loaded covariate data.
Definition data.h:348
void SetVarWeightValue(data_size_t row_id, double new_value, bool exponentiate=true)
Update an observation in the internal variance weight vector to a new value.
Definition data.h:468
void AddVarianceWeightsFromCSV(std::string filename, int32_t column_index, bool header=true, bool precise_float_parser=false)
Copy / load variance / case weights from CSV file.
Definition data.h:343
void AddCovariates(double *data_ptr, data_size_t num_row, int num_col, bool is_row_major)
Copy / load covariates from raw memory buffer (often pointer to data in a R matrix or numpy array)
Definition data.h:285
Eigen::MatrixXd & GetCovariates()
Return a reference to the raw Eigen::MatrixXd storing the covariate data.
Definition data.h:384
Eigen::MatrixXd & GetBasis()
Return a reference to the raw Eigen::MatrixXd storing the basis data.
Definition data.h:390
int NumCovariates()
Number of covariate columns in the dataset.
Definition data.h:356
Eigen::VectorXd & GetVarWeights()
Return a reference to the raw Eigen::VectorXd storing the variance weights.
Definition data.h:396
ForestDataset()
Default constructor. No data is loaded at construction time.
Definition data.h:275
void AddBasisFromCSV(std::string filename, std::string column_index_string, bool header=true, bool precise_float_parser=false)
Copy / load basis matrix from CSV file.
Definition data.h:332
void UpdateBasis(double *data_ptr, data_size_t num_row, int num_col, bool is_row_major)
Update the data in the internal basis matrix to new values stored in a raw double array.
Definition data.h:405
bool HasBasis()
Whether or not a ForestDataset has (yet) loaded basis data.
Definition data.h:350
double VarWeightValue(data_size_t row)
Returns a dataset's variance weight stored at element row
Definition data.h:378
void AddCovariatesFromCSV(std::string filename, std::string column_index_string, bool header=true, bool precise_float_parser=false)
Copy / load covariates from CSV file.
Definition data.h:320
bool HasVarWeights()
Whether or not a ForestDataset has (yet) loaded variance weights.
Definition data.h:352
int NumBasis()
Number of bases in the dataset. This is 0 if the dataset has not been provided a basis matrix.
Definition data.h:358
void AddBasis(double *data_ptr, data_size_t num_row, int num_col, bool is_row_major)
Copy / load basis matrix from raw memory buffer (often pointer to data in a R matrix or numpy array)
Definition data.h:299
double CovariateValue(data_size_t row, int col)
Returns a dataset's covariate value stored at (row, col)
Definition data.h:365
void UpdateVarWeights(double *data_ptr, data_size_t num_row, bool exponentiate=true)
Update the data in the internal variance weight vector to new values stored in a raw double array.
Definition data.h:430
void SetBasisValue(data_size_t row_id, int col, double new_value)
Update an observation in the internal basis matrix to a new value.
Definition data.h:457
API for loading and accessing data used to sample (additive) random effects.
Definition data.h:486
bool HasGroupLabels()
Whether or not a RandomEffectsDataset has (yet) loaded group labels.
Definition data.h:530
RandomEffectsDataset()
Default constructor. No data is loaded at construction time.
Definition data.h:489
void AddVarianceWeights(double *data_ptr, data_size_t num_row)
Copy / load variance weights from raw memory buffer (often pointer to data in a R vector or numpy arr...
Definition data.h:509
bool HasVarWeights()
Whether or not a RandomEffectsDataset has (yet) loaded variance weights.
Definition data.h:528
void AddGroupLabels(std::vector< int32_t > &group_labels)
Copy / load group indices for random effects.
Definition data.h:519
void AddBasis(double *data_ptr, data_size_t num_row, int num_col, bool is_row_major)
Copy / load basis matrix from raw memory buffer (often pointer to data in a R matrix or numpy array)
Definition data.h:499
data_size_t NumObservations()
Number of observations (rows) in the dataset.
Definition data.h:524
std::vector< int32_t > & GetGroupLabels()
Return a reference to the raw std::vector storing the group labels.
Definition data.h:567
int32_t GroupId(data_size_t row)
Returns a dataset's group label stored at element row
Definition data.h:549
double BasisValue(data_size_t row, int col)
Returns a dataset's basis value stored at (row, col)
Definition data.h:537
double VarWeightValue(data_size_t row)
Returns a dataset's variance weight stored at element row
Definition data.h:543
Eigen::MatrixXd & GetBasis()
Return a reference to the raw Eigen::MatrixXd storing the basis data.
Definition data.h:555
Eigen::VectorXd & GetVarWeights()
Return a reference to the raw Eigen::VectorXd storing the variance weights.
Definition data.h:561
bool HasBasis()
Whether or not a RandomEffectsDataset has (yet) loaded basis data.
Definition data.h:526
static void ExtractMultipleFeaturesFromMemory(std::vector< std::string > *text_data, const Parser *parser, std::vector< int32_t > &column_indices, Eigen::MatrixXd &data, data_size_t num_rows)
Extract multiple features from the raw data loaded from a file into an Eigen::MatrixXd....
Definition data.h:35
static void ExtractSingleFeatureFromMemory(std::vector< std::string > *text_data, const Parser *parser, int32_t column_index, Eigen::VectorXd &data, data_size_t num_rows)
Extract a single feature from the raw data loaded from a file into an Eigen::VectorXd....
Definition data.h:77
Definition category_tracker.h:40