StochTree 0.1.1
Loading...
Searching...
No Matches
data.h
1
5#ifndef STOCHTREE_DATA_H_
6#define STOCHTREE_DATA_H_
7
8#include <Eigen/Dense>
9#include <stochtree/io.h>
10#include <stochtree/log.h>
11#include <stochtree/meta.h>
12
13namespace StochTree {
14
34static inline void ExtractMultipleFeaturesFromMemory(std::vector<std::string>* text_data, const Parser* parser,
35 std::vector<int32_t>& column_indices, Eigen::MatrixXd& data,
36 data_size_t num_rows) {
37 std::vector<std::pair<int, double>> oneline_features;
38 auto& ref_text_data = *text_data;
39 int feature_counter;
40 bool column_matched;
41 for (data_size_t i = 0; i < num_rows; ++i) {
42 // unpack the vector of textlines read from file into a vector of (int, double) tuples
43 oneline_features.clear();
44 parser->ParseOneLine(ref_text_data[i].c_str(), &oneline_features);
45
46 // free processed line:
47 ref_text_data[i].clear();
48
49 // unload the data from oneline_features vector into the dataset variables containers
50 int feature_counter = 0;
51 for (auto& inner_data : oneline_features) {
52 int feature_idx = inner_data.first;
53 column_matched = (std::find(column_indices.begin(), column_indices.end(), feature_idx)
54 != column_indices.end());
55 if (column_matched){
56 data(i, feature_counter) = inner_data.second;
57 feature_counter += 1;
58 }
59 }
60 }
61 // free text data after use
62 text_data->clear();
63}
64
76static inline void ExtractSingleFeatureFromMemory(std::vector<std::string>* text_data, const Parser* parser,
77 int32_t column_index, Eigen::VectorXd& data, data_size_t num_rows) {
78 std::vector<std::pair<int, double>> oneline_features;
79 auto& ref_text_data = *text_data;
80 bool column_matched;
81 for (data_size_t i = 0; i < num_rows; ++i) {
82 // unpack the vector of textlines read from file into a vector of (int, double) tuples
83 oneline_features.clear();
84 parser->ParseOneLine(ref_text_data[i].c_str(), &oneline_features);
85
86 // free processed line:
87 ref_text_data[i].clear();
88
89 // unload the data from oneline_features vector into the dataset variables containers
90 for (auto& inner_data : oneline_features) {
91 int feature_idx = inner_data.first;
92 if (column_index == feature_idx){
93 data(i) = inner_data.second;
94 }
95 }
96 }
97 // free text data after use
98 text_data->clear();
99}
100
101static inline std::vector<std::string> LoadTextDataToMemory(const char* filename, int* num_global_data, bool header) {
102 size_t file_load_progress_interval_bytes = size_t(10) * 1024 * 1024 * 1024;
103 TextReader<data_size_t> text_reader(filename, header, file_load_progress_interval_bytes);
104 // read all lines
105 *num_global_data = text_reader.ReadAllLines();
106 return std::move(text_reader.Lines());
107}
108
109static inline void FeatureUnpack(std::vector<int32_t>& categorical_variables, const char* var_id) {
110 std::string var_clean = Common::RemoveQuotationSymbol(Common::Trim(var_id));
111 int out;
112 bool success = Common::AtoiAndCheck(var_clean.c_str(), &out);
113 if (success) {
114 categorical_variables.push_back(out);
115 } else {
116 Log::Warning("Parsed variable index %s cannot be cast to an integer", var_clean.c_str());
117 }
118}
119
120static inline std::vector<int> Str2FeatureVec(const char* parameters) {
121 std::vector<int> feature_vec;
122 auto args = Common::Split(parameters, ",");
123 for (auto arg : args) {
124 FeatureUnpack(feature_vec, Common::Trim(arg).c_str());
125 }
126 return feature_vec;
127}
128
133 public:
134 ColumnMatrix() {}
143 ColumnMatrix(double* data_ptr, data_size_t num_row, int num_col, bool is_row_major);
152 ColumnMatrix(std::string filename, std::string column_index_string, bool header = true, bool precise_float_parser = false);
153 ~ColumnMatrix() {}
160 double GetElement(data_size_t row_num, int32_t col_num) {return data_(row_num, col_num);}
168 void SetElement(data_size_t row_num, int32_t col_num, double value) {data_(row_num, col_num) = value;}
177 void LoadData(double* data_ptr, data_size_t num_row, int num_col, bool is_row_major);
179 inline data_size_t NumRows() {return data_.rows();}
181 inline int NumCols() {return data_.cols();}
183 inline Eigen::MatrixXd& GetData() {return data_;}
184 private:
185 Eigen::MatrixXd data_;
186};
187
194 public:
195 ColumnVector() {}
202 ColumnVector(double* data_ptr, data_size_t num_row);
211 ColumnVector(std::string filename, int32_t column_index, bool header = true, bool precise_float_parser = false);
212 ~ColumnVector() {}
218 double GetElement(data_size_t row) {return data_(row);}
225 void SetElement(data_size_t row, double value) {data_(row) = value;}
232 void LoadData(double* data_ptr, data_size_t num_row);
240 void AddToData(double* data_ptr, data_size_t num_row);
248 void SubtractFromData(double* data_ptr, data_size_t num_row);
256 void OverwriteData(double* data_ptr, data_size_t num_row);
258 inline data_size_t NumRows() {return data_.size();}
260 inline Eigen::VectorXd& GetData() {return data_;}
261 private:
262 Eigen::VectorXd data_;
263 void UpdateData(double* data_ptr, data_size_t num_row, std::function<double(double, double)> op);
264};
265
272 public:
275 ~ForestDataset() {}
284 void AddCovariates(double* data_ptr, data_size_t num_row, int num_col, bool is_row_major) {
285 covariates_ = ColumnMatrix(data_ptr, num_row, num_col, is_row_major);
286 num_observations_ = num_row;
287 num_covariates_ = num_col;
288 has_covariates_ = true;
289 }
298 void AddBasis(double* data_ptr, data_size_t num_row, int num_col, bool is_row_major) {
299 basis_ = ColumnMatrix(data_ptr, num_row, num_col, is_row_major);
300 num_basis_ = num_col;
301 has_basis_ = true;
302 }
309 void AddVarianceWeights(double* data_ptr, data_size_t num_row) {
310 var_weights_ = ColumnVector(data_ptr, num_row);
311 has_var_weights_ = true;
312 }
319 void AddCovariatesFromCSV(std::string filename, std::string column_index_string, bool header = true, bool precise_float_parser = false) {
320 covariates_ = ColumnMatrix(filename, column_index_string, header, precise_float_parser);
321 num_observations_ = covariates_.NumRows();
322 num_covariates_ = covariates_.NumCols();
323 has_covariates_ = true;
324 }
331 void AddBasisFromCSV(std::string filename, std::string column_index_string, bool header = true, bool precise_float_parser = false) {
332 basis_ = ColumnMatrix(filename, column_index_string, header, precise_float_parser);
333 num_basis_ = basis_.NumCols();
334 has_basis_ = true;
335 }
342 void AddVarianceWeightsFromCSV(std::string filename, int32_t column_index, bool header = true, bool precise_float_parser = false) {
343 var_weights_ = ColumnVector(filename, column_index, header, precise_float_parser);
344 has_var_weights_ = true;
345 }
347 inline bool HasCovariates() {return has_covariates_;}
349 inline bool HasBasis() {return has_basis_;}
351 inline bool HasVarWeights() {return has_var_weights_;}
353 inline data_size_t NumObservations() {return num_observations_;}
355 inline int NumCovariates() {return num_covariates_;}
357 inline int NumBasis() {return num_basis_;}
364 inline double CovariateValue(data_size_t row, int col) {return covariates_.GetElement(row, col);}
371 inline double BasisValue(data_size_t row, int col) {return basis_.GetElement(row, col);}
377 inline double VarWeightValue(data_size_t row) {return var_weights_.GetElement(row);}
383 inline Eigen::MatrixXd& GetCovariates() {return covariates_.GetData();}
389 inline Eigen::MatrixXd& GetBasis() {return basis_.GetData();}
395 inline Eigen::VectorXd& GetVarWeights() {return var_weights_.GetData();}
404 void UpdateBasis(double* data_ptr, data_size_t num_row, int num_col, bool is_row_major) {
405 CHECK(has_basis_);
406 CHECK_EQ(num_col, num_basis_);
407 // Copy data from R / Python process memory to Eigen matrix
408 double temp_value;
409 for (data_size_t i = 0; i < num_row; ++i) {
410 for (int j = 0; j < num_col; ++j) {
411 if (is_row_major){
412 // Numpy 2-d arrays are stored in "row major" order
413 temp_value = static_cast<double>(*(data_ptr + static_cast<data_size_t>(num_col) * i + j));
414 } else {
415 // R matrices are stored in "column major" order
416 temp_value = static_cast<double>(*(data_ptr + static_cast<data_size_t>(num_row) * j + i));
417 }
418 basis_.SetElement(i, j, temp_value);
419 }
420 }
421 }
429 void UpdateVarWeights(double* data_ptr, data_size_t num_row, bool exponentiate = true) {
430 CHECK(has_var_weights_);
431 // Copy data from R / Python process memory to Eigen vector
432 double temp_value;
433 for (data_size_t i = 0; i < num_row; ++i) {
434 if (exponentiate) temp_value = std::exp(static_cast<double>(*(data_ptr + i)));
435 else temp_value = static_cast<double>(*(data_ptr + i));
436 var_weights_.SetElement(i, temp_value);
437 }
438 }
446 void SetCovariateValue(data_size_t row_id, int col, double new_value) {
447 covariates_.SetElement(row_id, col, new_value);
448 }
456 void SetBasisValue(data_size_t row_id, int col, double new_value) {
457 CHECK(has_basis_);
458 basis_.SetElement(row_id, col, new_value);
459 }
467 void SetVarWeightValue(data_size_t row_id, double new_value, bool exponentiate = true) {
468 CHECK(has_var_weights_);
469 if (exponentiate) var_weights_.SetElement(row_id, std::exp(new_value));
470 else var_weights_.SetElement(row_id, new_value);
471 }
472 private:
473 ColumnMatrix covariates_;
474 ColumnMatrix basis_;
475 ColumnVector var_weights_;
476 data_size_t num_observations_{0};
477 int num_covariates_{0};
478 int num_basis_{0};
479 bool has_covariates_{false};
480 bool has_basis_{false};
481 bool has_var_weights_{false};
482};
483
486 public:
498 void AddBasis(double* data_ptr, data_size_t num_row, int num_col, bool is_row_major) {
499 basis_ = ColumnMatrix(data_ptr, num_row, num_col, is_row_major);
500 has_basis_ = true;
501 }
508 void AddVarianceWeights(double* data_ptr, data_size_t num_row) {
509 var_weights_ = ColumnVector(data_ptr, num_row);
510 has_var_weights_ = true;
511 }
518 void AddGroupLabels(std::vector<int32_t>& group_labels) {
519 group_labels_ = group_labels;
520 has_group_labels_ = true;
521 }
523 inline data_size_t NumObservations() {return basis_.NumRows();}
525 inline int NumBases() {return basis_.NumCols();}
527 inline bool HasBasis() {return has_basis_;}
529 inline bool HasVarWeights() {return has_var_weights_;}
531 inline bool HasGroupLabels() {return has_group_labels_;}
538 inline double BasisValue(data_size_t row, int col) {return basis_.GetElement(row, col);}
544 inline double VarWeightValue(data_size_t row) {return var_weights_.GetElement(row);}
550 inline int32_t GroupId(data_size_t row) {return group_labels_[row];}
556 inline Eigen::MatrixXd& GetBasis() {return basis_.GetData();}
562 inline Eigen::VectorXd& GetVarWeights() {return var_weights_.GetData();}
568 inline std::vector<int32_t>& GetGroupLabels() {return group_labels_;}
569 private:
570 ColumnMatrix basis_;
571 ColumnVector var_weights_;
572 std::vector<int32_t> group_labels_;
573 bool has_basis_{false};
574 bool has_var_weights_{false};
575 bool has_group_labels_{false};
576};
577
// end of data_group
579
580} // namespace StochTree
581
582#endif // STOCHTREE_DATA_H_
Internal wrapper around Eigen::MatrixXd interface for multidimensional floating point data.
Definition data.h:132
data_size_t NumRows()
Number of rows in the object's internal Eigen::MatrixXd.
Definition data.h:179
int NumCols()
Number of columns in the object's internal Eigen::MatrixXd.
Definition data.h:181
ColumnMatrix(std::string filename, std::string column_index_string, bool header=true, bool precise_float_parser=false)
Construct a new ColumnMatrix object from CSV file.
Eigen::MatrixXd & GetData()
Return a reference to the object's internal Eigen::MatrixXd, for interfaces that require a raw matrix...
Definition data.h:183
double GetElement(data_size_t row_num, int32_t col_num)
Returns the value stored at (row, col) in the object's internal Eigen::MatrixXd.
Definition data.h:160
void LoadData(double *data_ptr, data_size_t num_row, int num_col, bool is_row_major)
Update the data in a ColumnMatrix object from an in-memory data buffer. This will erase the existing ...
ColumnMatrix(double *data_ptr, data_size_t num_row, int num_col, bool is_row_major)
Construct a new ColumnMatrix object from in-memory data buffer.
void SetElement(data_size_t row_num, int32_t col_num, double value)
Update an observation in the object's internal Eigen::MatrixXd to a new value.
Definition data.h:168
Internal wrapper around Eigen::VectorXd interface for univariate floating point data....
Definition data.h:193
void SetElement(data_size_t row, double value)
Returns the value stored at position row in the object's internal Eigen::VectorXd.
Definition data.h:225
void LoadData(double *data_ptr, data_size_t num_row)
Update the data in a ColumnVector object from an in-memory data buffer. This will erase the existing ...
void OverwriteData(double *data_ptr, data_size_t num_row)
Update the data in a ColumnVector object from an in-memory data buffer, by substituting each value ob...
void SubtractFromData(double *data_ptr, data_size_t num_row)
Update the data in a ColumnVector object from an in-memory data buffer, by subtracting each value obt...
ColumnVector(double *data_ptr, data_size_t num_row)
Construct a new ColumnVector object from in-memory data buffer.
Eigen::VectorXd & GetData()
Return a reference to the object's internal Eigen::VectorXd, for interfaces that require a raw vector...
Definition data.h:260
data_size_t NumRows()
Number of rows in the object's internal Eigen::VectorXd.
Definition data.h:258
void AddToData(double *data_ptr, data_size_t num_row)
Update the data in a ColumnVector object from an in-memory data buffer, by adding each value obtained...
double GetElement(data_size_t row)
Returns the value stored at position row in the object's internal Eigen::VectorXd.
Definition data.h:218
ColumnVector(std::string filename, int32_t column_index, bool header=true, bool precise_float_parser=false)
Construct a new ColumnMatrix object from CSV file.
API for loading and accessing data used to sample tree ensembles The covariates / bases / weights use...
Definition data.h:271
double BasisValue(data_size_t row, int col)
Returns a dataset's basis value stored at (row, col)
Definition data.h:371
void AddVarianceWeights(double *data_ptr, data_size_t num_row)
Copy / load variance weights from raw memory buffer (often pointer to data in a R vector or numpy arr...
Definition data.h:309
data_size_t NumObservations()
Number of observations (rows) in the dataset.
Definition data.h:353
void SetCovariateValue(data_size_t row_id, int col, double new_value)
Update an observation in the internal covariate matrix to a new value.
Definition data.h:446
bool HasCovariates()
Whether or not a ForestDataset has (yet) loaded covariate data.
Definition data.h:347
void SetVarWeightValue(data_size_t row_id, double new_value, bool exponentiate=true)
Update an observation in the internal variance weight vector to a new value.
Definition data.h:467
void AddVarianceWeightsFromCSV(std::string filename, int32_t column_index, bool header=true, bool precise_float_parser=false)
Copy / load variance / case weights from CSV file.
Definition data.h:342
void AddCovariates(double *data_ptr, data_size_t num_row, int num_col, bool is_row_major)
Copy / load covariates from raw memory buffer (often pointer to data in a R matrix or numpy array)
Definition data.h:284
Eigen::MatrixXd & GetCovariates()
Return a reference to the raw Eigen::MatrixXd storing the covariate data.
Definition data.h:383
Eigen::MatrixXd & GetBasis()
Return a reference to the raw Eigen::MatrixXd storing the basis data.
Definition data.h:389
int NumCovariates()
Number of covariate columns in the dataset.
Definition data.h:355
Eigen::VectorXd & GetVarWeights()
Return a reference to the raw Eigen::VectorXd storing the variance weights.
Definition data.h:395
ForestDataset()
Default constructor. No data is loaded at construction time.
Definition data.h:274
void AddBasisFromCSV(std::string filename, std::string column_index_string, bool header=true, bool precise_float_parser=false)
Copy / load basis matrix from CSV file.
Definition data.h:331
void UpdateBasis(double *data_ptr, data_size_t num_row, int num_col, bool is_row_major)
Update the data in the internal basis matrix to new values stored in a raw double array.
Definition data.h:404
bool HasBasis()
Whether or not a ForestDataset has (yet) loaded basis data.
Definition data.h:349
double VarWeightValue(data_size_t row)
Returns a dataset's variance weight stored at element row
Definition data.h:377
void AddCovariatesFromCSV(std::string filename, std::string column_index_string, bool header=true, bool precise_float_parser=false)
Copy / load covariates from CSV file.
Definition data.h:319
bool HasVarWeights()
Whether or not a ForestDataset has (yet) loaded variance weights.
Definition data.h:351
int NumBasis()
Number of bases in the dataset. This is 0 if the dataset has not been provided a basis matrix.
Definition data.h:357
void AddBasis(double *data_ptr, data_size_t num_row, int num_col, bool is_row_major)
Copy / load basis matrix from raw memory buffer (often pointer to data in a R matrix or numpy array)
Definition data.h:298
double CovariateValue(data_size_t row, int col)
Returns a dataset's covariate value stored at (row, col)
Definition data.h:364
void UpdateVarWeights(double *data_ptr, data_size_t num_row, bool exponentiate=true)
Update the data in the internal variance weight vector to new values stored in a raw double array.
Definition data.h:429
void SetBasisValue(data_size_t row_id, int col, double new_value)
Update an observation in the internal basis matrix to a new value.
Definition data.h:456
API for loading and accessing data used to sample (additive) random effects.
Definition data.h:485
bool HasGroupLabels()
Whether or not a RandomEffectsDataset has (yet) loaded group labels.
Definition data.h:531
RandomEffectsDataset()
Default constructor. No data is loaded at construction time.
Definition data.h:488
void AddVarianceWeights(double *data_ptr, data_size_t num_row)
Copy / load variance weights from raw memory buffer (often pointer to data in a R vector or numpy arr...
Definition data.h:508
bool HasVarWeights()
Whether or not a RandomEffectsDataset has (yet) loaded variance weights.
Definition data.h:529
void AddGroupLabels(std::vector< int32_t > &group_labels)
Copy / load group indices for random effects.
Definition data.h:518
void AddBasis(double *data_ptr, data_size_t num_row, int num_col, bool is_row_major)
Copy / load basis matrix from raw memory buffer (often pointer to data in a R matrix or numpy array)
Definition data.h:498
data_size_t NumObservations()
Number of observations (rows) in the dataset.
Definition data.h:523
std::vector< int32_t > & GetGroupLabels()
Return a reference to the raw std::vector storing the group labels.
Definition data.h:568
int32_t GroupId(data_size_t row)
Returns a dataset's group label stored at element row
Definition data.h:550
double BasisValue(data_size_t row, int col)
Returns a dataset's basis value stored at (row, col)
Definition data.h:538
double VarWeightValue(data_size_t row)
Returns a dataset's variance weight stored at element row
Definition data.h:544
int NumBases()
Number of columns of the basis vector in the dataset.
Definition data.h:525
Eigen::MatrixXd & GetBasis()
Return a reference to the raw Eigen::MatrixXd storing the basis data.
Definition data.h:556
Eigen::VectorXd & GetVarWeights()
Return a reference to the raw Eigen::VectorXd storing the variance weights.
Definition data.h:562
bool HasBasis()
Whether or not a RandomEffectsDataset has (yet) loaded basis data.
Definition data.h:527
static void ExtractMultipleFeaturesFromMemory(std::vector< std::string > *text_data, const Parser *parser, std::vector< int32_t > &column_indices, Eigen::MatrixXd &data, data_size_t num_rows)
Extract multiple features from the raw data loaded from a file into an Eigen::MatrixXd....
Definition data.h:34
static void ExtractSingleFeatureFromMemory(std::vector< std::string > *text_data, const Parser *parser, int32_t column_index, Eigen::VectorXd &data, data_size_t num_rows)
Extract a single feature from the raw data loaded from a file into an Eigen::VectorXd....
Definition data.h:76
Definition category_tracker.h:36