Multivariate Treatment Causal Inference¶
Load necessary libraries
In [1]:
Copied!
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from stochtree import BCFModel
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from stochtree import BCFModel
Generate sample data
In [2]:
Copied!
# RNG
rng = np.random.default_rng()
# Generate covariates and basis
n = 500
p_X = 5
X = rng.uniform(0, 1, (n, p_X))
pi_X = np.c_[0.25 + 0.5 * X[:, 0], 0.75 - 0.5 * X[:, 1]]
# Z = rng.uniform(0, 1, (n, 2))
Z = rng.binomial(1, pi_X, (n, 2))
# Define the outcome mean functions (prognostic and treatment effects)
mu_X = pi_X[:, 0] * 5 + pi_X[:, 1] * 2 + 2 * X[:, 2]
tau_X = np.stack((X[:, 1], X[:, 2]), axis=-1)
# Generate outcome
epsilon = rng.normal(0, 1, n)
treatment_term = np.multiply(tau_X, Z).sum(axis=1)
y = mu_X + treatment_term + epsilon
# RNG
rng = np.random.default_rng()
# Generate covariates and basis
n = 500
p_X = 5
X = rng.uniform(0, 1, (n, p_X))
pi_X = np.c_[0.25 + 0.5 * X[:, 0], 0.75 - 0.5 * X[:, 1]]
# Z = rng.uniform(0, 1, (n, 2))
Z = rng.binomial(1, pi_X, (n, 2))
# Define the outcome mean functions (prognostic and treatment effects)
mu_X = pi_X[:, 0] * 5 + pi_X[:, 1] * 2 + 2 * X[:, 2]
tau_X = np.stack((X[:, 1], X[:, 2]), axis=-1)
# Generate outcome
epsilon = rng.normal(0, 1, n)
treatment_term = np.multiply(tau_X, Z).sum(axis=1)
y = mu_X + treatment_term + epsilon
Test-train split
In [3]:
Copied!
sample_inds = np.arange(n)
train_inds, test_inds = train_test_split(sample_inds, test_size=0.5)
X_train = X[train_inds, :]
X_test = X[test_inds, :]
Z_train = Z[train_inds, :]
Z_test = Z[test_inds, :]
y_train = y[train_inds]
y_test = y[test_inds]
pi_train = pi_X[train_inds]
pi_test = pi_X[test_inds]
mu_train = mu_X[train_inds]
mu_test = mu_X[test_inds]
tau_train = tau_X[train_inds, :]
tau_test = tau_X[test_inds, :]
sample_inds = np.arange(n)
train_inds, test_inds = train_test_split(sample_inds, test_size=0.5)
X_train = X[train_inds, :]
X_test = X[test_inds, :]
Z_train = Z[train_inds, :]
Z_test = Z[test_inds, :]
y_train = y[train_inds]
y_test = y[test_inds]
pi_train = pi_X[train_inds]
pi_test = pi_X[test_inds]
mu_train = mu_X[train_inds]
mu_test = mu_X[test_inds]
tau_train = tau_X[train_inds, :]
tau_test = tau_X[test_inds, :]
Run BCF
In [4]:
Copied!
bcf_model = BCFModel()
bcf_model.sample(
X_train,
Z_train,
y_train,
pi_train,
X_test,
Z_test,
pi_test,
num_gfr=10,
num_mcmc=100,
)
bcf_model = BCFModel()
bcf_model.sample(
X_train,
Z_train,
y_train,
pi_train,
X_test,
Z_test,
pi_test,
num_gfr=10,
num_mcmc=100,
)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Cell In[4], line 2 1 bcf_model = BCFModel() ----> 2 bcf_model.sample( 3 X_train, 4 Z_train, 5 y_train, 6 pi_train, 7 X_test, 8 Z_test, 9 pi_test, 10 num_gfr=10, 11 num_mcmc=100, 12 ) File /opt/hostedtoolcache/Python/3.10.16/x64/lib/python3.10/site-packages/stochtree/bcf.py:381, in BCFModel.sample(self, X_train, Z_train, y_train, pi_train, rfx_group_ids_train, rfx_basis_train, X_test, Z_test, pi_test, rfx_group_ids_test, rfx_basis_test, num_gfr, num_burnin, num_mcmc, general_params, prognostic_forest_params, treatment_effect_forest_params, variance_forest_params) 379 raise ValueError("rfx_group_ids_train must be a numpy array") 380 if not np.issubdtype(rfx_group_ids_train.dtype, np.integer): --> 381 raise ValueError("rfx_group_ids_train must be a numpy array of integer-valued group IDs") 382 if rfx_basis_train is not None: 383 if not isinstance(rfx_basis_train, np.ndarray): ValueError: rfx_group_ids_train must be a numpy array of integer-valued group IDs
Inspect the MCMC (BART) samples
In [5]:
Copied!
forest_preds_y_mcmc = bcf_model.y_hat_test
y_avg_mcmc = np.squeeze(forest_preds_y_mcmc).mean(axis=1, keepdims=True)
y_df_mcmc = pd.DataFrame(
np.concatenate((np.expand_dims(y_test, 1), y_avg_mcmc), axis=1),
columns=["True outcome", "Average estimated outcome"],
)
sns.scatterplot(data=y_df_mcmc, x="Average estimated outcome", y="True outcome")
plt.axline((0, 0), slope=1, color="black", linestyle=(0, (3, 3)))
plt.show()
forest_preds_y_mcmc = bcf_model.y_hat_test
y_avg_mcmc = np.squeeze(forest_preds_y_mcmc).mean(axis=1, keepdims=True)
y_df_mcmc = pd.DataFrame(
np.concatenate((np.expand_dims(y_test, 1), y_avg_mcmc), axis=1),
columns=["True outcome", "Average estimated outcome"],
)
sns.scatterplot(data=y_df_mcmc, x="Average estimated outcome", y="True outcome")
plt.axline((0, 0), slope=1, color="black", linestyle=(0, (3, 3)))
plt.show()
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) Cell In[5], line 1 ----> 1 forest_preds_y_mcmc = bcf_model.y_hat_test 2 y_avg_mcmc = np.squeeze(forest_preds_y_mcmc).mean(axis=1, keepdims=True) 3 y_df_mcmc = pd.DataFrame( 4 np.concatenate((np.expand_dims(y_test, 1), y_avg_mcmc), axis=1), 5 columns=["True outcome", "Average estimated outcome"], 6 ) AttributeError: 'BCFModel' object has no attribute 'y_hat_test'
In [6]:
Copied!
np.sqrt(np.mean(np.power(y_avg_mcmc - y_test, 2)))
np.sqrt(np.mean(np.power(y_avg_mcmc - y_test, 2)))
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[6], line 1 ----> 1 np.sqrt(np.mean(np.power(y_avg_mcmc - y_test, 2))) NameError: name 'y_avg_mcmc' is not defined
In [7]:
Copied!
treatment_idx = 0
forest_preds_tau_mcmc = np.squeeze(bcf_model.tau_hat_test[:, :, treatment_idx])
tau_avg_mcmc = np.squeeze(forest_preds_tau_mcmc).mean(axis=1, keepdims=True)
tau_df_mcmc = pd.DataFrame(
np.concatenate(
(np.expand_dims(tau_test[:, treatment_idx], 1), tau_avg_mcmc), axis=1
),
columns=["True tau", "Average estimated tau"],
)
sns.scatterplot(data=tau_df_mcmc, x="True tau", y="Average estimated tau")
plt.axline((0, 0), slope=1, color="black", linestyle=(0, (3, 3)))
plt.show()
treatment_idx = 0
forest_preds_tau_mcmc = np.squeeze(bcf_model.tau_hat_test[:, :, treatment_idx])
tau_avg_mcmc = np.squeeze(forest_preds_tau_mcmc).mean(axis=1, keepdims=True)
tau_df_mcmc = pd.DataFrame(
np.concatenate(
(np.expand_dims(tau_test[:, treatment_idx], 1), tau_avg_mcmc), axis=1
),
columns=["True tau", "Average estimated tau"],
)
sns.scatterplot(data=tau_df_mcmc, x="True tau", y="Average estimated tau")
plt.axline((0, 0), slope=1, color="black", linestyle=(0, (3, 3)))
plt.show()
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) Cell In[7], line 2 1 treatment_idx = 0 ----> 2 forest_preds_tau_mcmc = np.squeeze(bcf_model.tau_hat_test[:, :, treatment_idx]) 3 tau_avg_mcmc = np.squeeze(forest_preds_tau_mcmc).mean(axis=1, keepdims=True) 4 tau_df_mcmc = pd.DataFrame( 5 np.concatenate( 6 (np.expand_dims(tau_test[:, treatment_idx], 1), tau_avg_mcmc), axis=1 7 ), 8 columns=["True tau", "Average estimated tau"], 9 ) AttributeError: 'BCFModel' object has no attribute 'tau_hat_test'
In [8]:
Copied!
treatment_idx = 1
forest_preds_tau_mcmc = np.squeeze(bcf_model.tau_hat_test[:, :, treatment_idx])
tau_avg_mcmc = np.squeeze(forest_preds_tau_mcmc).mean(axis=1, keepdims=True)
tau_df_mcmc = pd.DataFrame(
np.concatenate(
(np.expand_dims(tau_test[:, treatment_idx], 1), tau_avg_mcmc), axis=1
),
columns=["True tau", "Average estimated tau"],
)
sns.scatterplot(data=tau_df_mcmc, x="True tau", y="Average estimated tau")
plt.axline((0, 0), slope=1, color="black", linestyle=(0, (3, 3)))
plt.show()
treatment_idx = 1
forest_preds_tau_mcmc = np.squeeze(bcf_model.tau_hat_test[:, :, treatment_idx])
tau_avg_mcmc = np.squeeze(forest_preds_tau_mcmc).mean(axis=1, keepdims=True)
tau_df_mcmc = pd.DataFrame(
np.concatenate(
(np.expand_dims(tau_test[:, treatment_idx], 1), tau_avg_mcmc), axis=1
),
columns=["True tau", "Average estimated tau"],
)
sns.scatterplot(data=tau_df_mcmc, x="True tau", y="Average estimated tau")
plt.axline((0, 0), slope=1, color="black", linestyle=(0, (3, 3)))
plt.show()
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) Cell In[8], line 2 1 treatment_idx = 1 ----> 2 forest_preds_tau_mcmc = np.squeeze(bcf_model.tau_hat_test[:, :, treatment_idx]) 3 tau_avg_mcmc = np.squeeze(forest_preds_tau_mcmc).mean(axis=1, keepdims=True) 4 tau_df_mcmc = pd.DataFrame( 5 np.concatenate( 6 (np.expand_dims(tau_test[:, treatment_idx], 1), tau_avg_mcmc), axis=1 7 ), 8 columns=["True tau", "Average estimated tau"], 9 ) AttributeError: 'BCFModel' object has no attribute 'tau_hat_test'
In [9]:
Copied!
treatment_term_mcmc_test = np.multiply(
np.atleast_3d(Z_test).swapaxes(1, 2), bcf_model.tau_hat_test
).sum(axis=2)
treatment_term_test = np.multiply(tau_test, Z_test).sum(axis=1)
treatment_term_mcmc_avg = np.squeeze(treatment_term_mcmc_test).mean(
axis=1, keepdims=True
)
mu_df_mcmc = pd.DataFrame(
np.concatenate(
(np.expand_dims(treatment_term_test, 1), treatment_term_mcmc_avg), axis=1
),
columns=["True treatment term", "Average estimated treatment term"],
)
sns.scatterplot(
data=mu_df_mcmc, x="True treatment term", y="Average estimated treatment term"
)
plt.axline((0, 0), slope=1, color="black", linestyle=(0, (3, 3)))
plt.show()
treatment_term_mcmc_test = np.multiply(
np.atleast_3d(Z_test).swapaxes(1, 2), bcf_model.tau_hat_test
).sum(axis=2)
treatment_term_test = np.multiply(tau_test, Z_test).sum(axis=1)
treatment_term_mcmc_avg = np.squeeze(treatment_term_mcmc_test).mean(
axis=1, keepdims=True
)
mu_df_mcmc = pd.DataFrame(
np.concatenate(
(np.expand_dims(treatment_term_test, 1), treatment_term_mcmc_avg), axis=1
),
columns=["True treatment term", "Average estimated treatment term"],
)
sns.scatterplot(
data=mu_df_mcmc, x="True treatment term", y="Average estimated treatment term"
)
plt.axline((0, 0), slope=1, color="black", linestyle=(0, (3, 3)))
plt.show()
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) Cell In[9], line 2 1 treatment_term_mcmc_test = np.multiply( ----> 2 np.atleast_3d(Z_test).swapaxes(1, 2), bcf_model.tau_hat_test 3 ).sum(axis=2) 4 treatment_term_test = np.multiply(tau_test, Z_test).sum(axis=1) 5 treatment_term_mcmc_avg = np.squeeze(treatment_term_mcmc_test).mean( 6 axis=1, keepdims=True 7 ) AttributeError: 'BCFModel' object has no attribute 'tau_hat_test'
In [10]:
Copied!
forest_preds_mu_mcmc = bcf_model.mu_hat_test
mu_avg_mcmc = np.squeeze(forest_preds_mu_mcmc).mean(axis=1, keepdims=True)
mu_df_mcmc = pd.DataFrame(
np.concatenate((np.expand_dims(mu_test, 1), mu_avg_mcmc), axis=1),
columns=["True mu", "Average estimated mu"],
)
sns.scatterplot(data=mu_df_mcmc, x="True mu", y="Average estimated mu")
plt.axline((0, 0), slope=1, color="black", linestyle=(0, (3, 3)))
plt.show()
forest_preds_mu_mcmc = bcf_model.mu_hat_test
mu_avg_mcmc = np.squeeze(forest_preds_mu_mcmc).mean(axis=1, keepdims=True)
mu_df_mcmc = pd.DataFrame(
np.concatenate((np.expand_dims(mu_test, 1), mu_avg_mcmc), axis=1),
columns=["True mu", "Average estimated mu"],
)
sns.scatterplot(data=mu_df_mcmc, x="True mu", y="Average estimated mu")
plt.axline((0, 0), slope=1, color="black", linestyle=(0, (3, 3)))
plt.show()
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) Cell In[10], line 1 ----> 1 forest_preds_mu_mcmc = bcf_model.mu_hat_test 2 mu_avg_mcmc = np.squeeze(forest_preds_mu_mcmc).mean(axis=1, keepdims=True) 3 mu_df_mcmc = pd.DataFrame( 4 np.concatenate((np.expand_dims(mu_test, 1), mu_avg_mcmc), axis=1), 5 columns=["True mu", "Average estimated mu"], 6 ) AttributeError: 'BCFModel' object has no attribute 'mu_hat_test'
In [11]:
Copied!
sigma_df_mcmc = pd.DataFrame(
np.concatenate(
(
np.expand_dims(
np.arange(bcf_model.num_samples - bcf_model.num_gfr), axis=1
),
np.expand_dims(bcf_model.global_var_samples[bcf_model.num_gfr :], axis=1),
),
axis=1,
),
columns=["Sample", "Sigma"],
)
sns.scatterplot(data=sigma_df_mcmc, x="Sample", y="Sigma")
plt.show()
sigma_df_mcmc = pd.DataFrame(
np.concatenate(
(
np.expand_dims(
np.arange(bcf_model.num_samples - bcf_model.num_gfr), axis=1
),
np.expand_dims(bcf_model.global_var_samples[bcf_model.num_gfr :], axis=1),
),
axis=1,
),
columns=["Sample", "Sigma"],
)
sns.scatterplot(data=sigma_df_mcmc, x="Sample", y="Sigma")
plt.show()
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) Cell In[11], line 5 1 sigma_df_mcmc = pd.DataFrame( 2 np.concatenate( 3 ( 4 np.expand_dims( ----> 5 np.arange(bcf_model.num_samples - bcf_model.num_gfr), axis=1 6 ), 7 np.expand_dims(bcf_model.global_var_samples[bcf_model.num_gfr :], axis=1), 8 ), 9 axis=1, 10 ), 11 columns=["Sample", "Sigma"], 12 ) 13 sns.scatterplot(data=sigma_df_mcmc, x="Sample", y="Sigma") 14 plt.show() AttributeError: 'BCFModel' object has no attribute 'num_samples'