Example neural networks used to model Alzheimer patient data with comparisons to more typical machine learning models.
%run -m ipy_startup
%matplotlib inline
# Plotly Initialization
import plotly as plty
import plotly.graph_objs as go
import cufflinks as cf
cf.set_config_file(offline=True, theme='white', offline_link_text=None, offline_show_link=False)
import tensorflow as tf
# Global random seed used for reproducibility throughout the notebook
SEED = 1
# Load in the Alzheimer's dataset
# NOTE(review): `pd`/`np`/`os` appear to be injected by the ipy_startup script above -- confirm
d = pd.read_csv('~/Downloads/Alzheimers.csv')
# Drop this unnecessary column (presumably redundant with 'gender' -- confirm against the raw data)
d = d.drop('male', axis=1)
# Normalize gender values to a single uppercase letter (first character, e.g. 'M'/'F')
d['gender'] = d['gender'].str.upper().str[0]
d['gender'].value_counts()
d.head()
def encode(df):
    """Return a copy of ``df`` with gender/genotype one-hot encoded and the
    response binarized (Impaired -> 1, NotImpaired -> 0).

    The input frame is not modified.
    """
    out = df.copy()
    # One-hot encode each categorical column in turn; dummy columns are
    # appended in the same order as the original back-to-back calls produced
    for source_col, prefix in (('gender', 'gender'), ('Genotype', 'genotype')):
        out = pd.get_dummies(out, prefix=prefix, prefix_sep=':', columns=[source_col])
    out['response'] = out['response'].map({'NotImpaired': 0, 'Impaired': 1})
    return out
def get_index_split(df, n_train=266, seed=None):
    """Randomly split the row positions of ``df`` into train/test index arrays.

    Parameters
    ----------
    df : sized object (DataFrame) whose rows are being split
    n_train : number of rows sampled (without replacement) for training
    seed : RNG seed; defaults to the notebook-global ``SEED`` so existing
        ``get_index_split(d)`` call sites behave exactly as before

    Returns
    -------
    (idx_train, idx_test) : disjoint integer arrays covering range(len(df))
    """
    idx = np.arange(len(df))
    # Use a local RandomState instead of np.random.seed so we don't clobber the
    # process-wide RNG state; RandomState(s).choice yields the same sequence
    # as np.random.seed(s); np.random.choice(...), preserving prior behavior.
    rng = np.random.RandomState(SEED if seed is None else seed)
    idx_train = rng.choice(idx, size=n_train, replace=False)
    idx_test = np.setdiff1d(idx, idx_train)
    return idx_train, idx_test
# Get two sets of indexes we can use to split data for training and validation
idx_train, idx_test = get_index_split(d)
d_enc = encode(d)
# Now we have only numeric features
pd.set_option('display.max_info_columns', 1000)
d_enc.info()
d_enc.head()
d_enc.dtypes.value_counts()
d_enc.values.dtype
Split data into responses and features:
# Split training and test data into features and response
# (the same idx_train/idx_test positions are used for X and y, keeping rows aligned)
X_train, y_train = d_enc.iloc[idx_train].drop('response', axis=1), d_enc.iloc[idx_train]['response']
X_test, y_test = d_enc.iloc[idx_test].drop('response', axis=1), d_enc.iloc[idx_test]['response']
Define the models we'll compare some Tensorflow networks to later:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, log_loss, average_precision_score
# Define several common learning models
# Scale-sensitive models (LR, SVM, MLP) are wrapped in a StandardScaler pipeline;
# 'Baseline' is a majority-class dummy for reference
classifiers = {
    'Gradient Boosted Trees': GradientBoostingClassifier(random_state=SEED),
    'XGBoost': XGBClassifier(seed=SEED),
    'Random Forest': RandomForestClassifier(random_state=SEED),
    'Logistic Regression': Pipeline([
        ('scale', StandardScaler()),
        ('clf', LogisticRegression())
    ]),
    'SVM': Pipeline([
        ('scale', StandardScaler()),
        # Grid search over the regularization strength C on a log scale
        ('clf', GridSearchCV(SVC(probability=True, kernel='linear', random_state=SEED), {'C': np.logspace(-6, 6, 15)}))
    ]),
    'Neural Network': Pipeline([
        ('scale', StandardScaler()),
        ('clf', MLPClassifier(random_state=SEED))
    ]),
    'Baseline': DummyClassifier(strategy='most_frequent')
}
Now define some functions that will determine learning model performance:
def get_scores(y_pred, y_proba, y_true=None):
    """Score binary predictions with accuracy, log-loss and precision-recall AUC.

    Parameters
    ----------
    y_pred : hard 0/1 class predictions
    y_proba : predicted probabilities of the positive class
    y_true : true labels; defaults to the notebook-global ``y_test`` so the
        original ``get_scores(y_pred, y_proba)`` call sites keep working

    Returns
    -------
    dict with keys 'accuracy', 'logloss', 'prauc'
    """
    if y_true is None:
        y_true = y_test
    # Clip probabilities away from exactly 0/1 so log_loss cannot blow up to inf
    eps = 1e-6
    y_proba = y_proba.clip(eps, 1 - eps)
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'logloss': log_loss(y_true, y_proba),
        'prauc': average_precision_score(y_true, y_proba)
    }
def get_classifier_scores(clf_name):
    """Fit the named model from ``classifiers`` on the training split, score it
    on the test split, and return the score dict tagged with name and model."""
    model = classifiers[clf_name]
    model.fit(X_train, y_train)
    # Probability of the positive (Impaired) class
    proba = model.predict_proba(X_test)[:, 1]
    preds = model.predict(X_test)
    result = get_scores(preds, proba)
    result.update(name=clf_name, model=model)
    return result
Finally, score all the different models and visualize their results:
# Fit and score every classifier defined above, indexed by model name
scores = pd.DataFrame([get_classifier_scores(clf_name) for clf_name in classifiers]).set_index('name')
scores
scores['accuracy'].sort_values().iplot(kind='scatter', title='Accuracy')
scores['logloss'].sort_values(ascending=False).iplot(kind='scatter', title='LogLoss')
scores['prauc'].sort_values().iplot(kind='scatter', title='Precision-Recall AUC')
Take a look at the logistic regression coefficients:
# Pull the fitted LogisticRegression out of its pipeline and pair each coefficient with its feature name
coefs = pd.Series(scores.loc['Logistic Regression']['model'].named_steps['clf'].coef_[0], index=X_train.columns)
coefs.sort_values().iplot()
# For reference, take note of the frequency of the responses in the test and training data:
pd.concat([
    y_test.value_counts(normalize=True).rename('test_pct'),
    y_test.value_counts(normalize=False).rename('test_counts'),
    y_train.value_counts(normalize=True).rename('train_pct'),
    y_train.value_counts(normalize=False).rename('train_counts')
], axis=1)
We're going to build one of these here:
from IPython.display import Image
Image(url='media/network_architectures/Slide8.png', width=600)
import tensorflow as tf
from ml.tensorflow.utilities import tf_print
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from tensorflow.contrib.learn.python.learn.estimators.estimator import SKCompat
from tensorflow.python.ops import logging_ops
# Tensorflow logging levels can be set like this:
# tf.logging.set_verbosity(tf.logging.WARN)
def model_fn(X, y, mode, params):
    """Single-hidden-layer MLP binary classifier for tf.contrib.learn.Estimator.

    Parameters
    ----------
    X : 2-D float tensor (n, p) -- scaled feature matrix
    y : 1-D tensor of binary labels
    mode : one of tf.contrib.learn.ModeKeys
    params : dict with keys 'num_hidden', 'alpha' (L2 strength), 'learning_rate'
    """
    tf.set_random_seed(SEED)
    np.random.seed(SEED)
    # Number of hidden neurons
    h = params['num_hidden']
    # Number of datapoints and features
    n, p = X.shape.as_list()
    # It's worth noting that you can interpret the "mode" by matching it to one of tf.contrib.learn.ModeKeys
    # Define the first level weights and bias
    b0 = tf.Variable(tf.random_normal([h], seed=SEED), name='b0')
    w0 = tf.Variable(tf.random_normal([p, h], seed=SEED), name='w0')
    # Define second level weights and bias
    b1 = tf.Variable(tf.random_normal([1], seed=SEED), name='b1')
    w1 = tf.Variable(tf.random_normal([h, 1], seed=SEED), name='w1')
    # Get activations coming out of first layer (n, h)
    z1 = tf.nn.relu(b0 + tf.matmul(X, w0))
    # Get output value (n, 1)
    z2 = b1 + tf.matmul(z1, w1)
    tf.summary.histogram("z2", z2)
    # NOTE(review): this manual sigmoid can overflow for large |z2| -- the
    # "Flow" notes later in this file show this is intentional for the demo;
    # tf.nn.sigmoid (commented below) is the numerically stable form
    yp = 1 / (1 + tf.exp(-z2))
    #yp = tf.nn.sigmoid(z2)
    # yp is now 2-D tensor of shape n x 1 [technically (?, 1)], but since we don't need the column
    # dimension of 1 and the "y" tensor is just 1-D, we'll remove the column dimension to make
    # yp a 1-D tensor using the `squeeze` function
    yp = tf.squeeze(yp, [1])
    tf.summary.histogram("yp", yp)
    #yp = tf_print(yp, lambda x: [x.min(), x.max()])
    # Define and apply a regularizer to our weights
    reg = tf.contrib.layers.l2_regularizer(params['alpha'])
    reg_loss = tf.identity(reg(w0) + reg(w1), name='regloss')
    # Compute log loss given 1D true labels and predictions
    log_loss = tf.identity(tf.losses.log_loss(y, yp), name='logloss')
    # Add regularization loss to log loss to create the overall loss we want to minimize
    loss = tf.identity(log_loss + reg_loss, name='loss')
    # Define optimization algorithm and parameters
    train_op = tf.contrib.layers.optimize_loss(
        loss=loss,
        global_step=tf.contrib.framework.get_global_step(),
        learning_rate=params['learning_rate'],
        optimizer="Adam"
    )
    #hook = tf.train.LoggingTensorHook([loss, p], every_n_iter=1)
    return tf.contrib.learn.ModelFnOps(
        mode, loss=loss, predictions={'predictions': yp},
        train_op=train_op,
        #training_hooks=[hook]
    )
# Parameters we've used in our model (we can define and use whatever we want here)
model_params = {
    'learning_rate':.1,
    'alpha': .01,
    'num_hidden': 100
}
# This is the directory in which tensorflow will store info about the model
model_dir = '/tmp/tf/model3'
# Clear out any checkpoints/summaries from previous runs so training starts fresh
!rm $model_dir/*
# These configurations are almost always key, they define how often summary data is
# stored as well as how many cores are used and how much GPU memory is used
# (on my mac, display wigs out if you run at 100% GPU memory usage)
model_config = tf.contrib.learn.RunConfig(
    num_cores=1,
    tf_random_seed=SEED,
    save_summary_steps=10,
    gpu_memory_fraction=.5
)
# Define the actual estimator using all of the above and add it to a pipeline like other scikit-learn models
est = tf.contrib.learn.Estimator(
    model_fn=model_fn,
    params=model_params,
    model_dir=model_dir,
    config=model_config
)
# SKCompat wraps the estimator with a scikit-learn style fit/predict interface
est = Pipeline([
    ('scale', StandardScaler()),
    ('est', SKCompat(est))
])
# Train the network
est = est.fit(X_train.values, y_train.values, est__max_steps=500, est__batch_size=500)
#est = est.fit(X_train.values.astype(np.float32), y_train.astype(np.int32), est__max_steps=500, est__batch_size=500)
# Flow:
# 1. show float32 vs float64 (point out logging levels and leave at info for now)
# 2. show NaN training error
# 3. show Tensorboard w/ nothing
# 4. show logging hook
# 5. show print hook w/ transformation (show p=0)
# 6. show fixing with clip_by_value on z2 (now printed p's are in reasonable range)
# 7. remove print p, change to histogram, show tensorboard
# 8. increase save_summary_steps
# - gpu_memory_fraction
# - tf_random_seed
# Reset for the above flow (use after making changes to restart)
# import tensorflow as tf
# from ml.tensorflow.utilities import tf_print
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import StandardScaler
# from tensorflow.contrib.learn.python.learn.estimators.estimator import SKCompat
# from tensorflow.python.ops import logging_ops
# # Tensorflow logging levels can be set like this:
# # tf.logging.set_verbosity(tf.logging.WARN)
# def model_fn(X, y, mode, params):
# tf.set_random_seed(SEED)
# np.random.seed(SEED)
# # Number of hidden neurons
# h = params['num_hidden']
# # Number of datapoints and features
# n, p = X.shape.as_list()
# # It's worth noting that you can interpret the "mode" by matching it to one of tf.contrib.learn.ModeKeys
# # Define the first level weights and bias
# b0 = tf.Variable(tf.random_normal([h], seed=SEED), name='b0')
# w0 = tf.Variable(tf.random_normal([p, h], seed=SEED), name='w0')
# # Define second level weights and bias
# b1 = tf.Variable(tf.random_normal([1], seed=SEED), name='b1')
# w1 = tf.Variable(tf.random_normal([h, 1], seed=SEED), name='w1')
# # Get activations coming out of first layer (n, h)
# z1 = tf.nn.relu(b0 + tf.matmul(X, w0))
# # Get output value (n, 1)
# z2 = b1 + tf.matmul(z1, w1)
# #z2 = tf_print(z2, transform=lambda x: [x.min(), x.max()])
# #z2 = tf.clip_by_value(z2, -30, 30)
# tf.summary.histogram("z2", z2)
# yp = 1 / (1 + tf.exp(-z2))
# #yp = tf.nn.sigmoid(z2)
# # yp is now 2-D tensor of shape n x 1 [technically (?, 1)], but since we don't need the column
# # dimension of 1 and the "y" tensor is just 1-D, we'll remove the column dimension to make
# # yp a 1-D tensor using the `squeeze` function
# yp = tf.squeeze(yp, [1])
# tf.summary.histogram("yp", yp)
# #yp = tf_print(yp, transform=lambda x: 'yp = {}'.format([x.min(), x.max()]))
# # Define and apply a regularizer to our weights
# reg = tf.contrib.layers.l2_regularizer(params['alpha'])
# reg_loss = tf.identity(reg(w0) + reg(w1), name='regloss')
# # Compute log loss given 1D true labels and predictions
# log_loss = tf.identity(tf.losses.log_loss(y, yp), name='logloss')
# # Add regularization loss to log loss to create the overall loss we want to minimize
# loss = tf.identity(log_loss + reg_loss, name='loss')
# # Define optimization algorithm and parameters
# train_op = tf.contrib.layers.optimize_loss(
# loss=loss,
# global_step=tf.contrib.framework.get_global_step(),
# learning_rate=params['learning_rate'],
# optimizer="Adam"
# )
# #hook = tf.train.LoggingTensorHook([loss, p], every_n_iter=1)
# return tf.contrib.learn.ModelFnOps(
# mode, loss=loss, predictions={'predictions': yp},
# train_op=train_op,
# #training_hooks=[hook]
# )
# # Parameters we've used in our model (we can define and use whatever we want here)
# model_params = {
# 'learning_rate':.1,
# 'alpha': .01,
# 'num_hidden': 25
# }
# # This is the directory in which tensorflow will store info about the model
# model_dir = '/tmp/tf/model3'
# !rm $model_dir/*
# # These configurations are almost always key, they define how often summary data is
# # stored as well as how many cores are used and how much GPU memory is used
# # (on my mac, display wigs out if you run at 100% GPU memory usage)
# model_config = tf.contrib.learn.RunConfig(
# num_cores=1,
# tf_random_seed=SEED,
# save_summary_steps=10,
# gpu_memory_fraction=.5
# )
# # Define the actual estimator using all of the above and add it to a pipeline like other scikit-learn models
# est = tf.contrib.learn.Estimator(
# model_fn=model_fn,
# params=model_params,
# model_dir=model_dir,
# config=model_config
# )
# est = Pipeline([
# ('scale', StandardScaler()),
# ('est', SKCompat(est))
# ])
# # Train the network
# est = est.fit(X_train.values, y_train.values, est__max_steps=500, est__batch_size=500)
# #est = est.fit(X_train.values.astype(np.float32), y_train.astype(np.int32), est__max_steps=500, est__batch_size=500)
# # Flow:
# # 1. show float32 vs float64 (point out logging levels and leave at info for now)
# # 2. show NaN training error
# # 3. show Tensorboard w/ nothing
# # 4. show logging hook
# # 5. show print hook w/ transformation (show p=0)
# # 6. show fixing with clip_by_value on z2 (now printed p's are in reasonable range)
# # 7. remove print p, change to histogram, show tensorboard
# # 8. increase save_summary_steps
# # - gpu_memory_fraction
# # - tf_random_seed
Check Tensorboard for some training details ...
The TF estimator can be referenced like this:
est.named_steps['est']._estimator
# Rebuild the test split and compute predicted probabilities from the trained network
X_test, y_test = d_enc.iloc[idx_test].drop('response', axis=1), d_enc.iloc[idx_test]['response']
y_proba = est.predict(X_test.values.astype(np.float32))['predictions']
# Hard class predictions from thresholding the probabilities at .5
y_pred = np.where(y_proba >= .5, 1, 0)
scores = get_scores(y_pred, y_proba)
scores
At an accuracy of ~80%, this is right on par with the sklearn MLP model
Now let's try building a model that combines the advantages of neural networks and linear approaches.
We want to build this:
from IPython.display import Image
Image(url='media/network_architectures/Slide9.png', width=800)
First, we'll have to be able to split up the features into the different input sets:
X_train.filter(regex='^gender|^genotype|^age').info()
# Generate a list of column indexes corresponding to features in each set
# - c_dem will be the indexes of the demographic features
# - c_arr for the protein array features
c_all = X_train.columns.tolist()
c_dem = X_train.filter(regex='^gender|^genotype|^age').columns
c_arr = X_train.columns.difference(c_dem)
# Convert column labels to positional indexes (used later when slicing tensors)
c_dem = [c_all.index(c) for c in c_dem.tolist()]
c_arr = [c_all.index(c) for c in c_arr.tolist()]
X_train.iloc[:,c_dem].info()
X_train.shape
import tensorflow as tf
from ml.tensorflow.utilities import tf_print
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from tensorflow.contrib.learn.python.learn.estimators.estimator import SKCompat
from tensorflow.python.ops import logging_ops
SEED = 1
def model_fn(X, y, mode, params):
    """Hybrid linear (demographic) + MLP (protein array) binary classifier.

    Parameters
    ----------
    X : 2-tuple (X_dem, X_arr) of float tensors, as produced by
        ``feature_engineering_fn``
    y : 1-D tensor of binary labels
    mode : one of tf.contrib.learn.ModeKeys
    params : dict with 'num_hidden', 'alpha1', 'alpha2', 'learning_rate'

    Each sub-model emits a probability; a learned scalar ``w_split`` (passed
    through a sigmoid) mixes the two into the overall prediction.
    """
    X_dem, X_arr = X
    # Number of demographic and protein array features.  Named n_* to avoid the
    # earlier shadowing where these ints were later rebound to the p_dem/p_arr
    # probability tensors.
    n_dem = X_dem.shape.as_list()[1]
    n_arr = X_arr.shape.as_list()[1]
    ##### Demographic Sub-Model ######
    # Simple linear model on the logit scale; clipped to avoid sigmoid saturation.
    # seed=SEED added for consistency with the other variable initializers below.
    c0 = tf.Variable(tf.random_normal([1], seed=SEED), name='dem_bias')
    c1 = tf.Variable(tf.random_normal([n_dem, 1], seed=SEED), name='dem_weight')
    o_dem = tf.clip_by_value(tf.matmul(X_dem, c1) + c0, -30, 30, name='o_dem')
    ##### Protein Sub-Model ######
    # Number of hidden neurons in protein array model
    h = params['num_hidden']
    # Standard one-layer NN specification
    b0 = tf.Variable(tf.random_normal([h], seed=SEED), name='b0')
    w0 = tf.Variable(tf.random_normal([n_arr, h], seed=SEED), name='w0')
    b1 = tf.Variable(tf.random_normal([1], seed=SEED), name='b1')
    w1 = tf.Variable(tf.random_normal([h, 1], seed=SEED), name='w1')
    z1 = tf.nn.sigmoid(b0 + tf.matmul(X_arr, w0), name='z1')
    o_arr = tf.clip_by_value(b1 + tf.matmul(z1, w1), -30, 30, name='o_arr')
    ##### Sub-Model Voting #####
    # Pass the outputs on the logit scale from the two models through a sigmoid to get probabilities
    p_dem = tf.nn.sigmoid(o_dem, name='p_dem')
    p_arr = tf.nn.sigmoid(o_arr, name='p_arr')
    # Create a "split" variable that will determine how much weight goes to each submodel's prediction
    w_split = tf.Variable(0., name='w_split')
    p_split = tf.nn.sigmoid(w_split)
    # Set the overall prediction as the split weight times each of the sub-model predictions
    yp = p_split * p_dem + (1. - p_split) * p_arr
    yp = tf.squeeze(yp, [1])
    ##### Summarization #####
    tf.summary.histogram("w0", w0)
    # BUGFIX: the "c1" summary previously logged p_arr and the "p_arr" summary
    # previously logged p_dem; both now log the tensors their tags name.
    tf.summary.histogram("c1", c1)
    tf.summary.histogram("yp", yp)
    tf.summary.histogram("p_dem", p_dem)
    tf.summary.histogram("p_arr", p_arr)
    tf.summary.scalar("w_split", w_split)
    tf.summary.scalar("p_split", p_split)
    ##### Regularization #####
    # L2 penalties: alpha1 on sub-model weights, alpha2 on the voting weight
    reg1 = tf.contrib.layers.l2_regularizer(params['alpha1'])
    reg2 = tf.contrib.layers.l2_regularizer(params['alpha2'])
    reg_loss = tf.identity(reg1(w0) + reg1(w1) + reg1(c1) + reg2(w_split), name='regloss')
    ##### Loss #####
    log_loss = tf.identity(tf.losses.log_loss(y, yp), name='logloss')
    ##### Optimization #####
    loss = tf.identity(log_loss + reg_loss, name='loss')
    train_op = tf.contrib.layers.optimize_loss(
        loss=loss,
        global_step=tf.contrib.framework.get_global_step(),
        learning_rate=params['learning_rate'],
        optimizer="Adam"
    )
    # Return predictions for the overall model as well as the sub-models
    return tf.contrib.learn.ModelFnOps(
        mode, loss=loss,
        train_op=train_op,
        predictions={
            'overall_proba': yp,
            'demo_proba': p_dem,
            'protein_array_proba': p_arr
        }
    )
##### IMPORTANT: In order to be able to specify multiple input feature sets,
##### a feature engineering function needs to be specified like this
def feature_engineering_fn(X, y):
    """Split the combined feature matrix into (demographic, protein-array) tensors.

    Relies on the columns having been reordered so the demographic features come
    first; what this returns is exactly what model_fn receives as (X, y).
    """
    n_dem = len(c_dem)
    X_dem = tf.cast(X[:, :n_dem], tf.float32)
    X_arr = tf.cast(X[:, n_dem:], tf.float32)
    # Sanity check: the two slices together must account for every feature
    assert X_dem.shape.as_list()[1] + X_arr.shape.as_list()[1] == X.shape.as_list()[1]
    return (X_dem, X_arr), y
##### Model Parameters #####
model_params = {
    'learning_rate':.01,
    'alpha1': .001,
    'alpha2': .05,
    'num_hidden': 5
}
model_dir = '/tmp/tf/model4'
# Clear out any checkpoints/summaries from previous runs
!rm $model_dir/*
model_config = tf.contrib.learn.RunConfig(
    tf_random_seed=SEED,
    save_summary_steps=10,
    gpu_memory_fraction=.5
)
# feature_engineering_fn is what splits the raw matrix into the two input sets
est = tf.contrib.learn.Estimator(
    model_fn=model_fn,
    params=model_params,
    model_dir=model_dir,
    config=model_config,
    feature_engineering_fn=feature_engineering_fn
)
# SKCompat gives the estimator a scikit-learn style fit/predict interface
est = Pipeline([
    ('scale', StandardScaler()),
    ('est', SKCompat(est))
])
# Reorder the original features so the demographic columns come first, followed
# by the protein-array columns -- this makes the positional slicing done in
# feature_engineering_fn straightforward.
def reorder(X):
    """Return X with columns ordered as demographic features then array features."""
    ordered = c_dem + c_arr
    return X.iloc[:, ordered]
# Train the model
est = est.fit(
    reorder(X_train).values,
    y_train.values.astype(np.int32),
    est__max_steps=1500,
    est__batch_size=500
)
-- Check Tensorboard --
Find the way the model chose to split the vote between the two models for each feature set:
x = est.named_steps['est']
var_names = x._estimator.get_variable_names()
# Recover the learned split on the probability scale by applying a sigmoid to w_split
w = x._estimator.get_variable_value('w_split')
p_split = 1 / (1 + np.exp(-w))
print('Demographic model weight = {}\nProtein model weight = {}'.format(p_split, 1-p_split))
Check the coefficients for the demographic factors:
# Pair each learned demographic weight with its feature name
dem_weight = x._estimator.get_variable_value('dem_weight')
dem_weight = pd.Series(dem_weight[:,0], index=X_train.columns[c_dem])
dem_weight.sort_values()
For reference: file://localhost/Users/eczech/repos/portfolio/demonstrative/R/meetups/r_tutorials/tutorial_03/alzheimers_ml/alzheimers_ml.nb.html
First, we can use the "Embedding Projector" in tensorboard to learn a little about the neural network that was trained.
Here is the main page for what the purpose of this part of Tensorboard is:
https://www.tensorflow.org/get_started/embedding_viz
And here is a live visualization of the projector for word embeddings:
To use the embedding projector for our data though, we have to do a little extra work to write "metadata" for the visualization:
# Writes a metadata TSV (the input feature names for the NN model) into
# `model_dir`, plus the projector_config.pbtxt that points Tensorboard's
# embedding projector at it.
#
# You can read more on this metadata file at: https://www.tensorflow.org/get_started/embedding_viz
def write_embedding_metadata(X, idx):
    """Emit projector metadata for the features at positional indexes ``idx`` of X."""
    names = X.iloc[:, idx].columns.tolist()
    meta = pd.DataFrame({'Index': idx, 'Name': names})
    metapath = os.path.join(model_dir, 'metadata.tsv')
    meta.to_csv(metapath, index=False, sep='\t')
    config = "embeddings {{\ntensor_name: 'w0'\nmetadata_path: '{}'\n}}".format(metapath)
    with open(os.path.join(model_dir, 'projector_config.pbtxt'), 'w') as fd:
        fd.write(config)
write_embedding_metadata(X_train, c_arr)
# First-layer weight matrix: (num protein features, num hidden units)
arr_weight = x._estimator.get_variable_value('w0')
#pd.Series(arr_weight[:,0], index=X_train.columns[c_arr]).sort_values()
arr_weight.shape
Another way to look at this is to examine what happens to the data as it gets "projected" onto some space by the first layer of NN weights. This should start to show separation according to some of the more important features.
To do this, you have to get that weight matrix, multiply it by the data, and then project the results into something 2D or 3D:
def get_metadata_df(d):
    """Build per-person metadata for the embedding visualizations.

    Keeps gender/response plus a handful of raw measurements, and adds a
    median-split '<col>_level' label for each measurement.  The row index of
    ``d`` is preserved as a 'person_id' column.
    """
    meta = d.copy()
    base_cols = ['tau', 'p_tau', 'VEGF', 'age']
    level_cols = []
    for col in base_cols:
        level_col = col + '_level'
        # qcut with q=2 splits each measurement at its median
        meta[level_col] = pd.qcut(meta[col], q=2, labels=['0-50%', '50-100%'])
        level_cols.append(level_col)
    meta = meta[['gender', 'response'] + base_cols + level_cols]
    meta['person_id'] = meta.index.values
    return meta
def get_embed_df(X, w):
    """Project data matrix ``X`` through weight matrix ``w`` and return the
    result as a DataFrame with columns named 'embed:<i>'."""
    embedded = np.matmul(X, w)
    return pd.DataFrame(embedded).add_prefix('embed:')
def get_embed_meta_df(d, est, X, idx):
    """Join the hidden-layer embedding of X's protein-array columns with per-person metadata.

    NOTE(review): rows of ``X`` are assumed to already be aligned with ``idx``
    (e.g. X_train with idx_train) -- confirm at call sites.  Uses the
    notebook-global ``c_arr`` for the protein-array column positions.
    """
    # First-layer protein-array weights learned by the fitted estimator
    w = est.named_steps['est']._estimator.get_variable_value('w0')
    d_embed = get_embed_df(X.iloc[:,c_arr].values, w)
    d_meta = get_metadata_df(d).iloc[idx]
    meta_index = d_meta.index
    # Reset both indexes so concat aligns by position, then restore the metadata index
    d_embed_meta = pd.concat([d_meta.reset_index(drop=True), d_embed.reset_index(drop=True)], axis=1)
    d_embed_meta.index = meta_index.values
    assert len(d_embed_meta) == len(d_meta) == len(d_embed)
    return d_embed_meta
# Get the embedded data matrix along with some metadata for each of the data points
d_em = get_embed_meta_df(d, est, X_train, idx_train)
Now project the result above onto a 3D space:
from sklearn.decomposition import PCA
from sklearn.manifold import MDS, SpectralEmbedding, TSNE
from sklearn.preprocessing import LabelEncoder
def plot_embedding(d_em, c_color, colorscale='Jet'):
    """Render a 3-D MDS projection of the hidden-layer embedding.

    Parameters
    ----------
    d_em : DataFrame from get_embed_meta_df (embedding columns prefixed 'embed:')
    c_color : metadata column used for marker color and hover text
    colorscale : plotly colorscale name.  BUGFIX: this argument was previously
        ignored and 'Jet' was always used; it is now honored, with the default
        changed to 'Jet' so existing calls render identically.
    """
    m = d_em.filter(regex='^embed:').values
    # NOTE(review): MDS is given no random_state, so the projection is not
    # deterministic across calls -- confirm whether that matters here
    decomp = MDS(n_components=3)
    m = decomp.fit_transform(m)
    # Categorical columns must be label-encoded to yield numeric colors
    # (np.object is deprecated/removed in modern numpy; `object` is equivalent)
    if d_em[c_color].dtype == object:
        color = LabelEncoder().fit_transform(d_em[c_color])
    else:
        color = d_em[c_color]
    trace = go.Scatter3d(
        x=m[:, 0],
        y=m[:, 1],
        z=m[:, 2],
        marker=dict(color=color, colorscale=colorscale),
        mode='markers',
        text=d_em[c_color]
    )
    fig = go.Figure(
        data=[trace], layout=dict(
            autosize=True, width=800, height=500,
            margin=dict(l=0, r=0, b=0, t=0))
    )
    cf.iplot(fig)
# Project the training-set embedding to 3-D, colored by tau
plot_embedding(d_em, 'tau')
Now check to see how well this model performs compared to our baselines from before:
# Rebuild the test split (same as earlier) and score the hybrid model on it
X_test, y_test = d_enc.iloc[idx_test].drop('response', axis=1), d_enc.iloc[idx_test]['response']
predictions = est.predict(reorder(X_test).values.astype(np.float32))
y_proba = predictions['overall_proba']
y_proba_dem = predictions['demo_proba']
y_proba_arr = predictions['protein_array_proba']
# Threshold the overall probability at .5 for hard class predictions
y_pred = np.where(y_proba >= .5, 1, 0)
scores = get_scores(y_pred, y_proba)
print('Hybrid model scores on test data:')
pd.Series(scores)
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)
Lastly, look at the predictions from the submodels and see how they relate to one another:
# First, combine the probability predictions from the sub-models and overall model into a data
# frame that also contains the true label
d_pred = pd.DataFrame({
    'pred_all': y_proba,
    'pred_dem': y_proba_dem[:,0],
    'pred_arr': y_proba_arr[:,0],
    'true': y_test.values
}, index=['Person {}'.format(v) for v in y_test.index])
d_pred.head()
# Now visualize: one scatter trace per true label, demographic prediction on x,
# protein-array prediction on y, marker size scaled by the overall prediction
traces = []
for k, g in d_pred.groupby('true'):
    true_label = 'Impaired' if k > 0 else 'Not Impaired'
    text = g.apply(
        lambda r: 'Overall Prediction = {}<br>True Label = {}<br>{}'\
            .format(round(float(r['pred_all']), 3), true_label, r.name), axis=1
    )
    traces.append(go.Scatter(
        x=g['pred_dem'],
        y=g['pred_arr'],
        mode='markers',
        marker=dict(size=(10 + 10*g['pred_all'].values).astype(np.int64)),
        name='Impaired' if k > 0 else 'Not Impaired',
        text=text
    ))
layout = dict(
    title='Probability Predictions from Sub-Models on Test Data',
    xaxis=dict(title='Prediction from Demographic Factors'),
    yaxis=dict(title='Prediction from Protein Array Values'),
    hovermode='closest'
)
fig = go.Figure(data=traces, layout=layout)
cf.iplot(fig)
y_test.value_counts()
dem_weight
# Inspect a single person's data to understand their predictions
person_id = 190
print('Demographic factors for person {}:'.format(person_id))
print(X_test.loc[person_id][X_train.columns[c_dem]])
X_train['age'].iplot(bins=30, title='Age Distribution (Training Data)', kind='histogram')
X_test.loc[person_id][X_train.columns[c_arr]][['tau', 'VEGF']]
X_test[['tau', 'VEGF']].describe()
In the end, this model attributed roughly 20% of the predictive weight to demographic factors and roughly 80% to the protein features.
This was with just one split of the (small) dataset though and would likely change quite a bit for different parameters on the model or selections of train vs test data.
What would be much better then would be credible intervals or a distribution of some kind around that split percentage, like this:
# Illustrative only: a logit-normal distribution over the demographic split weight
p = np.random.normal(size=10000, loc=-1, scale=.5)
p = 1 / (1 + np.exp(-p))
pd.Series(p).iplot(kind='histogram', bins=100, title='Distribution for Demographic Factor Importance Percent')
The best way to do this with Tensorflow: Edward