Example neural networks used to model Alzheimer patient data with comparisons to more typical machine learning models.
%run -m ipy_startup
%matplotlib inline
# Plotly Initialization
import plotly as plty
import plotly.graph_objs as go
import cufflinks as cf
cf.set_config_file(offline=True, theme='white', offline_link_text=None, offline_show_link=False)
import tensorflow as tf
# Global random seed used for reproducibility throughout the notebook
SEED = 1
# Load in the Alzheimer's dataset
# NOTE(review): `pd`/`np`/`os` appear to be injected by the ipy_startup script above -- confirm
d = pd.read_csv('~/Downloads/Alzheimers.csv')
# Drop this unnecessary column (presumably redundant with 'gender' -- confirm against the raw data)
d = d.drop('male', axis=1)
# Normalize gender values to a single uppercase letter (first character, e.g. 'M'/'F')
d['gender'] = d['gender'].str.upper().str[0]
d['gender'].value_counts()
d.head()
def encode(df):
    """Return a copy of ``df`` with gender/genotype one-hot encoded and the
    response binarized (Impaired -> 1, NotImpaired -> 0).

    The input frame is not modified.
    """
    out = df.copy()
    # One-hot encode each categorical column in turn; dummy columns are
    # appended in the same order as the original back-to-back calls produced
    for source_col, prefix in (('gender', 'gender'), ('Genotype', 'genotype')):
        out = pd.get_dummies(out, prefix=prefix, prefix_sep=':', columns=[source_col])
    out['response'] = out['response'].map({'NotImpaired': 0, 'Impaired': 1})
    return out
def get_index_split(df, n_train=266, seed=None):
    """Randomly split the row positions of ``df`` into train/test index arrays.

    Parameters
    ----------
    df : sized object (DataFrame) whose rows are being split
    n_train : number of rows sampled (without replacement) for training
    seed : RNG seed; defaults to the notebook-global ``SEED`` so existing
        ``get_index_split(d)`` call sites behave exactly as before

    Returns
    -------
    (idx_train, idx_test) : disjoint integer arrays covering range(len(df))
    """
    idx = np.arange(len(df))
    # Use a local RandomState instead of np.random.seed so we don't clobber the
    # process-wide RNG state; RandomState(s).choice yields the same sequence
    # as np.random.seed(s); np.random.choice(...), preserving prior behavior.
    rng = np.random.RandomState(SEED if seed is None else seed)
    idx_train = rng.choice(idx, size=n_train, replace=False)
    idx_test = np.setdiff1d(idx, idx_train)
    return idx_train, idx_test
# Get two sets of indexes we can use to split data for training and validation
idx_train, idx_test = get_index_split(d)
d_enc = encode(d)
# Now we have only numeric features
pd.set_option('display.max_info_columns', 1000)
d_enc.info()
d_enc.head()
d_enc.dtypes.value_counts()
d_enc.values.dtype
Split data into responses and features:
# Split training and test data into features and response
# (the same idx_train/idx_test positions are used for X and y, keeping rows aligned)
X_train, y_train = d_enc.iloc[idx_train].drop('response', axis=1), d_enc.iloc[idx_train]['response']
X_test, y_test = d_enc.iloc[idx_test].drop('response', axis=1), d_enc.iloc[idx_test]['response']
Define the models we'll compare some Tensorflow networks to later:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, log_loss, average_precision_score
# Define several common learning models
# Scale-sensitive models (LR, SVM, MLP) are wrapped in a StandardScaler pipeline;
# 'Baseline' is a majority-class dummy for reference
classifiers = {
    'Gradient Boosted Trees': GradientBoostingClassifier(random_state=SEED),
    'XGBoost': XGBClassifier(seed=SEED),
    'Random Forest': RandomForestClassifier(random_state=SEED),
    'Logistic Regression': Pipeline([
        ('scale', StandardScaler()),
        ('clf', LogisticRegression())
    ]),
    'SVM': Pipeline([
        ('scale', StandardScaler()),
        # Grid search over the regularization strength C on a log scale
        ('clf', GridSearchCV(SVC(probability=True, kernel='linear', random_state=SEED), {'C': np.logspace(-6, 6, 15)}))
    ]),
    'Neural Network': Pipeline([
        ('scale', StandardScaler()),
        ('clf', MLPClassifier(random_state=SEED))
    ]),
    'Baseline': DummyClassifier(strategy='most_frequent')
}
Now define some functions that will determine learning model performance:
def get_scores(y_pred, y_proba, y_true=None):
    """Score binary predictions with accuracy, log-loss and precision-recall AUC.

    Parameters
    ----------
    y_pred : hard 0/1 class predictions
    y_proba : predicted probabilities of the positive class
    y_true : true labels; defaults to the notebook-global ``y_test`` so the
        original ``get_scores(y_pred, y_proba)`` call sites keep working

    Returns
    -------
    dict with keys 'accuracy', 'logloss', 'prauc'
    """
    if y_true is None:
        y_true = y_test
    # Clip probabilities away from exactly 0/1 so log_loss cannot blow up to inf
    eps = 1e-6
    y_proba = y_proba.clip(eps, 1 - eps)
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'logloss': log_loss(y_true, y_proba),
        'prauc': average_precision_score(y_true, y_proba)
    }
def get_classifier_scores(clf_name):
    """Fit the named model from ``classifiers`` on the training split, score it
    on the test split, and return the score dict tagged with name and model."""
    model = classifiers[clf_name]
    model.fit(X_train, y_train)
    # Probability of the positive (Impaired) class
    proba = model.predict_proba(X_test)[:, 1]
    preds = model.predict(X_test)
    result = get_scores(preds, proba)
    result.update(name=clf_name, model=model)
    return result
Finally, score all the different models and visualize their results:
# Fit and score every classifier defined above, indexed by model name
scores = pd.DataFrame([get_classifier_scores(clf_name) for clf_name in classifiers]).set_index('name')
scores
scores['accuracy'].sort_values().iplot(kind='scatter', title='Accuracy')
scores['logloss'].sort_values(ascending=False).iplot(kind='scatter', title='LogLoss')
scores['prauc'].sort_values().iplot(kind='scatter', title='Precision-Recall AUC')
Take a look at the logistic regression coefficients:
# Pull the fitted LogisticRegression out of its pipeline and pair each coefficient with its feature name
coefs = pd.Series(scores.loc['Logistic Regression']['model'].named_steps['clf'].coef_[0], index=X_train.columns)
coefs.sort_values().iplot()
# For reference, take note of the frequency of the responses in the test and training data:
pd.concat([
    y_test.value_counts(normalize=True).rename('test_pct'),
    y_test.value_counts(normalize=False).rename('test_counts'),
    y_train.value_counts(normalize=True).rename('train_pct'),
    y_train.value_counts(normalize=False).rename('train_counts')
], axis=1)
We're going to build one of these here:
from IPython.display import Image
Image(url='media/network_architectures/Slide8.png', width=600)
import tensorflow as tf
from ml.tensorflow.utilities import tf_print
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from tensorflow.contrib.learn.python.learn.estimators.estimator import SKCompat
from tensorflow.python.ops import logging_ops
# Tensorflow logging levels can be set like this:
# tf.logging.set_verbosity(tf.logging.WARN)
def model_fn(X, y, mode, params):
    """Single-hidden-layer MLP binary classifier for tf.contrib.learn.Estimator.

    Parameters
    ----------
    X : 2-D float tensor (n, p) -- scaled feature matrix
    y : 1-D tensor of binary labels
    mode : one of tf.contrib.learn.ModeKeys
    params : dict with keys 'num_hidden', 'alpha' (L2 strength), 'learning_rate'
    """
    tf.set_random_seed(SEED)
    np.random.seed(SEED)
    # Number of hidden neurons
    h = params['num_hidden']
    # Number of datapoints and features
    n, p = X.shape.as_list()
    # It's worth noting that you can interpret the "mode" by matching it to one of tf.contrib.learn.ModeKeys
    # Define the first level weights and bias
    b0 = tf.Variable(tf.random_normal([h], seed=SEED), name='b0')
    w0 = tf.Variable(tf.random_normal([p, h], seed=SEED), name='w0')
    # Define second level weights and bias
    b1 = tf.Variable(tf.random_normal([1], seed=SEED), name='b1')
    w1 = tf.Variable(tf.random_normal([h, 1], seed=SEED), name='w1')
    # Get activations coming out of first layer (n, h)
    z1 = tf.nn.relu(b0 + tf.matmul(X, w0))
    # Get output value (n, 1)
    z2 = b1 + tf.matmul(z1, w1)
    tf.summary.histogram("z2", z2)
    # NOTE(review): this manual sigmoid can overflow for large |z2| -- the
    # "Flow" notes later in this file show this is intentional for the demo;
    # tf.nn.sigmoid (commented below) is the numerically stable form
    yp = 1 / (1 + tf.exp(-z2))
    #yp = tf.nn.sigmoid(z2)
    # yp is now 2-D tensor of shape n x 1 [technically (?, 1)], but since we don't need the column
    # dimension of 1 and the "y" tensor is just 1-D, we'll remove the column dimension to make
    # yp a 1-D tensor using the `squeeze` function
    yp = tf.squeeze(yp, [1])
    tf.summary.histogram("yp", yp)
    #yp = tf_print(yp, lambda x: [x.min(), x.max()])
    # Define and apply a regularizer to our weights
    reg = tf.contrib.layers.l2_regularizer(params['alpha'])
    reg_loss = tf.identity(reg(w0) + reg(w1), name='regloss')
    # Compute log loss given 1D true labels and predictions
    log_loss = tf.identity(tf.losses.log_loss(y, yp), name='logloss')
    # Add regularization loss to log loss to create the overall loss we want to minimize
    loss = tf.identity(log_loss + reg_loss, name='loss')
    # Define optimization algorithm and parameters
    train_op = tf.contrib.layers.optimize_loss(
        loss=loss,
        global_step=tf.contrib.framework.get_global_step(),
        learning_rate=params['learning_rate'],
        optimizer="Adam"
    )
    #hook = tf.train.LoggingTensorHook([loss, p], every_n_iter=1)
    return tf.contrib.learn.ModelFnOps(
        mode, loss=loss, predictions={'predictions': yp},
        train_op=train_op,
        #training_hooks=[hook]
    )
# Parameters we've used in our model (we can define and use whatever we want here)
model_params = {
    'learning_rate':.1,
    'alpha': .01,
    'num_hidden': 100
}
# This is the directory in which tensorflow will store info about the model
model_dir = '/tmp/tf/model3'
# Clear out any checkpoints/summaries from previous runs so training starts fresh
!rm $model_dir/*
# These configurations are almost always key, they define how often summary data is
# stored as well as how many cores are used and how much GPU memory is used
# (on my mac, display wigs out if you run at 100% GPU memory usage)
model_config = tf.contrib.learn.RunConfig(
    num_cores=1,
    tf_random_seed=SEED,
    save_summary_steps=10,
    gpu_memory_fraction=.5
)
# Define the actual estimator using all of the above and add it to a pipeline like other scikit-learn models
est = tf.contrib.learn.Estimator(
    model_fn=model_fn,
    params=model_params,
    model_dir=model_dir,
    config=model_config
)
# SKCompat wraps the estimator with a scikit-learn style fit/predict interface
est = Pipeline([
    ('scale', StandardScaler()),
    ('est', SKCompat(est))
])
# Train the network
est = est.fit(X_train.values, y_train.values, est__max_steps=500, est__batch_size=500)
#est = est.fit(X_train.values.astype(np.float32), y_train.astype(np.int32), est__max_steps=500, est__batch_size=500)
# Flow:
# 1. show float32 vs float64 (point out logging levels and leave at info for now)
# 2. show NaN training error
# 3. show Tensorboard w/ nothing
# 4. show logging hook
# 5. show print hook w/ transformation (show p=0)
# 6. show fixing with clip_by_value on z2 (now printed p's are in reasonable range)
# 7. remove print p, change to histogram, show tensorboard
# 8. increase save_summary_steps
# - gpu_memory_fraction
# - tf_random_seed
# Reset for the above flow (use after making changes to restart)
# import tensorflow as tf
# from ml.tensorflow.utilities import tf_print
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import StandardScaler
# from tensorflow.contrib.learn.python.learn.estimators.estimator import SKCompat
# from tensorflow.python.ops import logging_ops
# # Tensorflow logging levels can be set like this:
# # tf.logging.set_verbosity(tf.logging.WARN)
# def model_fn(X, y, mode, params):
# tf.set_random_seed(SEED)
# np.random.seed(SEED)
# # Number of hidden neurons
# h = params['num_hidden']
# # Number of datapoints and features
# n, p = X.shape.as_list()
# # It's worth noting that you can interpret the "mode" by matching it to one of tf.contrib.learn.ModeKeys
# # Define the first level weights and bias
# b0 = tf.Variable(tf.random_normal([h], seed=SEED), name='b0')
# w0 = tf.Variable(tf.random_normal([p, h], seed=SEED), name='w0')
# # Define second level weights and bias
# b1 = tf.Variable(tf.random_normal([1], seed=SEED), name='b1')
# w1 = tf.Variable(tf.random_normal([h, 1], seed=SEED), name='w1')
# # Get activations coming out of first layer (n, h)
# z1 = tf.nn.relu(b0 + tf.matmul(X, w0))
# # Get output value (n, 1)
# z2 = b1 + tf.matmul(z1, w1)
# #z2 = tf_print(z2, transform=lambda x: [x.min(), x.max()])
# #z2 = tf.clip_by_value(z2, -30, 30)
# tf.summary.histogram("z2", z2)
# yp = 1 / (1 + tf.exp(-z2))
# #yp = tf.nn.sigmoid(z2)
# # yp is now 2-D tensor of shape n x 1 [technically (?, 1)], but since we don't need the column
# # dimension of 1 and the "y" tensor is just 1-D, we'll remove the column dimension to make
# # yp a 1-D tensor using the `squeeze` function
# yp = tf.squeeze(yp, [1])
# tf.summary.histogram("yp", yp)
# #yp = tf_print(yp, transform=lambda x: 'yp = {}'.format([x.min(), x.max()]))
# # Define and apply a regularizer to our weights
# reg = tf.contrib.layers.l2_regularizer(params['alpha'])
# reg_loss = tf.identity(reg(w0) + reg(w1), name='regloss')
# # Compute log loss given 1D true labels and predictions
# log_loss = tf.identity(tf.losses.log_loss(y, yp), name='logloss')
# # Add regularization loss to log loss to create the overall loss we want to minimize
# loss = tf.identity(log_loss + reg_loss, name='loss')
# # Define optimization algorithm and parameters
# train_op = tf.contrib.layers.optimize_loss(
# loss=loss,
# global_step=tf.contrib.framework.get_global_step(),
# learning_rate=params['learning_rate'],
# optimizer="Adam"
# )
# #hook = tf.train.LoggingTensorHook([loss, p], every_n_iter=1)
# return tf.contrib.learn.ModelFnOps(
# mode, loss=loss, predictions={'predictions': yp},
# train_op=train_op,
# #training_hooks=[hook]
# )
# # Parameters we've used in our model (we can define and use whatever we want here)
# model_params = {
# 'learning_rate':.1,
# 'alpha': .01,
# 'num_hidden': 25
# }
# # This is the directory in which tensorflow will store info about the model
# model_dir = '/tmp/tf/model3'
# !rm $model_dir/*
# # These configurations are almost always key, they define how often summary data is
# # stored as well as how many cores are used and how much GPU memory is used
# # (on my mac, display wigs out if you run at 100% GPU memory usage)
# model_config = tf.contrib.learn.RunConfig(
# num_cores=1,
# tf_random_seed=SEED,
# save_summary_steps=10,
# gpu_memory_fraction=.5
# )
# # Define the actual estimator using all of the above and add it to a pipeline like other scikit-learn models
# est = tf.contrib.learn.Estimator(
# model_fn=model_fn,
# params=model_params,
# model_dir=model_dir,
# config=model_config
# )
# est = Pipeline([
# ('scale', StandardScaler()),
# ('est', SKCompat(est))
# ])
# # Train the network
# est = est.fit(X_train.values, y_train.values, est__max_steps=500, est__batch_size=500)
# #est = est.fit(X_train.values.astype(np.float32), y_train.astype(np.int32), est__max_steps=500, est__batch_size=500)
# # Flow:
# # 1. show float32 vs float64 (point out logging levels and leave at info for now)
# # 2. show NaN training error
# # 3. show Tensorboard w/ nothing
# # 4. show logging hook
# # 5. show print hook w/ transformation (show p=0)
# # 6. show fixing with clip_by_value on z2 (now printed p's are in reasonable range)
# # 7. remove print p, change to histogram, show tensorboard
# # 8. increase save_summary_steps
# # - gpu_memory_fraction
# # - tf_random_seed
Check Tensorboard for some training details ...
The TF estimator can be referenced like this:
est.named_steps['est']._estimator
# Rebuild the test split and compute predicted probabilities from the trained network
X_test, y_test = d_enc.iloc[idx_test].drop('response', axis=1), d_enc.iloc[idx_test]['response']
y_proba = est.predict(X_test.values.astype(np.float32))['predictions']
# Hard class predictions from thresholding the probabilities at .5
y_pred = np.where(y_proba >= .5, 1, 0)
scores = get_scores(y_pred, y_proba)
scores
At an accuracy of ~80%, this is right on par with the sklearn MLP model
Now let's try building a model that combines the advantages of neural networks and linear approaches.
We want to build this:
from IPython.display import Image
Image(url='media/network_architectures/Slide9.png', width=800)
First, we'll have to be able to split up the features into the different input sets:
X_train.filter(regex='^gender|^genotype|^age').info()
# Generate a list of column indexes corresponding to features in each set
# - c_dem will be the indexes of the demographic features
# - c_arr for the protein array features
c_all = X_train.columns.tolist()
c_dem = X_train.filter(regex='^gender|^genotype|^age').columns
c_arr = X_train.columns.difference(c_dem)
# Convert column labels to positional indexes (used later when slicing tensors)
c_dem = [c_all.index(c) for c in c_dem.tolist()]
c_arr = [c_all.index(c) for c in c_arr.tolist()]
X_train.iloc[:,c_dem].info()
X_train.shape
import tensorflow as tf
from ml.tensorflow.utilities import tf_print
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from tensorflow.contrib.learn.python.learn.estimators.estimator import SKCompat
from tensorflow.python.ops import logging_ops
SEED = 1
def model_fn(X, y, mode, params):
    """Hybrid linear (demographic) + MLP (protein array) binary classifier.

    Parameters
    ----------
    X : 2-tuple (X_dem, X_arr) of float tensors, as produced by
        ``feature_engineering_fn``
    y : 1-D tensor of binary labels
    mode : one of tf.contrib.learn.ModeKeys
    params : dict with 'num_hidden', 'alpha1', 'alpha2', 'learning_rate'

    Each sub-model emits a probability; a learned scalar ``w_split`` (passed
    through a sigmoid) mixes the two into the overall prediction.
    """
    X_dem, X_arr = X
    # Number of demographic and protein array features.  Named n_* to avoid the
    # earlier shadowing where these ints were later rebound to the p_dem/p_arr
    # probability tensors.
    n_dem = X_dem.shape.as_list()[1]
    n_arr = X_arr.shape.as_list()[1]
    ##### Demographic Sub-Model ######
    # Simple linear model on the logit scale; clipped to avoid sigmoid saturation.
    # seed=SEED added for consistency with the other variable initializers below.
    c0 = tf.Variable(tf.random_normal([1], seed=SEED), name='dem_bias')
    c1 = tf.Variable(tf.random_normal([n_dem, 1], seed=SEED), name='dem_weight')
    o_dem = tf.clip_by_value(tf.matmul(X_dem, c1) + c0, -30, 30, name='o_dem')
    ##### Protein Sub-Model ######
    # Number of hidden neurons in protein array model
    h = params['num_hidden']
    # Standard one-layer NN specification
    b0 = tf.Variable(tf.random_normal([h], seed=SEED), name='b0')
    w0 = tf.Variable(tf.random_normal([n_arr, h], seed=SEED), name='w0')
    b1 = tf.Variable(tf.random_normal([1], seed=SEED), name='b1')
    w1 = tf.Variable(tf.random_normal([h, 1], seed=SEED), name='w1')
    z1 = tf.nn.sigmoid(b0 + tf.matmul(X_arr, w0), name='z1')
    o_arr = tf.clip_by_value(b1 + tf.matmul(z1, w1), -30, 30, name='o_arr')
    ##### Sub-Model Voting #####
    # Pass the outputs on the logit scale from the two models through a sigmoid to get probabilities
    p_dem = tf.nn.sigmoid(o_dem, name='p_dem')
    p_arr = tf.nn.sigmoid(o_arr, name='p_arr')
    # Create a "split" variable that will determine how much weight goes to each submodel's prediction
    w_split = tf.Variable(0., name='w_split')
    p_split = tf.nn.sigmoid(w_split)
    # Set the overall prediction as the split weight times each of the sub-model predictions
    yp = p_split * p_dem + (1. - p_split) * p_arr
    yp = tf.squeeze(yp, [1])
    ##### Summarization #####
    tf.summary.histogram("w0", w0)
    # BUGFIX: the "c1" summary previously logged p_arr and the "p_arr" summary
    # previously logged p_dem; both now log the tensors their tags name.
    tf.summary.histogram("c1", c1)
    tf.summary.histogram("yp", yp)
    tf.summary.histogram("p_dem", p_dem)
    tf.summary.histogram("p_arr", p_arr)
    tf.summary.scalar("w_split", w_split)
    tf.summary.scalar("p_split", p_split)
    ##### Regularization #####
    # L2 penalties: alpha1 on sub-model weights, alpha2 on the voting weight
    reg1 = tf.contrib.layers.l2_regularizer(params['alpha1'])
    reg2 = tf.contrib.layers.l2_regularizer(params['alpha2'])
    reg_loss = tf.identity(reg1(w0) + reg1(w1) + reg1(c1) + reg2(w_split), name='regloss')
    ##### Loss #####
    log_loss = tf.identity(tf.losses.log_loss(y, yp), name='logloss')
    ##### Optimization #####
    loss = tf.identity(log_loss + reg_loss, name='loss')
    train_op = tf.contrib.layers.optimize_loss(
        loss=loss,
        global_step=tf.contrib.framework.get_global_step(),
        learning_rate=params['learning_rate'],
        optimizer="Adam"
    )
    # Return predictions for the overall model as well as the sub-models
    return tf.contrib.learn.ModelFnOps(
        mode, loss=loss,
        train_op=train_op,
        predictions={
            'overall_proba': yp,
            'demo_proba': p_dem,
            'protein_array_proba': p_arr
        }
    )
##### IMPORTANT: In order to be able to specify multiple input feature sets,
##### a feature engineering function needs to be specified like this
def feature_engineering_fn(X, y):
    """Split the combined feature matrix into (demographic, protein-array) tensors.

    Relies on the columns having been reordered so the demographic features come
    first; what this returns is exactly what model_fn receives as (X, y).
    """
    n_dem = len(c_dem)
    X_dem = tf.cast(X[:, :n_dem], tf.float32)
    X_arr = tf.cast(X[:, n_dem:], tf.float32)
    # Sanity check: the two slices together must account for every feature
    assert X_dem.shape.as_list()[1] + X_arr.shape.as_list()[1] == X.shape.as_list()[1]
    return (X_dem, X_arr), y
##### Model Parameters #####
model_params = {
    'learning_rate':.01,
    'alpha1': .001,
    'alpha2': .05,
    'num_hidden': 5
}
model_dir = '/tmp/tf/model4'
# Clear out any checkpoints/summaries from previous runs
!rm $model_dir/*
model_config = tf.contrib.learn.RunConfig(
    tf_random_seed=SEED,
    save_summary_steps=10,
    gpu_memory_fraction=.5
)
# feature_engineering_fn is what splits the raw matrix into the two input sets
est = tf.contrib.learn.Estimator(
    model_fn=model_fn,
    params=model_params,
    model_dir=model_dir,
    config=model_config,
    feature_engineering_fn=feature_engineering_fn
)
# SKCompat gives the estimator a scikit-learn style fit/predict interface
est = Pipeline([
    ('scale', StandardScaler()),
    ('est', SKCompat(est))
])
# Reorder the original features so the demographic columns come first, followed
# by the protein-array columns -- this makes the positional slicing done in
# feature_engineering_fn straightforward.
def reorder(X):
    """Return X with columns ordered as demographic features then array features."""
    ordered = c_dem + c_arr
    return X.iloc[:, ordered]
# Train the model
est = est.fit(
    reorder(X_train).values,
    y_train.values.astype(np.int32),
    est__max_steps=1500,
    est__batch_size=500
)
-- Check Tensorboard --
Find the way the model chose to split the vote between the two models for each feature set:
x = est.named_steps['est']
var_names = x._estimator.get_variable_names()
# Recover the learned split on the probability scale by applying a sigmoid to w_split
w = x._estimator.get_variable_value('w_split')
p_split = 1 / (1 + np.exp(-w))
print('Demographic model weight = {}\nProtein model weight = {}'.format(p_split, 1-p_split))
Check the coefficients for the demographic factors:
# Pair each learned demographic weight with its feature name
dem_weight = x._estimator.get_variable_value('dem_weight')
dem_weight = pd.Series(dem_weight[:,0], index=X_train.columns[c_dem])
dem_weight.sort_values()
For reference: file://localhost/Users/eczech/repos/portfolio/demonstrative/R/meetups/r_tutorials/tutorial_03/alzheimers_ml/alzheimers_ml.nb.html
First, we can use the "Embedding Projector" in tensorboard to learn a little about the neural network that was trained.
Here is the main page for what the purpose of this part of Tensorboard is:
https://www.tensorflow.org/get_started/embedding_viz
And here is a live visualization of the projector for word embeddings:
To use the embedding projector for our data though, we have to do a little extra work to write "metadata" for the visualization:
# Writes a metadata TSV (the input feature names for the NN model) into
# `model_dir`, plus the projector_config.pbtxt that points Tensorboard's
# embedding projector at it.
#
# You can read more on this metadata file at: https://www.tensorflow.org/get_started/embedding_viz
def write_embedding_metadata(X, idx):
    """Emit projector metadata for the features at positional indexes ``idx`` of X."""
    names = X.iloc[:, idx].columns.tolist()
    meta = pd.DataFrame({'Index': idx, 'Name': names})
    metapath = os.path.join(model_dir, 'metadata.tsv')
    meta.to_csv(metapath, index=False, sep='\t')
    config = "embeddings {{\ntensor_name: 'w0'\nmetadata_path: '{}'\n}}".format(metapath)
    with open(os.path.join(model_dir, 'projector_config.pbtxt'), 'w') as fd:
        fd.write(config)
write_embedding_metadata(X_train, c_arr)
# First-layer weight matrix: (num protein features, num hidden units)
arr_weight = x._estimator.get_variable_value('w0')
#pd.Series(arr_weight[:,0], index=X_train.columns[c_arr]).sort_values()
arr_weight.shape
Another way to look at this is to examine what happens to the data as it gets "projected" onto some space by the first layer of NN weights. This should start to show separation according to some of the more important features.
To do this, you have to get that weight matrix, multiply it by the data, and then project the results into something 2D or 3D:
def get_metadata_df(d):
    """Build per-person metadata for the embedding visualizations.

    Keeps gender/response plus a handful of raw measurements, and adds a
    median-split '<col>_level' label for each measurement.  The row index of
    ``d`` is preserved as a 'person_id' column.
    """
    meta = d.copy()
    base_cols = ['tau', 'p_tau', 'VEGF', 'age']
    level_cols = []
    for col in base_cols:
        level_col = col + '_level'
        # qcut with q=2 splits each measurement at its median
        meta[level_col] = pd.qcut(meta[col], q=2, labels=['0-50%', '50-100%'])
        level_cols.append(level_col)
    meta = meta[['gender', 'response'] + base_cols + level_cols]
    meta['person_id'] = meta.index.values
    return meta
def get_embed_df(X, w):
    """Project data matrix ``X`` through weight matrix ``w`` and return the
    result as a DataFrame with columns named 'embed:<i>'."""
    embedded = np.matmul(X, w)
    return pd.DataFrame(embedded).add_prefix('embed:')
def get_embed_meta_df(d, est, X, idx):
    """Join the hidden-layer embedding of X's protein-array columns with per-person metadata.

    NOTE(review): rows of ``X`` are assumed to already be aligned with ``idx``
    (e.g. X_train with idx_train) -- confirm at call sites.  Uses the
    notebook-global ``c_arr`` for the protein-array column positions.
    """
    # First-layer protein-array weights learned by the fitted estimator
    w = est.named_steps['est']._estimator.get_variable_value('w0')
    d_embed = get_embed_df(X.iloc[:,c_arr].values, w)
    d_meta = get_metadata_df(d).iloc[idx]
    meta_index = d_meta.index
    # Reset both indexes so concat aligns by position, then restore the metadata index
    d_embed_meta = pd.concat([d_meta.reset_index(drop=True), d_embed.reset_index(drop=True)], axis=1)
    d_embed_meta.index = meta_index.values
    assert len(d_embed_meta) == len(d_meta) == len(d_embed)
    return d_embed_meta
# Get the embedded data matrix along with some metadata for each of the data points
d_em = get_embed_meta_df(d, est, X_train, idx_train)
Now project the result above onto a 3D space:
from sklearn.decomposition import PCA
from sklearn.manifold import MDS, SpectralEmbedding, TSNE
from sklearn.preprocessing import LabelEncoder
def plot_embedding(d_em, c_color, colorscale='Jet'):
    """Render a 3-D MDS projection of the hidden-layer embedding.

    Parameters
    ----------
    d_em : DataFrame from get_embed_meta_df (embedding columns prefixed 'embed:')
    c_color : metadata column used for marker color and hover text
    colorscale : plotly colorscale name.  BUGFIX: this argument was previously
        ignored and 'Jet' was always used; it is now honored, with the default
        changed to 'Jet' so existing calls render identically.
    """
    m = d_em.filter(regex='^embed:').values
    # NOTE(review): MDS is given no random_state, so the projection is not
    # deterministic across calls -- confirm whether that matters here
    decomp = MDS(n_components=3)
    m = decomp.fit_transform(m)
    # Categorical columns must be label-encoded to yield numeric colors
    # (np.object is deprecated/removed in modern numpy; `object` is equivalent)
    if d_em[c_color].dtype == object:
        color = LabelEncoder().fit_transform(d_em[c_color])
    else:
        color = d_em[c_color]
    trace = go.Scatter3d(
        x=m[:, 0],
        y=m[:, 1],
        z=m[:, 2],
        marker=dict(color=color, colorscale=colorscale),
        mode='markers',
        text=d_em[c_color]
    )
    fig = go.Figure(
        data=[trace], layout=dict(
            autosize=True, width=800, height=500,
            margin=dict(l=0, r=0, b=0, t=0))
    )
    cf.iplot(fig)
# Project the training-set embedding to 3-D, colored by tau
plot_embedding(d_em, 'tau')
Now check to see how well this model performs compared to our baselines from before:
# Rebuild the test split (same as earlier) and score the hybrid model on it
X_test, y_test = d_enc.iloc[idx_test].drop('response', axis=1), d_enc.iloc[idx_test]['response']
predictions = est.predict(reorder(X_test).values.astype(np.float32))
y_proba = predictions['overall_proba']
y_proba_dem = predictions['demo_proba']
y_proba_arr = predictions['protein_array_proba']
# Threshold the overall probability at .5 for hard class predictions
y_pred = np.where(y_proba >= .5, 1, 0)
scores = get_scores(y_pred, y_proba)
print('Hybrid model scores on test data:')
pd.Series(scores)
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)
Lastly, look at the predictions from the submodels and see how they relate to one another:
# First, combine the probability predictions from the sub-models and overall model into a data
# frame that also contains the true label
d_pred = pd.DataFrame({
    'pred_all': y_proba,
    'pred_dem': y_proba_dem[:,0],
    'pred_arr': y_proba_arr[:,0],
    'true': y_test.values
}, index=['Person {}'.format(v) for v in y_test.index])
d_pred.head()
# Now visualize: one scatter trace per true label, demographic prediction on x,
# protein-array prediction on y, marker size scaled by the overall prediction
traces = []
for k, g in d_pred.groupby('true'):
    true_label = 'Impaired' if k > 0 else 'Not Impaired'
    text = g.apply(
        lambda r: 'Overall Prediction = {}<br>True Label = {}<br>{}'\
            .format(round(float(r['pred_all']), 3), true_label, r.name), axis=1
    )
    traces.append(go.Scatter(
        x=g['pred_dem'],
        y=g['pred_arr'],
        mode='markers',
        marker=dict(size=(10 + 10*g['pred_all'].values).astype(np.int64)),
        name='Impaired' if k > 0 else 'Not Impaired',
        text=text
    ))
layout = dict(
    title='Probability Predictions from Sub-Models on Test Data',
    xaxis=dict(title='Prediction from Demographic Factors'),
    yaxis=dict(title='Prediction from Protein Array Values'),
    hovermode='closest'
)
fig = go.Figure(data=traces, layout=layout)
cf.iplot(fig)
y_test.value_counts()
dem_weight
# Inspect a single person's data to understand their predictions
person_id = 190
print('Demographic factors for person {}:'.format(person_id))
print(X_test.loc[person_id][X_train.columns[c_dem]])
X_train['age'].iplot(bins=30, title='Age Distribution (Training Data)', kind='histogram')
X_test.loc[person_id][X_train.columns[c_arr]][['tau', 'VEGF']]
X_test[['tau', 'VEGF']].describe()
In the end, this model attributed roughly 20% of the predictive weight to demographic factors and roughly 80% to the protein features.
This was with just one split of the (small) dataset though and would likely change quite a bit for different parameters on the model or selections of train vs test data.
What would be much better then would be credible intervals or a distribution of some kind around that split percentage, like this:
# Illustrative only: a logit-normal distribution over the demographic split weight
p = np.random.normal(size=10000, loc=-1, scale=.5)
p = 1 / (1 + np.exp(-p))
pd.Series(p).iplot(kind='histogram', bins=100, title='Distribution for Demographic Factor Importance Percent')
The best way to do this with Tensorflow: Edward