import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline
diabetes = pd.read_csv("files/pima-indians-diabetes.csv")
diabetes.head()
diabetes.columns
cols_to_normalize = ['Number_pregnant', 'Glucose_concentration', 'Blood_pressure', 'Triceps',
'Insulin', 'BMI', 'Pedigree']
# axis=0 (default) applies the function to each column; axis=1 applies it to each row
diabetes[cols_to_normalize] = diabetes[cols_to_normalize].apply(lambda x: (x-x.min())/(x.max()-x.min()),axis=0)
diabetes.head()
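The same column-wise (x - min) / (max - min) scaling can also be done with scikit-learn; a minimal cross-check sketch, assuming the raw CSV is re-read so the columns are still unscaled:
from sklearn.preprocessing import MinMaxScaler
# MinMaxScaler performs the same (x - min)/(max - min) scaling column-wise
raw = pd.read_csv("files/pima-indians-diabetes.csv")
raw[cols_to_normalize] = MinMaxScaler().fit_transform(raw[cols_to_normalize])
raw.head()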
Age should be encoded as a categorical feature so the model does not treat its raw magnitude as meaningful and bias towards larger values. However, one-hot encoding every distinct age would create far too many columns, so we first bin the ages into ranges.
Bin the age
diabetes["Age"].hist(bins=20)
cols = diabetes.columns
arr = diabetes.values
age = arr[:,7]
age_categories = [10,20,25,30,35,40,45,50,60,70,80,90,100]
bins = [(0,10),(11,20),(21,25),(26,30),(31,35),(36,40),(41,45),(46,50),(51,60),(61,70),(71,80),(81,90),(91,100)]
for b in bins:
    # Python's 'and' does not work element-wise on arrays; use '&' with parenthesised comparisons inside np.where
    age = np.where((age>=b[0])&(age<=b[1]),b[1],age)
age[:20]
arr[:,7] = age
pd.DataFrame(arr,columns=cols).head()
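pandas can do the same binning in one call; a minimal sketch using pd.cut, assuming each interval should be labelled by its upper edge (which is what the np.where loop above produces for integer ages):
# Equivalent binning with pd.cut: intervals are (edge_i, edge_{i+1}], labelled by their upper edge
edges = [0,10,20,25,30,35,40,45,50,60,70,80,90,100]
binned = pd.cut(diabetes["Age"], bins=edges, labels=edges[1:], include_lowest=True)
binned.head()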
Encode
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
See Refresher_sklearn for details
# One-hot encode column 9 (the categorical group column) and column 7 (the binned Age); all other columns are dropped by default
colT = ColumnTransformer([("1",OneHotEncoder(),[9]),("2",OneHotEncoder(categories=[age_categories]),[7])])
colT_fit = colT.fit(arr)
np.array(colT_fit.get_feature_names())
tr = colT_fit.transform(arr).toarray() #Returns just the transformed cols
print(tr.shape, arr.shape)
features = np.concatenate((arr[:,[0,1,2,3,4,5,6]],tr),axis=1).astype(np.float32) #all original columns except 7 and 9, plus the encoded columns
labels = arr[:,8].astype(np.float32)
print(features.shape)
print(labels.shape)
pd.DataFrame(features).head()
We need to find the two directions along which the features have maximum variance (PCA done by hand):
Step 1: Compute the mean of every column and subtract it from every row entry (so each column has mean 0)
Step 2: Compute the covariance matrix S = (1/768) * features.T @ features (resulting dim is 24x24)
Step 3: Compute the eigenvalues (s) and eigenvectors (v) of S
Step 4: Pick the eigenvectors corresponding to the top 2 eigenvalues
Step 5: transformed = features.dot(v[:,:2])
fd = pd.DataFrame(features)
fd = fd.sub(fd.mean(axis=0),axis=1)
fd.head()
fd_mat = fd.values.astype(np.float32)
sigma = np.matmul(fd_mat.T,fd_mat)*(1/768.)
sigma.shape
s,v = np.linalg.eig(sigma)
s.shape,v.shape
# np.linalg.eig does not guarantee any ordering, so sort by descending eigenvalue before taking the top 2
order = np.argsort(s)[::-1]
s, v = s[order], v[:,order]
s
v = v[:,:2]
transformed = fd_mat.dot(v)
transformed[:5]
plt.scatter(transformed[:,0],transformed[:,1],c=labels)
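As a cross-check (assuming scikit-learn is available), PCA on the centered feature matrix should reproduce the same 2-D projection, up to a possible sign flip per component:
from sklearn.decomposition import PCA
# Projection onto the top-2 principal components; may differ from `transformed` only by a sign per column
pca_transformed = PCA(n_components=2).fit_transform(fd_mat)
pca_transformed[:5]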
tf.reset_default_graph()
batch_size = 10
inp = tf.placeholder(tf.float32,(batch_size,24))
target = tf.placeholder(tf.float32,(batch_size,))
W1 = tf.Variable(tf.random_normal((24,20),dtype=tf.float32))
tf.add_to_collection('l2',W1)
B1 = tf.Variable(tf.ones((20),dtype=tf.float32))
H1 = tf.add(tf.matmul(inp,W1), B1) #(b,24)x(24,20) + (20,)
Z1 = tf.nn.relu(H1)
W2 = tf.Variable(tf.random_normal((20,10),dtype=tf.float32))
tf.add_to_collection('l2',W2)
B2 = tf.Variable(tf.ones((10),dtype=tf.float32))
H2 = tf.add(tf.matmul(Z1,W2), B2) #(b,20)x(20,10) + (10,)
Z2 = tf.nn.relu(H2)
W3 = tf.Variable(tf.random_normal((10,1),dtype=tf.float32))
tf.add_to_collection('l2',W3)
B3 = tf.Variable(1.0)
H3 = tf.add(tf.matmul(Z2,W3), B3) #(b,10)x(10,1) + (1,)
output = tf.squeeze(tf.sigmoid(H3), axis=1) #squeeze (b,1) -> (b,) so it lines up with target in the loss
init = tf.global_variables_initializer()
with tf.Session() as sess:
sess.run(init)
rand_int = np.random.randint(0,len(features),size=batch_size)
o = sess.run(output,feed_dict={inp:features[rand_int],target:labels[rand_int]})
print(o)
Binary cross entropy loss function and regularization:
Beware of log 0
Add L2 regularizer
delta = 1e-8 # small constant added inside the logs to avoid log(0)
binary_cross_entropy = -(target*tf.math.log(output+delta) + ((1.-target)*tf.math.log(1.-output+delta)))
#Add L2 regularizer
l2_reg = 0.
for w in tf.get_collection('l2'):
l2_reg += tf.reduce_sum(tf.square(w))
l2_reg = 0.01*l2_reg
loss = tf.reduce_mean(binary_cross_entropy) + l2_reg
init = tf.global_variables_initializer()
with tf.Session() as sess:
sess.run(init)
rand_int = np.random.randint(0,len(features),size=batch_size)
o = sess.run(loss,feed_dict={inp:features[rand_int],target:labels[rand_int]})
print(o)
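As an alternative to the delta trick, TensorFlow provides a numerically stable fused op that works on the pre-sigmoid logits H3; a minimal sketch (not used in the training below):
# Numerically stable alternative: operates on the logits, so there is no log(0) issue and no delta needed
stable_bce = tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.reshape(target,(-1,1)), logits=H3)
stable_loss = tf.reduce_mean(stable_bce) + l2_reg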
Optimizer
optimizer = tf.train.GradientDescentOptimizer(0.01)
grads = optimizer.compute_gradients(loss)
grads
train = optimizer.apply_gradients(grads)
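The two calls above are what optimizer.minimize(loss) does internally, so an equivalent shorthand is:
# Equivalent shorthand: minimize() = compute_gradients() followed by apply_gradients()
train = optimizer.minimize(loss)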
init = tf.global_variables_initializer()
with tf.Session() as sess:
sess.run(init)
for i in range(10000):
rand_ind = np.random.randint(0,len(features),size=batch_size)
feed = {inp:features[rand_ind],target:labels[rand_ind]}
_,l = sess.run([train,loss],feed_dict=feed)
if i%1000 == 0:
print(l)
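A rough sanity check of the fit, assuming these lines are placed inside the same tf.Session block right after the training loop (otherwise the freshly initialised weights are used): threshold the sigmoid output at 0.5 and compare with the labels of a random batch.
# Assumes this runs inside the training session above, after the loop
rand_ind = np.random.randint(0,len(features),size=batch_size)
preds = sess.run(output,feed_dict={inp:features[rand_ind]})
print(np.mean((preds > 0.5).astype(np.float32) == labels[rand_ind]))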