Re-posted from: https://tensorflowjulia.blogspot.com/2018/09/classifying-handwritten-digits-with.html
In this exercise, we look at the famous MNIST handwritten digit classification problem. Using the MNIST.jl package makes it easy to access the image samples from Julia. Similar to the logistic regression exercise, we use PyCall and scikit-learn‘s metrics for easy calculation of neural network accuracy and confusion matrices.
We will also visualize the first layer of the neural network to get an idea of how it “sees” handwritten digits. The part of the code that creates a 10×10 grid of plots is rather handwaving – if someone has an idea about how to properly set up programmatic generation and display of plots in Julia, I would be very interested.
The Jupyter notebook can be downloaded here.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
using Plots
using Distributions
gr()
using DataFrames
using TensorFlow
import CSV
import StatsBase
using PyCall
@pyimport sklearn.metrics as sklm
using Images
using Colors
sess=Session(Graph())
using MNIST
mutable struct DataLoader
cur_id::Int
order::Vector{Int}
end
DataLoader() = DataLoader(1, shuffle(1:60000))
loader=DataLoader()
function next_batch(loader::DataLoader, batch_size)
features = zeros(Float32, batch_size, 784)
labels = zeros(Float32, batch_size, 10)
for i in 1:batch_size
features[i, :] = trainfeatures(loader.order[loader.cur_id])./255.0
label = trainlabel(loader.order[loader.cur_id])
labels[i, Int(label)+1] = 1.0
loader.cur_id += 1
if loader.cur_id > 60000
loader.cur_id = 1
end
end
features, labels
end
function load_test_set(N=10000)
features = zeros(Float32, N, 784)
labels = zeros(Float32, N, 10)
for i in 1:N
features[i, :] = testfeatures(i)./255.0
label = testlabel(i)
labels[i, Int(label)+1] = 1.0
end
features,labels
end
trainfeatures(72)
trainlabel(72)
rand_number=rand(1:60000)
rand_example_features = trainfeatures(rand_number)
img=colorview(Gray,1.-reshape(rand_example_features, (28, 28)))
rand_example_label=trainlabel(rand_number)
println("Label: ",rand_example_label)
img
p1=heatmap(flipdim(1.-reshape(rand_example_features, (28, 28)),1), legend=:none, c=:gray, title="Label: $rand_example_label")
function preprocess_features(data_range)
examples = zeros(Float32, length(data_range), 784)
for i in 1:length(data_range)
examples[i, :] = testfeatures(i)./255.0
end
return examples
end
function preprocess_targets(data_range)
targets = zeros(Float32, length(data_range), 10)
for i in 1:length(data_range)
label = testlabel(i)
targets[i, Int(label)+1] = 1.0
end
return targets
end
training_examples = preprocess_features(1:7500)
training_targets = preprocess_targets(1:7500)
validation_examples=preprocess_features(7501:10000)
validation_targets=preprocess_targets(7501:10000);
function to1col(targets)
reduced_targets=zeros(size(targets,1),1)
for i=1:size(targets,1)
reduced_targets[i]=sum( collect(0:size(targets,2)-1).*targets[i,:])
end
return reduced_targets
end
function train_linear_classification_model(
learning_rate,
steps,
batch_size,
training_examples,
training_targets,
validation_examples,
validation_targets)
"""Trains a linear classification model for the MNIST digits dataset.
In addition to training, this function also prints training progress information,
a plot of the training and validation loss over time, and a confusion
matrix.
Args:
learning_rate: An `int`, the learning rate to use.
steps: A non-zero `int`, the total number of training steps. A training step
consists of a forward and backward pass using a single batch.
batch_size: A non-zero `int`, the batch size.
training_examples: An `Array` containing the training features.
training_targets: An `Array` containing the training labels.
validation_examples: An `Array` containing the validation features.
validation_targets: An `Array` containing the validation labels.
Returns:
p1: Plot of loss metrics
p2: Plot of confusion matrix
"""
periods = 10
steps_per_period = steps / periods
# Create feature columns
feature_columns = placeholder(Float32)
target_columns = placeholder(Float32)
# Create network
W = Variable(zeros(Float32, 784, 10))
b = Variable(zeros(Float32, 10))
y = nn.softmax(feature_columns*W + b)
cross_entropy = reduce_mean(-reduce_sum(target_columns .* log(y), axis=[2]))
# Gradient decent with gradient clipping
my_optimizer=(train.AdamOptimizer(learning_rate))
gvs = train.compute_gradients(my_optimizer, cross_entropy)
capped_gvs = [(clip_by_norm(grad, 5.), var) for (grad, var) in gvs]
my_optimizer = train.apply_gradients(my_optimizer,capped_gvs)
run(sess, global_variables_initializer())
# Train the model, but do so inside a loop so that we can periodically assess
# loss metrics.
println("Training model...")
println("LogLoss error (on validation data):")
training_errors = []
validation_errors = []
for period in 1:periods
for i=1:steps_per_period
# Train the model, starting from the prior state.
features_batches, targets_batches = next_batch(loader, batch_size)
run(sess, my_optimizer, Dict(feature_columns=>features_batches, target_columns=>targets_batches))
end
# Take a break and compute probabilities.
training_predictions = run(sess, y, Dict(feature_columns=> training_examples, target_columns=>training_targets))
validation_predictions = run(sess, y, Dict(feature_columns=> validation_examples, target_columns=>validation_targets))
# Compute training and validation errors.
training_log_loss = sklm.log_loss(training_targets, training_predictions)
validation_log_loss = sklm.log_loss(validation_targets, validation_predictions)
# Occasionally print the current loss.
println(" period ", period, ": ",validation_log_loss)
# Add the loss metrics from this period to our list.
push!(training_errors, training_log_loss)
push!(validation_errors, validation_log_loss)
end
println("Model training finished.")
# Calculate final predictions (not probabilities, as above).
final_probabilities = run(sess, y, Dict(feature_columns=> validation_examples, target_columns=>validation_targets))
final_predictions=0.0.*copy(final_probabilities)
for i=1:size(final_predictions,1)
final_predictions[i,indmax(final_probabilities[i,:])]=1.0
end
accuracy = sklm.accuracy_score(validation_targets, final_predictions)
println("Final accuracy (on validation data): ", accuracy)
# Output a graph of loss metrics over periods.
p1=plot(training_errors, label="training", title="LogLoss vs. Periods", ylabel="LogLoss", xlabel="Periods")
p1=plot!(validation_errors, label="validation")
# Output a plot of the confusion matrix.
cm = sklm.confusion_matrix(to1col(validation_targets), to1col(final_predictions))
# Normalize the confusion matrix by row (i.e by the number of samples
# in each class).
cm_normalized=convert.(Float32,copy(cm))
for i=1:size(cm,1)
cm_normalized[i,:]=cm[i,:]./sum(cm[i,:])
end
p2 = heatmap(cm_normalized, c=:dense, title="Confusion Matrix", ylabel="True label", xlabel= "Predicted label", xticks=(1:10, 0:9), yticks=(1:10, 0:9))
return p1, p2
end
p1, p2 = train_linear_classification_model(
0.02,#learning rate
100, #steps
10, #batch_size
training_examples,
training_targets,
validation_examples,
validation_targets)
plot(p1)
plot(p2)
sess=Session(Graph())
p1, p2 = train_linear_classification_model(
0.003,#learning rate
1000, #steps
30, #batch_size
training_examples,
training_targets,
validation_examples,
validation_targets)
plot(p1)
plot(p2)
function train_nn_classification_model(learning_rate,
steps,
batch_size,
hidden_units,
keep_probability,
training_examples,
training_targets,
validation_examples,
validation_targets)
"""Trains a NN classification model for the MNIST digits dataset.
In addition to training, this function also prints training progress information,
a plot of the training and validation loss over time, and a confusion
matrix.
Args:
learning_rate: An `int`, the learning rate to use.
steps: A non-zero `int`, the total number of training steps. A training step
consists of a forward and backward pass using a single batch.
batch_size: A non-zero `int`, the batch size.
hidden_units: A vector describing the layout of the neural network.
keep_probability: A `float`, the probability of keeping a node active during one training step.
training_examples: An `Array` containing the training features.
training_targets: An `Array` containing the training labels.
validation_examples: An `Array` containing the validation features.
validation_targets: An `Array` containing the validation labels.
Returns:
p1: Plot of loss metrics
p2: Plot of confusion matrix
y: Prediction layer of the NN.
feature_columns: Feature column tensor of the NN.
target_columns: Target column tensor of the NN.
weight_export: Weights of the first layer of the NN.
"""
periods = 10
steps_per_period = steps / periods
# Create feature columns.
feature_columns = placeholder(Float32, shape=[-1, size(training_examples,2)])
target_columns = placeholder(Float32, shape=[-1, size(training_targets,2)])
# Network parameters
push!(hidden_units,size(training_targets,2)) #create an output node that fits to the size of the targets
activation_functions = Vector{Function}(size(hidden_units,1))
activation_functions[1:end-1]=z->nn.dropout(nn.relu(z), keep_probability)
activation_functions[end] = nn.softmax #Last function should be idenity as we need the logits
# create network
flag=0
weight_export=Variable([1])
Zs = [feature_columns]
for (ii,(hlsize, actfun)) in enumerate(zip(hidden_units, activation_functions))
Wii = get_variable("W_$ii"*randstring(4), [get_shape(Zs[end], 2), hlsize], Float32)
bii = get_variable("b_$ii"*randstring(4), [hlsize], Float32)
Zii = actfun(Zs[end]*Wii + bii)
push!(Zs, Zii)
if(flag==0)
weight_export=Wii
flag=1
end
end
y=Zs[end]
cross_entropy = reduce_mean(-reduce_sum(target_columns .* log(y), axis=[2]))
# Standard Adam Optimizer
my_optimizer=train.minimize(train.AdamOptimizer(learning_rate), cross_entropy)
run(sess, global_variables_initializer())
# Train the model, but do so inside a loop so that we can periodically assess
# loss metrics.
println("Training model...")
println("LogLoss error (on validation data):")
training_errors = []
validation_errors = []
for period in 1:periods
for i=1:steps_per_period
# Train the model, starting from the prior state.
features_batches, targets_batches = next_batch(loader, batch_size)
run(sess, my_optimizer, Dict(feature_columns=>features_batches, target_columns=>targets_batches))
end
# Take a break and compute probabilities.
training_predictions = run(sess, y, Dict(feature_columns=> training_examples, target_columns=>training_targets))
validation_predictions = run(sess, y, Dict(feature_columns=> validation_examples, target_columns=>validation_targets))
# Compute training and validation errors.
training_log_loss = sklm.log_loss(training_targets, training_predictions)
validation_log_loss = sklm.log_loss(validation_targets, validation_predictions)
# Occasionally print the current loss.
println(" period ", period, ": ",validation_log_loss)
# Add the loss metrics from this period to our list.
push!(training_errors, training_log_loss)
push!(validation_errors, validation_log_loss)
end
println("Model training finished.")
# Calculate final predictions (not probabilities, as above).
final_probabilities = run(sess, y, Dict(feature_columns=> validation_examples, target_columns=>validation_targets))
final_predictions=0.0.*copy(final_probabilities)
for i=1:size(final_predictions,1)
final_predictions[i,indmax(final_probabilities[i,:])]=1.0
end
accuracy = sklm.accuracy_score(validation_targets, final_predictions)
println("Final accuracy (on validation data): ", accuracy)
# Output a graph of loss metrics over periods.
p1=plot(training_errors, label="training", title="LogLoss vs. Periods", ylabel="LogLoss", xlabel="Periods")
p1=plot!(validation_errors, label="validation")
# Output a plot of the confusion matrix.
cm = sklm.confusion_matrix(to1col(validation_targets), to1col(final_predictions))
# Normalize the confusion matrix by row (i.e by the number of samples
# in each class).
cm_normalized=convert.(Float32,copy(cm))
for i=1:size(cm,1)
cm_normalized[i,:]=cm[i,:]./sum(cm[i,:])
end
p2 = heatmap(cm_normalized, c=:dense, title="Confusion Matrix", ylabel="True label", xlabel= "Predicted label", xticks=(1:10, 0:9), yticks=(1:10, 0:9))
return p1, p2, y, feature_columns, target_columns, weight_export
end
sess=Session(Graph())
p1, p2, y, feature_columns, target_columns, weight_export = train_nn_classification_model(
# TWEAK THESE VALUES TO SEE HOW MUCH YOU CAN IMPROVE THE RMSE
0.003, #learning rate
1000, #steps
30, #batch_size
[100, 100], #hidden_units
1.0, # keep probability
training_examples,
training_targets,
validation_examples,
validation_targets)
plot(p1)
plot(p2)
test_examples = preprocess_features(10001:13000)
test_targets = preprocess_targets(10001:13000);
test_probabilities = run(sess, y, Dict(feature_columns=> test_examples, target_columns=>test_targets))
test_predictions=0.0.*copy(test_probabilities)
for i=1:size(test_predictions,1)
test_predictions[i,indmax(test_probabilities[i,:])]=1.0
end
accuracy = sklm.accuracy_score(test_targets, test_predictions)
println("Accuracy on test data: ", accuracy)
function string_as_varname_function(s::AbstractString, v::Any)
s = Symbol(s)
@eval (($s) = ($v))
end
weights0 = run(sess, weight_export)
num_nodes=size(weights0,2)
num_row=convert(Int,ceil(num_nodes/10))
for i=1:num_nodes
str_name=string("Heat",i)
string_as_varname_function(str_name, heatmap(reshape(weights0[:,i], (28,28)), c=:heat, legend=false, yticks=[], xticks=[] ) )
end
out_string="plot(Heat1"
for i=2:num_nodes-1
out_string=string(out_string, ", Heat", i)
end
out_string=string(out_string, ", Heat", num_nodes, ", layout=(num_row, 10), legend=false )")
eval(parse(out_string))
plot(Heat98)