Re-posted from: https://tensorflowjulia.blogspot.com/2018/08/validation-of-linear-regressor-model.html
The third part of the Machine Learning Crash Course deals with validating the model.
The Jupyter notebook can be downloaded here. For the version displayed below, I had to remove some scatter plots that are contained in the original file.
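The essential step in this part is the data split: the first 12000 rows of the training file become the training set, the last 5000 rows become the validation set, and a separate file later provides the test set. As a preview, here is a minimal sketch of that split using head and tail from DataFrames, mirroring the cells below:

# Hold out the last 5000 rows of the training file for validation.
training_rows   = head(california_housing_dataframe, 12000)
validation_rows = tail(california_housing_dataframe, 5000)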
In [0]:
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
In [14]:
# Load packages
using Plots
gr()
using DataFrames
using TensorFlow
import CSV
# Start a TensorFlow session and load the data
sess=Session()
california_housing_dataframe = CSV.read("california_housing_train.csv", delim=",");
#california_housing_dataframe = california_housing_dataframe[shuffle(1:size(california_housing_dataframe, 1)),:];
In [2]:
function preprocess_features(california_housing_dataframe)
"""Prepares input features from California housing data set.
Args:
california_housing_dataframe: A DataFrame expected to contain data
from the California housing data set.
Returns:
A DataFrame that contains the features to be used for the model, including
synthetic features.
"""
selected_features = california_housing_dataframe[
[:latitude,
:longitude,
:housing_median_age,
:total_rooms,
:total_bedrooms,
:population,
:households,
:median_income]]
processed_features = selected_features
# Create a synthetic feature.
processed_features[:rooms_per_person] = (
california_housing_dataframe[:total_rooms] ./
california_housing_dataframe[:population])
return processed_features
end
function preprocess_targets(california_housing_dataframe)
"""Prepares target features (i.e., labels) from California housing data set.
Args:
california_housing_dataframe: A DataFrame expected to contain data
from the California housing data set.
Returns:
A DataFrame that contains the target feature.
"""
output_targets = DataFrame()
# Scale the target to be in units of thousands of dollars.
output_targets[:median_house_value] = (
california_housing_dataframe[:median_house_value] ./ 1000.0)
return output_targets
end
Out[2]:
In [15]:
training_examples = preprocess_features(head(california_housing_dataframe,12000))
describe(training_examples)
Out[15]:
In [16]:
training_targets = preprocess_targets(head(california_housing_dataframe,12000))
describe(training_targets)
Out[16]:
In [17]:
validation_examples = preprocess_features(tail(california_housing_dataframe,5000))
describe(validation_examples)
Out[17]:
In [18]:
validation_targets = preprocess_targets(tail(california_housing_dataframe,5000))
describe(validation_targets)
Out[18]:
In [30]:
ax1=scatter(validation_examples[:longitude],
validation_examples[:latitude],
color=:coolwarm,
zcolor=validation_targets[:median_house_value] ./ maximum(validation_targets[:median_house_value]),
ms=5,
markerstrokecolor=false,
title="Validation Data",
ylim=[32,43],
xlim=[-126,-112])
ax2=scatter(training_examples[:longitude],
training_examples[:latitude],
color=:coolwarm,
zcolor=training_targets[:median_house_value] ./ maximum(training_targets[:median_house_value]),
markerstrokecolor=false,
ms=5,
title="Training Data",
ylim=[32,43],
xlim=[-126,-112]);
#plot(ax1, ax2, legend=false, colorbar=false, layout=(1,2))
In [19]:
function create_batches(features, targets, steps, batch_size=5, num_epochs=0)
    # Determine how many passes over the data are needed to serve all steps.
    if num_epochs == 0
        num_epochs = ceil(Int, batch_size * steps / size(features, 1))
    end
    features_batches = copy(features)
    target_batches = copy(targets)
    # Stack one independently shuffled copy of the data per epoch.
    for i = 1:num_epochs
        select = shuffle(1:size(features, 1))
        if i == 1
            features_batches = features[select, :]
            target_batches = targets[select, :]
        else
            append!(features_batches, features[select, :])
            append!(target_batches, targets[select, :])
        end
    end
    return features_batches, target_batches
end
Out[19]:
In [20]:
function next_batch(features_batches, targets_batches, batch_size, iter)
    # mod1 wraps row indices back to 1 past the last row, so a batch that
    # crosses (or ends exactly on) the end of the data is still complete.
    n = size(features_batches, 1)
    select = [mod1(i, n) for i in ((iter - 1) * batch_size + 1):(iter * batch_size)]
    ds = features_batches[select, :]
    target = targets_batches[select, :]
    return ds, target
end
Out[20]:
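To see how the index arithmetic in next_batch wraps around, consider a toy illustration (the numbers are made up): with 10 rows and a batch size of 4, the third batch should cover rows 9 through 12, and mod1 folds 11 and 12 back to 1 and 2.

# Toy example: 10 rows, batch_size = 4, iter = 3.
[mod1(i, 10) for i in 9:12]   # returns [9, 10, 1, 2]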
In [21]:
function my_input_fn(features_batches, targets_batches, iter, batch_size=5, shuffle_flag=1)
    """Returns the next batch of (features, labels) for training.
    Args:
      features_batches: DataFrame of pre-shuffled feature batches
      targets_batches: DataFrame of pre-shuffled target batches
      iter: the current training step, used to locate the next batch
      batch_size: Size of batches to be passed to the model
      shuffle_flag: 1 or 0. Whether to shuffle the rows within the batch.
    Returns:
      Tuple of (features, labels) for the next data batch
    """
    ds, target = next_batch(features_batches, targets_batches, batch_size, iter)
    # Shuffle the rows within the batch, if specified.
    if shuffle_flag == 1
        select = shuffle(1:size(ds, 1))
        ds = ds[select, :]
        target = target[select, :]
    end
    # Return the next batch of data.
    return ds, target
end
Out[21]:
In [23]:
function construct_columns(input_features)
    """Converts a DataFrame of numerical features into a plain matrix.
    Args:
      input_features: A DataFrame of numerical input features to use.
    Returns:
      A Float64 matrix of the feature values
    """
    out = convert(Array, input_features[:, :])
    return convert.(Float64, out)
end
Out[23]:
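A quick check of construct_columns on a hand-made DataFrame (the column names are hypothetical): it turns the tabular data into the plain Float64 matrix that the placeholders in the training loop expect.

# Hypothetical two-feature example; the Int column is promoted to Float64.
toy = DataFrame(a = [1, 2], b = [3.0, 4.0])
construct_columns(toy)   # 2×2 Array{Float64,2}: [1.0 3.0; 2.0 4.0]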
In [24]:
function train_model(learning_rate,
steps,
batch_size,
training_examples,
training_targets,
validation_examples,
validation_targets)
"""Trains a linear regression model of one feature.
Args:
learning_rate: A `float`, the learning rate.
steps: A non-zero `int`, the total number of training steps. A training step
consists of a forward and backward pass using a single batch.
batch_size: A non-zero `int`, the batch size.
training_examples: A dataframe of training examples.
training_targets: A column of training targets.
validation_examples: A dataframe of validation examples.
validation_targets: A column of validation targets.
"""
periods = 10
steps_per_period = steps / periods
# Create feature columns.
feature_columns = placeholder(Float32)
target_columns = placeholder(Float32)
# Create a linear regressor object.
m = Variable(zeros(size(training_examples, 2), 1))  # one weight per input feature
b=Variable(0.0)
y=(feature_columns*m) .+ b
loss=reduce_sum((target_columns - y).^2)
run(sess, global_variables_initializer())
features_batches, targets_batches = create_batches(training_examples, training_targets, steps, batch_size)
# Advanced gradient descent with gradient clipping
my_optimizer = train.GradientDescentOptimizer(learning_rate)
gvs = train.compute_gradients(my_optimizer, loss)
# Clip each gradient to norm 5 to guard against exploding gradients.
capped_gvs = [(clip_by_norm(grad, 5.), var) for (grad, var) in gvs]
my_optimizer = train.apply_gradients(my_optimizer, capped_gvs)
# Train the model, but do so inside a loop so that we can periodically assess
# loss metrics.
println("Training model...")
println("RMSE (on training data):")
training_rmse = []
validation_rmse=[]
for period in 1:periods
# Train the model, starting from the prior state.
for i=1:steps_per_period
features, labels = my_input_fn(features_batches, targets_batches, convert(Int,(period-1)*steps_per_period+i), batch_size)
run(sess, my_optimizer, Dict(feature_columns=>construct_columns(features), target_columns=>construct_columns(labels)))
end
# Take a break and compute predictions.
training_predictions = run(sess, y, Dict(feature_columns=> construct_columns(training_examples)));
validation_predictions = run(sess, y, Dict(feature_columns=> construct_columns(validation_examples)));
# Compute loss.
training_mean_squared_error = mean((training_predictions- construct_columns(training_targets)).^2)
training_root_mean_squared_error = sqrt(training_mean_squared_error)
validation_mean_squared_error = mean((validation_predictions- construct_columns(validation_targets)).^2)
validation_root_mean_squared_error = sqrt(validation_mean_squared_error)
# Occasionally print the current loss.
println(" period ", period, ": ", training_root_mean_squared_error)
# Add the loss metrics from this period to our list.
push!(training_rmse, training_root_mean_squared_error)
push!(validation_rmse, validation_root_mean_squared_error)
end
weight = run(sess,m)
bias = run(sess,b)
println("Model training finished.")
# Output a graph of loss metrics over periods.
p1=plot(training_rmse, label="training", title="Root Mean Squared Error vs. Periods", ylabel="RMSE", xlabel="Periods")
p1=plot!(validation_rmse, label="validation")
println("Final RMSE (on training data): ", training_rmse[end])
println("Final Weight (on training data): ", weight)
println("Final Bias (on training data): ", bias)
return weight, bias, p1
end
Out[24]:
In [25]:
weight, bias, p1 = train_model(
# TWEAK THESE VALUES TO SEE HOW MUCH YOU CAN IMPROVE THE RMSE
0.00003, #learning rate
500, #steps
5, #batch_size
training_examples,
training_targets,
validation_examples,
validation_targets)
Out[25]:
In [26]:
plot(p1)
Out[26]:
In [27]:
california_housing_test_data = CSV.read("california_housing_test.csv", delim=",");
test_examples = preprocess_features(california_housing_test_data)
test_targets = preprocess_targets(california_housing_test_data)
test_predictions = construct_columns(test_examples)*weight .+ bias
test_mean_squared_error = mean((test_predictions- construct_columns(test_targets)).^2)
test_root_mean_squared_error = sqrt(test_mean_squared_error)
print("Final RMSE (on test data): ", test_root_mean_squared_error)
In [28]:
# end of file