Re-posted from: https://tensorflowjulia.blogspot.com/2018/08/synthetic-features-and-outliers.html
In this second part, we create a synthetic feature and remove some outliers from the data set.
The Jupyter notebook can be downloaded here. For the version displayed below, I needed to remove some scatter plots.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
using Plots
gr()
using DataFrames
using TensorFlow
import CSV
# Note: on Julia >= 0.7 you would also need `using Random` for `shuffle`.
sess = Session()
california_housing_dataframe = CSV.read("california_housing_train.csv", delim=",");
# Scale the target to units of thousands of dollars.
california_housing_dataframe[:median_house_value] /= 1000.0
california_housing_dataframe
function create_batches(features, targets, steps, batch_size=5, num_epochs=0)
    """Creates shuffled pools of features and targets that cover `steps` batches.
    Args:
      features: column of feature values
      targets: column of target values
      steps: total number of training steps to supply
      batch_size: size of each batch
      num_epochs: number of passes over the data; 0 = derive it from `steps`
    Returns:
      Tuple of (features_batches, target_batches) pools
    """
    if num_epochs == 0
        num_epochs = ceil(Int, batch_size * steps / length(features))
    end
    features_batches = Union{Float64, Missings.Missing}[]
    target_batches = Union{Float64, Missings.Missing}[]
    # Concatenate one independently shuffled copy of the data per epoch.
    for i = 1:num_epochs
        select = shuffle(1:length(features))
        append!(features_batches, features[select])
        append!(target_batches, targets[select])
    end
    return features_batches, target_batches
end
function next_batch(features_batches, targets_batches, batch_size, iter)
    # Select the `iter`-th consecutive batch from the pool, wrapping around.
    # `mod1` keeps the indices in 1:length(...); this assumes `batch_size`
    # divides the pool length, which holds for the calls below.
    select = mod1((iter-1)*batch_size+1, length(features_batches)):mod1(iter*batch_size, length(features_batches))
    ds = features_batches[select]
    target = targets_batches[select]
    return ds, target
end
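For instance (a toy sketch with made-up numbers, not part of the original notebook), a pool of four samples with a batch size of two cycles like this:

fb = [10.0, 20.0, 30.0, 40.0]
tb = [1.0, 2.0, 3.0, 4.0]
next_batch(fb, tb, 2, 1)  # ([10.0, 20.0], [1.0, 2.0])
next_batch(fb, tb, 2, 2)  # ([30.0, 40.0], [3.0, 4.0])
next_batch(fb, tb, 2, 3)  # wraps around to ([10.0, 20.0], [1.0, 2.0])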
function my_input_fn(features_batches, targets_batches, iter, batch_size=5, shuffle_flag=1)
    """Prepares a batch of features and targets for one training step.
    Args:
      features_batches: pool of features from `create_batches`
      targets_batches: pool of targets from `create_batches`
      iter: index of the current training step
      batch_size: size of the batch to be passed to the model
      shuffle_flag: 1 or 0. Whether to shuffle the batch.
    Returns:
      Tuple of (features, labels) for next data batch
    """
    # Construct a dataset, and configure batching/repeating.
    ds, target = next_batch(features_batches, targets_batches, batch_size, iter)
    # Shuffle the data, if specified.
    if shuffle_flag == 1
        select = shuffle(1:size(ds, 1))
        ds = ds[select, :]
        target = target[select, :]
    end
    # Return the next batch of data.
    return convert.(Float64, ds), convert.(Float64, target)
end
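To make the plumbing concrete (a small sketch, not part of the original notebook; it reuses the dataframe loaded above), the three helpers combine like this:

# Build the shuffled pool once, then draw one batch per training step.
fb, tb = create_batches(california_housing_dataframe[:total_rooms],
                        california_housing_dataframe[:median_house_value], 100, 5)
features, labels = my_input_fn(fb, tb, 1, 5)  # batch fed at step 1
features, labels = my_input_fn(fb, tb, 2, 5)  # batch fed at step 2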
function train_model(learning_rate, steps, batch_size, input_feature=:total_rooms)
    """Trains a linear regression model of one feature.
    Args:
      learning_rate: A `float`, the learning rate.
      steps: A non-zero `int`, the total number of training steps. A training step
        consists of a forward and backward pass using a single batch.
      batch_size: A non-zero `int`, the batch size.
      input_feature: A `Symbol` specifying a column from `california_housing_dataframe`
        to use as input feature.
    """
    periods = 10
    steps_per_period = steps / periods

    my_feature = input_feature
    my_feature_data = convert.(Float32, california_housing_dataframe[my_feature])
    my_label = :median_house_value
    targets = convert.(Float32, california_housing_dataframe[my_label])

    # Create feature columns.
    feature_columns = placeholder(Float32)
    target_columns = placeholder(Float32)

    # Create a linear regressor object.
    m = Variable(0.0)
    b = Variable(0.0)
    y = m .* feature_columns .+ b
    loss = reduce_sum((target_columns - y).^2)
    run(sess, global_variables_initializer())
    features_batches, targets_batches = create_batches(my_feature_data, targets, steps, batch_size)

    # Use gradient descent as the optimizer for training the model.
    #my_optimizer=train.minimize(train.GradientDescentOptimizer(learning_rate), loss)
    my_optimizer = train.GradientDescentOptimizer(learning_rate)
    gvs = train.compute_gradients(my_optimizer, loss)
    capped_gvs = [(clip_by_norm(grad, 5.), var) for (grad, var) in gvs]
    my_optimizer = train.apply_gradients(my_optimizer, capped_gvs)
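    # Note: `clip_by_norm` rescales each gradient to an L2 norm of at most 5
    # before it is applied, so a single extreme batch cannot derail the weights.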
    # Set up to plot the state of our model's line each period.
    sample = california_housing_dataframe[rand(1:size(california_housing_dataframe, 1), 300), :]
    p1 = scatter(sample[my_feature], sample[my_label], title="Learned Line by Period",
                 ylabel=my_label, xlabel=my_feature, color=:coolwarm)
    colors = [ColorGradient(:coolwarm)[i] for i in linspace(0, 1, periods+1)]

    # Train the model, but do so inside a loop so that we can periodically assess
    # loss metrics.
    println("Training model...")
    println("RMSE (on training data):")
    root_mean_squared_errors = []
    for period in 1:periods
        # Train the model, starting from the prior state.
        for i = 1:steps_per_period
            features, labels = my_input_fn(features_batches, targets_batches,
                                           convert(Int, (period-1)*steps_per_period+i), batch_size)
            run(sess, my_optimizer, Dict(feature_columns=>features, target_columns=>labels))
        end
        # Take a break and compute predictions.
        predictions = run(sess, y, Dict(feature_columns=>my_feature_data))
        # Compute loss.
        mean_squared_error = mean((predictions .- targets).^2)
        root_mean_squared_error = sqrt(mean_squared_error)
        # Occasionally print the current loss.
        println("  period ", period, ": ", root_mean_squared_error)
        # Add the loss metrics from this period to our list.
        push!(root_mean_squared_errors, root_mean_squared_error)
        # Finally, track the weights and biases over time.
        # Apply some math to ensure that the data and line are plotted neatly.
        y_extents = [0 maximum(sample[my_label])]
        weight = run(sess, m)
        bias = run(sess, b)
        x_extents = (y_extents - bias) / weight
        x_extents = max.(min.(x_extents, maximum(sample[my_feature])),
                         minimum(sample[my_feature]))
        y_extents = weight .* x_extents .+ bias
        p1 = plot!(x_extents', y_extents', color=colors[period], linewidth=2)
    end
    predictions = run(sess, y, Dict(feature_columns=>my_feature_data))
    weight = run(sess, m)
    bias = run(sess, b)
    println("Model training finished.")

    # Output a graph of loss metrics over periods.
    p2 = plot(root_mean_squared_errors, title="Root Mean Squared Error vs. Periods",
              ylabel="RMSE", xlabel="Periods")

    # Output a table with calibration data.
    calibration_data = DataFrame()
    calibration_data[:predictions] = predictions
    calibration_data[:targets] = targets
    describe(calibration_data)

    println("Final RMSE (on training data): ", root_mean_squared_errors[end])
    println("Final Weight (on training data): ", weight)
    println("Final Bias (on training data): ", bias)

    return p1, p2, calibration_data
end
california_housing_dataframe[:rooms_per_person] = (
    california_housing_dataframe[:total_rooms] ./ california_housing_dataframe[:population]);
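A quick peek at the new column next to its ingredients (a sketch, not part of the original notebook; `head` is the DataFrames API of that era):

head(california_housing_dataframe[[:total_rooms, :population, :rooms_per_person]])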
p1, p2, calibration_data = train_model(
    0.05, # learning rate
    1000, # steps
    5,    # batch size
    :rooms_per_person # feature
)
plot(p1, p2, layout=(1,2), legend=false)
#scatter(calibration_data[:predictions], calibration_data[:targets], legend=false)
histogram(california_housing_dataframe[:rooms_per_person], nbins=20, legend=false)
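The histogram shows a long tail of blocks with implausibly many rooms per person. To count how many rows the cap below will touch (a small sketch, not part of the original notebook):

sum(california_housing_dataframe[:rooms_per_person] .> 5)  # rows above the cap of 5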
california_housing_dataframe[:rooms_per_person] = min.(
    california_housing_dataframe[:rooms_per_person], 5)
histogram(california_housing_dataframe[:rooms_per_person], nbins=20, legend=false)
p1, p2, calibration_data = train_model(
    0.05, # learning rate
    500,  # steps
    10,   # batch size
    :rooms_per_person # feature
)
plot(p1, p2, layout=(1,2), legend=false)
#scatter(calibration_data[:predictions], calibration_data[:targets], legend=false)