Re-posted from: https://tensorflowjulia.blogspot.com/2018/08/feature-sets.html
The fourth part of the Machine Learning Crash Course deals with finding a minimal set of features that still gives a reasonable model.
The code makes use of two useful functions when dealing with DataFrames:
- names() returns the names of the different columns. This allows for the creation of a DataFrame that contains the correlation matrix with the correct column names – see the line
DataFrame([cor(df[:, a], df[:, b]) for a=1:size(df, 2), b=1:size(df, 2)], names(df))
- On the other hand, if you need to create new names for a DataFrame programmatically, you can use Symbol() to convert from a string. We used this when splitting the latitude data up into several buckets:
Symbol(string("latitude_", range[1], "_", range[2]))
The Jupyter notebook can be downloaded here. For the version displayed below, I needed to remove some scatter plots, which are contained in the original file.
In [0]:
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
In [1]:
using Plots
gr()
using DataFrames
using TensorFlow
import CSV
import StatsBase
# Open the TensorFlow session; train_model further down runs every graph operation through it.
sess=Session()
# Load the California housing training data from a CSV file in the working directory.
california_housing_dataframe = CSV.read("california_housing_train.csv", delim=",");
# Put the rows in random order so that the head/tail split used later does not
# depend on the row order of the file.
california_housing_dataframe = california_housing_dataframe[shuffle(1:size(california_housing_dataframe, 1)),:];
In [2]:
"""
    preprocess_features(california_housing_dataframe)

Prepare the input features from the California housing data set.

# Arguments
- `california_housing_dataframe`: a DataFrame expected to contain data from the
  California housing data set.

# Returns
A DataFrame that contains the features to be used for the model, including the
synthetic feature `rooms_per_person` (`total_rooms ./ population`).
"""
function preprocess_features(california_housing_dataframe)
    # Column selection builds a new DataFrame, so adding a column below does not
    # add it to the caller's frame.
    processed_features = california_housing_dataframe[
        [:latitude,
         :longitude,
         :housing_median_age,
         :total_rooms,
         :total_bedrooms,
         :population,
         :households,
         :median_income]]
    # Create a synthetic feature.
    processed_features[:rooms_per_person] = (
        california_housing_dataframe[:total_rooms] ./
        california_housing_dataframe[:population])
    return processed_features
end
"""
    preprocess_targets(california_housing_dataframe)

Prepare the target feature (i.e., the labels) from the California housing data set.

# Arguments
- `california_housing_dataframe`: a DataFrame expected to contain data from the
  California housing data set.

# Returns
A DataFrame with the single column `median_house_value`, scaled to units of
thousands of dollars.
"""
function preprocess_targets(california_housing_dataframe)
    output_targets = DataFrame()
    # Scale the target to be in units of thousands of dollars.
    output_targets[:median_house_value] = (
        california_housing_dataframe[:median_house_value] ./ 1000.0)
    return output_targets
end
Out[2]:
In [24]:
# Choose the first 12000 (out of 17000) examples for training.
# The frame was shuffled when it was loaded, so head/tail give a random split.
training_examples = preprocess_features(head(california_housing_dataframe,12000))
training_targets = preprocess_targets(head(california_housing_dataframe,12000))
# Choose the last 5000 (out of 17000) examples for validation.
validation_examples = preprocess_features(tail(california_housing_dataframe,5000))
validation_targets = preprocess_targets(tail(california_housing_dataframe,5000))
# Double-check that we've done the right thing.
# NOTE(review): whether describe prints by itself or returns a table depends on the
# DataFrames version; if it returns a table, only the last call is displayed by the notebook.
println("Training examples summary:")
describe(training_examples)
println("Validation examples summary:")
describe(validation_examples)
println("Training targets summary:")
describe(training_targets)
println("Validation targets summary:")
describe(validation_targets)
Out[24]:
In [5]:
"""
    cordf(df::DataFrame)

Return the correlation matrix of the columns of `df` as a DataFrame whose
columns carry the same names as the columns of `df`. Row `i` belongs to the
`i`-th column of `df`.
"""
function cordf(df::DataFrame)
    ncols = size(df, 2)
    # Entry (i, j) is the correlation between column i and column j.
    correlations = [cor(df[:, i], df[:, j]) for i = 1:ncols, j = 1:ncols]
    return DataFrame(correlations, names(df))
end
Out[5]:
In [6]:
# Work on a copy so that the added :target column does not end up in training_examples.
correlation_dataframe = copy(training_examples)
correlation_dataframe[:target] = training_targets[:median_house_value]
# Pairwise correlations between all features and the target; as the last
# expression of the cell, the table is what the notebook displays.
out=cordf(correlation_dataframe)
Out[6]:
In [7]:
"""
    construct_columns(input_features)

Convert tabular input features into a dense array of `Float64` values that can
be fed to the model.

# Arguments
- `input_features`: numerical input features to use (for example a DataFrame).

# Returns
An array with the same rows and columns as `input_features` and element type
`Float64`.
"""
function construct_columns(input_features)
    out = convert(Array, input_features[:, :])
    # Element-wise conversion so that integer columns are accepted as well.
    return convert.(Float64, out)
end
Out[7]:
In [8]:
"""
    create_batches(features, targets, steps, batch_size=5, num_epochs=0)

Build the row sequence that training draws its batches from.

The rows of `features` and `targets` are repeated `num_epochs` times. Every
repetition uses its own random permutation, and the same permutation is applied
to `features` and `targets` so that rows stay aligned.

# Arguments
- `features`, `targets`: tables with the same number of rows, indexable as `x[rows, :]`.
- `steps`: number of training steps the batches have to cover.
- `batch_size`: number of rows per training step.
- `num_epochs`: number of repetitions; `0` means "as many as needed to cover
  `steps * batch_size` rows".

# Returns
A tuple `(features_batches, target_batches)`. If no epoch is requested
(for example `steps == 0`), unshuffled copies of the inputs are returned.
"""
function create_batches(features, targets, steps, batch_size=5, num_epochs=0)
    n_rows = size(features, 1)
    if num_epochs == 0
        num_epochs = ceil(Int, batch_size * steps / n_rows)
    end
    if num_epochs < 1
        return copy(features), copy(targets)
    end
    # Collect the row order of all epochs first, then index once.
    order = Int[]
    for epoch in 1:num_epochs
        append!(order, shuffle(1:n_rows))
    end
    return features[order, :], targets[order, :]
end
Out[8]:
In [9]:
"""
    next_batch(features_batches, targets_batches, batch_size, iter)

Return the `iter`-th batch of `batch_size` consecutive rows.

Batch `iter` covers rows `(iter-1)*batch_size + 1` to `iter*batch_size`. Row
numbers beyond the last row wrap around to the first row, so a batch always
contains exactly `batch_size` rows.

# Returns
A tuple `(ds, target)` with the selected rows of `features_batches` and
`targets_batches`.
"""
function next_batch(features_batches, targets_batches, batch_size, iter)
    n_rows = size(features_batches, 1)
    first_row = (iter - 1) * batch_size + 1
    last_row = iter * batch_size
    # mod1 maps into 1:n_rows. Plain mod would turn the last row into index 0
    # and produce an empty batch whenever a batch ends on or wraps past that row.
    select = mod1.(first_row:last_row, n_rows)
    ds = features_batches[select, :]
    target = targets_batches[select, :]
    return ds, target
end
Out[9]:
In [10]:
"""
    my_input_fn(features_batches, targets_batches, iter, batch_size=5, shuffle_flag=1)

Return the data for training step `iter`.

# Arguments
- `features_batches`: table of features to draw the batch from.
- `targets_batches`: table of targets, row-aligned with `features_batches`.
- `iter`: number of the training step, starting at 1.
- `batch_size`: size of the batch to be passed to the model.
- `shuffle_flag`: `1` to put the rows of the batch in random order; any other
  value keeps the order.

# Returns
Tuple of (features, labels) for the next data batch.
"""
function my_input_fn(features_batches, targets_batches, iter, batch_size=5, shuffle_flag=1)
    ds, target = next_batch(features_batches, targets_batches, batch_size, iter)
    # Shuffle the data, if specified. Features and labels use the same
    # permutation so that they stay aligned.
    if shuffle_flag == 1
        select = shuffle(1:size(ds, 1))
        ds = ds[select, :]
        target = target[select, :]
    end
    # Return the next batch of data.
    return ds, target
end
Out[10]:
In [11]:
"""
    train_model(learning_rate, steps, batch_size,
                training_examples, training_targets,
                validation_examples, validation_targets)

Train a linear regression model on all columns of `training_examples`.

Training runs in 10 periods. After each period the RMSE on the training and on
the validation data is computed, and the training RMSE is printed. The model is
built and run in the global TensorFlow session `sess`.

# Arguments
- `learning_rate`: a float, the learning rate.
- `steps`: a non-zero integer, the total number of training steps. A training
  step consists of a forward and backward pass using a single batch. If `steps`
  is not a multiple of 10, the remainder is not run.
- `batch_size`: a non-zero integer, the batch size.
- `training_examples`: DataFrame with the input features used for training.
- `training_targets`: DataFrame with the target column used for training.
- `validation_examples`: DataFrame with the input features used for validation.
- `validation_targets`: DataFrame with the target column used for validation.

# Returns
A tuple `(weight, bias, p1)`: the trained weights, the trained bias and a plot
of the training and validation RMSE per period.
"""
function train_model(learning_rate,
                     steps,
                     batch_size,
                     training_examples,
                     training_targets,
                     validation_examples,
                     validation_targets)
    periods = 10
    # Integer division: a fractional step count would make the step number
    # passed to my_input_fn non-integral and fail the Int conversion below.
    steps_per_period = div(steps, periods)

    # Create feature columns.
    feature_columns = placeholder(Float32)
    target_columns = placeholder(Float32)

    # Create the linear model: one weight per feature column plus a bias.
    m = Variable(zeros(size(training_examples, 2), 1))
    b = Variable(0.0)
    y = (feature_columns * m) .+ b
    loss = reduce_sum((target_columns - y).^2)
    run(sess, global_variables_initializer())

    features_batches, targets_batches = create_batches(
        training_examples, training_targets, steps, batch_size)

    # Gradient descent with gradient clipping: every gradient is limited to a
    # norm of 5 before it is applied.
    my_optimizer = train.GradientDescentOptimizer(learning_rate)
    gvs = train.compute_gradients(my_optimizer, loss)
    capped_gvs = [(clip_by_norm(grad, 5.), var) for (grad, var) in gvs]
    my_optimizer = train.apply_gradients(my_optimizer, capped_gvs)

    # Train the model, but do so inside a loop so that we can periodically assess
    # loss metrics.
    println("Training model...")
    println("RMSE (on training data):")
    training_rmse = Float64[]
    validation_rmse = Float64[]
    for period in 1:periods
        # Train the model, starting from the prior state.
        for i = 1:steps_per_period
            step = convert(Int, (period - 1) * steps_per_period + i)
            features, labels = my_input_fn(features_batches, targets_batches, step, batch_size)
            run(sess, my_optimizer,
                Dict(feature_columns => construct_columns(features),
                     target_columns => construct_columns(labels)))
        end
        # Take a break and compute predictions.
        training_predictions = run(
            sess, y, Dict(feature_columns => construct_columns(training_examples)))
        validation_predictions = run(
            sess, y, Dict(feature_columns => construct_columns(validation_examples)))
        # Compute loss.
        training_mean_squared_error = mean(
            (training_predictions - construct_columns(training_targets)).^2)
        training_root_mean_squared_error = sqrt(training_mean_squared_error)
        validation_mean_squared_error = mean(
            (validation_predictions - construct_columns(validation_targets)).^2)
        validation_root_mean_squared_error = sqrt(validation_mean_squared_error)
        # Occasionally print the current loss.
        println(" period ", period, ": ", training_root_mean_squared_error)
        # Add the loss metrics from this period to our list.
        push!(training_rmse, training_root_mean_squared_error)
        push!(validation_rmse, validation_root_mean_squared_error)
    end
    weight = run(sess, m)
    bias = run(sess, b)
    println("Model training finished.")

    # Output a graph of loss metrics over periods.
    p1 = plot(training_rmse, label="training", title="Root Mean Squared Error vs. Periods", ylabel="RMSE", xlabel="Periods")
    p1 = plot!(validation_rmse, label="validation")
    println("Final RMSE (on training data): ", training_rmse[end])
    println("Final Weight (on training data): ", weight)
    println("Final Bias (on training data): ", bias)
    return weight, bias, p1
end
Out[11]:
In [12]:
#
# Your code here: add your features of choice as a list of quoted strings.
#
# Feature subset for the first experiment, given as column-name symbols.
minimal_features = [:latitude,
:median_income,
:rooms_per_person,
:total_bedrooms
]
# Restrict the training and validation data to the chosen columns.
minimal_training_examples = training_examples[minimal_features]
minimal_validation_examples = validation_examples[minimal_features]
#
# Don't forget to adjust these parameters.
#
weight, bias, p1 = train_model(
# TWEAK THESE VALUES TO SEE HOW MUCH YOU CAN IMPROVE THE RMSE
0.003, #learning rate
500, #steps
5, #batch_size
minimal_training_examples,
training_targets,
minimal_validation_examples,
validation_targets)
Out[12]:
In [13]:
# Show the training/validation RMSE curves returned by the train_model call above.
plot(p1)
Out[13]:
In [14]:
# Second experiment: only median income and latitude as features.
minimal_features = [
:median_income,
:latitude,
]
# Restrict the training and validation data to the chosen columns.
minimal_training_examples = training_examples[minimal_features]
minimal_validation_examples = validation_examples[minimal_features]
weight, bias, p1 = train_model(
# TWEAK THESE VALUES TO SEE HOW MUCH YOU CAN IMPROVE THE RMSE
0.01, #learning rate
500, #steps
5, #batch_size
minimal_training_examples,
training_targets,
minimal_validation_examples,
validation_targets)
Out[14]:
In [15]:
# Show the training/validation RMSE curves returned by the train_model call above.
plot(p1)
Out[15]:
In [25]:
#scatter(training_examples[:latitude], training_targets[:median_house_value])
In [17]:
# Lower (lat1) and upper (lat2) edges of ten one-degree latitude buckets:
# 32-33, 33-34, ..., 41-42.
lat1=32:41
lat2=33:42
lat_range=zip(lat1,lat2) # zip pairs the edges up into an iterator of (lower, upper) tuples
"""
    create_index(value, r1, r2)

Return `1.0` if `value` lies in the half-open interval `[r1, r2)` and `0.0`
otherwise.
"""
function create_index(value, r1, r2)
    in_bucket = r1 <= value && value < r2
    return in_bucket ? 1.0 : 0.0
end
"""
    select_and_transform_features(source_df, lat_range)

Build a feature table from `source_df` that contains `median_income` and one
indicator column per latitude bucket.

`lat_range` yields `(lower, upper)` pairs. For each pair a column named
`latitude_<lower>_<upper>` is added that holds `1.0` where the latitude lies
in `[lower, upper)` and `0.0` elsewhere.
"""
function select_and_transform_features(source_df, lat_range)
    bucketed = DataFrame()
    bucketed[:median_income] = source_df[:median_income]
    for (lower, upper) in lat_range
        # Symbol(string) turns the generated text into a DataFrame column name.
        column = Symbol(string("latitude_", lower, "_", upper))
        bucketed[column] = create_index.(source_df[:latitude], lower, upper)
    end
    return bucketed
end
Out[17]:
In [19]:
# Replace the raw latitude by one indicator column per latitude bucket, for
# both the training and the validation data.
selected_training_examples = select_and_transform_features(training_examples, lat_range)
selected_validation_examples = select_and_transform_features(validation_examples, lat_range);
In [20]:
# Work on a copy so that the added :target column does not end up in
# selected_training_examples.
correlation_dataframe = copy(selected_training_examples)
correlation_dataframe[:target] = training_targets[:median_house_value]
# Pairwise correlations between median income, the latitude buckets and the target.
out=cordf(correlation_dataframe)
Out[20]:
In [21]:
# Third experiment: train on median income plus the latitude bucket columns.
weight, bias, p1 = train_model(
# TWEAK THESE VALUES TO SEE HOW MUCH YOU CAN IMPROVE THE RMSE
0.01, #learning rate
1500, #steps
5, #batch_size
selected_training_examples,
training_targets,
selected_validation_examples,
validation_targets)
Out[21]:
In [22]:
# Show the training/validation RMSE curves returned by the train_model call above.
plot(p1)
Out[22]:
In [23]:
#EOF