Re-posted from: https://tensorflowjulia.blogspot.com/2018/09/conversion-of-movie-review-data-to-one.html
In the last post, we obtained the files test_data.h5 and train_data.h5, containing text data from movie reviews (from the ACL 2011 IMDB dataset). In the next exercise, we need to access a one-hot encoded version of these files, based on a large vocabulary. The following code converts the data and stores it on disk for later use. It takes about two hours to run on my laptop and uses 13GB of storage for the converted file.
The Jupyter notebook can be downloaded here.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
In [3]:
using HDF5
using JLD
In [4]:
# Build an indicator matrix from a vocabulary list.
#
# Arguments:
#   data              - indexable collection of strings, one per example
#   informative_terms - indexable collection of vocabulary terms
#
# Returns a Float64 matrix of size (length(data), length(informative_terms)).
# Entry (row, col) is 1 when the row-th string contains the col-th term as a
# substring, and 0 otherwise.
function create_data_columns(data, informative_terms)
    nrows = length(data)
    ncols = length(informative_terms)
    indicator = zeros(nrows, ncols)
    for (row, text) in enumerate(data)
        for (col, term) in enumerate(informative_terms)
            if contains(text, term)
                indicator[row, col] = 1
            end
        end
    end
    return indicator
end
Out[4]:
In [5]:
# Read the labels and features from the two HDF5 files.
# `global` is required because the do-block body is a closure; without it the
# assignments would create locals that vanish when the block ends.
# The labels are transposed (`'`) directly after being read.
c = h5open("train_data.h5", "r") do h5
    global train_labels = read(h5, "output_labels")'
    global train_features = read(h5, "output_features")
end
c = h5open("test_data.h5", "r") do h5
    global test_labels = read(h5, "output_labels")'
    global test_features = read(h5, "output_features")
end;
In [6]:
# Load the vocabulary: each line of terms.txt becomes one term, with the line
# ending stripped. `readlines` opens and closes the file itself and returns a
# Vector{String}. It replaces the `Array{String}(0)` constructor, which no
# longer exists in Julia 1.0 and later, and the manual push! loop.
vocabulary = readlines("terms.txt");
In [7]:
# Encoding both data sets against the full vocabulary is very slow.
# Run this cell once and store the result on disk.
test_features_full = create_data_columns(test_features, vocabulary)
train_features_full = create_data_columns(train_features, vocabulary);
In [8]:
# Store both encoded matrices in a single JLD file, keyed by variable name.
save(
    "IMDB_fullmatrix_datacolumns.jld",
    "train_features_full", train_features_full,
    "test_features_full", test_features_full
)