I am trying to write a csv file (all columns are floats) to a tfrecords file then read them back out. All the examples I have seen pack the csv columns then feed it to sess.run() directly but I can't figure out how to write the feature columns and label column to a tfrecord instead. How could I do this?
4 Answers
You will need a separate script to convert your csv file to TFRecords.
Imagine you have a CSV with the following header:
feature_1, feature_2, ..., feature_n, label
You need to read your CSV with something like pandas, construct a tf.train.Example manually, and then write it to a file with TFRecordWriter:
# Convert a CSV laid out as "feature_1, ..., feature_n, label" into TFRecords.
# Requires `import pandas` and `import tensorflow as tf`. This uses the TF 1.x
# writer location; in TF 2.x the class lives at `tf.io.TFRecordWriter`.
csv = pandas.read_csv("your.csv").values
with tf.python_io.TFRecordWriter("csv.tfrecords") as writer:
    for row in csv:
        # The last column is the label; everything before it is a feature.
        features, label = row[:-1], row[-1]
        example = tf.train.Example()
        example.features.feature["features"].float_list.value.extend(features)
        # An all-float CSV is loaded as floats, but int64_list only accepts
        # integers, so cast explicitly. This assumes the label column holds
        # integer class ids; store real-valued labels in a float_list instead.
        example.features.feature["label"].int64_list.value.append(int(label))
        writer.write(example.SerializeToString())
-
Seems like this code will only allow you to add float features. You would need a way to adjust the code for Int64 features, or especially categorical ByteList features. – krishnab, commented Oct 28, 2018 at 5:39
-
Why use an int64_list for the label when it is only a single value and not a list? – azerty, commented Oct 29, 2018 at 11:07
-
3
The above solution did not work in my case. Another way to read a CSV file and create a TFRecord is shown below:
The feature set column names are :Sl.No:,Time,Height, Width,Mean,Std, Variance, Non-homogeneity, PixelCount, contourCount, Class.
Sample features that we get from dataset.csv:
Features= [5, 'D', 268, 497, 13.706, 863.4939, 29.385, 0.0427, 39675, 10]
label : medium
import pandas as pd
import tensorflow as tf
def create_tf_example(features, label):
    """Build a ``tf.train.Example`` from one dataset row and its class label.

    Args:
        features: Indexable row of column values. Positions 1 through 9 are
            read; position 0 is not used. ``features[1]`` must be a string
            because it is UTF-8 encoded.
        label: Class name as a string; stored UTF-8 encoded under ``'Class'``.

    Returns:
        A ``tf.train.Example`` with one single-element feature per column.
    """
    def _text(value):
        # Strings are stored as a one-element bytes list.
        return tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[value.encode('utf-8')]))

    def _integer(value):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

    def _real(value):
        return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

    columns = {
        'Time': _text(features[1]),
        'Height': _integer(features[2]),
        'Width': _integer(features[3]),
        'Mean': _real(features[4]),
        'Std': _real(features[5]),
        'Variance': _real(features[6]),
        'Non-homogeneity': _real(features[7]),
        'PixelCount': _integer(features[8]),
        'contourCount': _integer(features[9]),
        'Class': _text(label),
    }
    return tf.train.Example(features=tf.train.Features(feature=columns))
# Serialize every row of dataset.csv into dataset.tfrecords, one
# tf.train.Example per row (built by create_tf_example).
csv = pd.read_csv("dataset.csv").values
with tf.python_io.TFRecordWriter("dataset.tfrecords") as writer:
    for row in csv:
        # The last column is the label; everything before it is a feature.
        features, label = row[:-1], row[-1]
        # print() call rather than the Python 2 print statement, which is a
        # syntax error on Python 3.
        print(features, label)
        example = create_tf_example(features, label)
        writer.write(example.SerializeToString())
# No explicit writer.close(): leaving the with block already closes the writer.
For more details click here. This works for me, hope it works.
-
Thanks for the example. In TF 2.x, TFRecordWriter has been moved to tf.io.TFRecordWriter. – Nitin, commented Nov 27, 2020 at 4:47
def convert_to(
        ratings_path="/Users/shishir/Documents/botconnect_Playground/tfRecords/ml-100k.train.rating"):
    """Convert a tab-separated ratings file into a TFRecords file.

    Each non-blank input line must start with three tab-separated integers.
    They are written as the single-element int64 features ``"user"``,
    ``"item"`` and ``"label"`` of one ``tf.train.Example``. Any further
    columns on the line are ignored.

    The output is written to ``ml-100k.tfrecords`` inside ``wdir``.
    NOTE(review): ``wdir`` is not defined in this block; it is presumably a
    module-level output directory. Confirm against the rest of the file.

    Args:
        ratings_path: Path of the ratings file to read. Defaults to the
            location that was previously hard-coded.

    Raises:
        ValueError: If one of the first three columns is not an integer.
        IndexError: If a non-blank line has fewer than three columns.
    """
    filename = os.path.join(wdir, 'ml-100k.tfrecords')
    print('Writing', filename)
    with tf.python_io.TFRecordWriter(filename) as writer:
        with open(ratings_path, "r") as f:
            # Iterate the file directly instead of a manual readline() loop.
            for line in f:
                # Skip blank lines (e.g. a trailing newline at end of file),
                # which would otherwise make int() raise.
                if not line.strip():
                    continue
                arr = line.split("\t")
                user, item, label = int(arr[0]), int(arr[1]), int(arr[2])
                example = tf.train.Example()
                # Plain Python ints go straight into int64_list; wrapping
                # them in one-element numpy arrays first is unnecessary.
                example.features.feature["user"].int64_list.value.append(user)
                example.features.feature["item"].int64_list.value.append(item)
                example.features.feature["label"].int64_list.value.append(label)
                writer.write(example.SerializeToString())
So that is my Solution and it works! Hope this helps
Cheers.
-
Thank you for this code snippet, which might provide some limited short-term help. A proper explanation would greatly improve its long-term value by showing why this is a good solution to the problem, and would make it more useful to future readers with other, similar questions. Please edit your answer to add some explanation, including the assumptions you've made. Commented Feb 1, 2018 at 10:43
@Nija I Pillai's answer, adapted for TensorFlow 2:
import pandas as pd
import tensorflow as tf
def create_tf_example(features, label):
    """Pack positions 1 through 8 of a row into a ``tf.train.Example``.

    Args:
        features: Indexable row of column values. ``features[1]`` must be a
            string because it is UTF-8 encoded; positions 2, 3 and 8 are
            stored as int64 and positions 4 through 7 as float.
        label: Accepted but not used by this function.
            NOTE(review): the label is never written to the record; confirm
            whether that is intentional.

    Returns:
        A ``tf.train.Example`` with features ``attr1`` through ``attr8``.
    """
    # (feature key, index into `features`, storage kind), in output order.
    layout = (
        ('attr1', 1, 'bytes'),
        ('attr2', 2, 'int64'),
        ('attr3', 3, 'int64'),
        ('attr4', 4, 'float'),
        ('attr5', 5, 'float'),
        ('attr6', 6, 'float'),
        ('attr7', 7, 'float'),
        ('attr8', 8, 'int64'),
    )

    feature_map = {}
    for key, position, kind in layout:
        raw = features[position]
        if kind == 'bytes':
            packed = tf.train.Feature(
                bytes_list=tf.train.BytesList(value=[raw.encode('utf-8')]))
        elif kind == 'int64':
            packed = tf.train.Feature(
                int64_list=tf.train.Int64List(value=[raw]))
        else:
            packed = tf.train.Feature(
                float_list=tf.train.FloatList(value=[raw]))
        feature_map[key] = packed

    tf_example = tf.train.Example(
        features=tf.train.Features(feature=feature_map))
    return tf_example
# Serialize every row of dataset.csv into dataset.tfrecords, one
# tf.train.Example per row (built by create_tf_example), using the TF 2.x
# writer location tf.io.TFRecordWriter.
csv = pd.read_csv("dataset.csv").values
with tf.io.TFRecordWriter("dataset.tfrecords") as writer:
    for row in csv:
        # The last column is the label; everything before it is a feature.
        features, label = row[:-1], row[-1]
        print(features, label)
        example = create_tf_example(features, label)
        writer.write(example.SerializeToString())
# No explicit writer.close(): leaving the with block already closes the writer.