PageRenderTime 66ms CodeModel.GetById 16ms RepoModel.GetById 0ms app.codeStats 0ms

/tensorflow/contrib/learn/python/learn/tests/dataframe/tensorflow_dataframe_test.py

https://gitlab.com/github-cloud-corporation/tensorflow
Python | 354 lines | 320 code | 14 blank | 20 comment | 12 complexity | a7e768bd1cae4ac465b0d83a0962eca3 MD5 | raw file
  1. # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ==============================================================================
  15. """Tests for learn.dataframe.tensorflow_dataframe."""
  16. from __future__ import absolute_import
  17. from __future__ import division
  18. from __future__ import print_function
  19. import csv
  20. import math
  21. import tempfile
  22. import numpy as np
  23. import tensorflow as tf
  24. from tensorflow.contrib.learn.python.learn.dataframe import tensorflow_dataframe as df
  25. from tensorflow.contrib.learn.python.learn.dataframe.transforms import densify
  26. from tensorflow.core.example import example_pb2
  27. from tensorflow.python.framework import dtypes
  28. # pylint: disable=g-import-not-at-top
  29. try:
  30. import pandas as pd
  31. HAS_PANDAS = True
  32. except ImportError:
  33. HAS_PANDAS = False
  34. def _assert_df_equals_dict(expected_df, actual_dict):
  35. for col in expected_df:
  36. if expected_df[col].dtype in [np.float32, np.float64]:
  37. assertion = np.testing.assert_allclose
  38. else:
  39. assertion = np.testing.assert_array_equal
  40. if expected_df[col].dtype.kind in ["O", "S", "U"]:
  41. # Python 2/3 compatibility
  42. # TensorFlow always returns bytes, so we just convert the unicode
  43. # expectations to bytes also before comparing.
  44. expected_values = [x.encode("utf-8") for x in expected_df[col].values]
  45. else:
  46. expected_values = expected_df[col].values
  47. assertion(expected_values,
  48. actual_dict[col],
  49. err_msg="Expected {} in column '{}'; got {}.".format(
  50. expected_values, col, actual_dict[col]))
  51. def _make_test_csv():
  52. f = tempfile.NamedTemporaryFile(delete=False, mode="w")
  53. w = csv.writer(f)
  54. w.writerow(["int", "float", "bool", "string"])
  55. for _ in range(100):
  56. intvalue = np.random.randint(-10, 10)
  57. floatvalue = np.random.rand()
  58. boolvalue = int(np.random.rand() > 0.3)
  59. stringvalue = "S: %.4f" % np.random.rand()
  60. row = [intvalue, floatvalue, boolvalue, stringvalue]
  61. w.writerow(row)
  62. f.close()
  63. return f.name
  64. def _make_test_csv_sparse():
  65. f = tempfile.NamedTemporaryFile(delete=False, mode="w")
  66. w = csv.writer(f)
  67. w.writerow(["int", "float", "bool", "string"])
  68. for _ in range(100):
  69. # leave columns empty; these will be read as default value (e.g. 0 or NaN)
  70. intvalue = np.random.randint(-10, 10) if np.random.rand() > 0.5 else ""
  71. floatvalue = np.random.rand() if np.random.rand() > 0.5 else ""
  72. boolvalue = int(np.random.rand() > 0.3) if np.random.rand() > 0.5 else ""
  73. stringvalue = (("S: %.4f" % np.random.rand())
  74. if np.random.rand() > 0.5 else "")
  75. row = [intvalue, floatvalue, boolvalue, stringvalue]
  76. w.writerow(row)
  77. f.close()
  78. return f.name
  79. def _make_test_tfrecord():
  80. f = tempfile.NamedTemporaryFile(delete=False)
  81. w = tf.python_io.TFRecordWriter(f.name)
  82. for i in range(100):
  83. ex = example_pb2.Example()
  84. ex.features.feature["var_len_int"].int64_list.value.extend(range((i % 3)))
  85. ex.features.feature["fixed_len_float"].float_list.value.extend(
  86. [float(i), 2 * float(i)])
  87. w.write(ex.SerializeToString())
  88. return f.name
  89. class TensorFlowDataFrameTestCase(tf.test.TestCase):
  90. """Tests for `TensorFlowDataFrame`."""
  91. def _assert_pandas_equals_tensorflow(self, pandas_df, tensorflow_df,
  92. num_batches, batch_size):
  93. self.assertItemsEqual(
  94. list(pandas_df.columns) + ["index"], tensorflow_df.columns())
  95. for batch_num, batch in enumerate(tensorflow_df.run(num_batches)):
  96. row_numbers = [
  97. total_row_num % pandas_df.shape[0]
  98. for total_row_num in range(batch_size * batch_num, batch_size * (
  99. batch_num + 1))
  100. ]
  101. expected_df = pandas_df.iloc[row_numbers]
  102. _assert_df_equals_dict(expected_df, batch)
  103. def testInitFromPandas(self):
  104. """Test construction from Pandas DataFrame."""
  105. if not HAS_PANDAS:
  106. return
  107. pandas_df = pd.DataFrame({"sparrow": range(10), "ostrich": 1})
  108. tensorflow_df = df.TensorFlowDataFrame.from_pandas(pandas_df,
  109. batch_size=10,
  110. shuffle=False)
  111. batch = tensorflow_df.run_once()
  112. np.testing.assert_array_equal(pandas_df.index.values, batch["index"],
  113. "Expected index {}; got {}".format(
  114. pandas_df.index.values, batch["index"]))
  115. _assert_df_equals_dict(pandas_df, batch)
  116. def testBatch(self):
  117. """Tests `batch` method.
  118. `DataFrame.batch()` should iterate through the rows of the
  119. `pandas.DataFrame`, and should "wrap around" when it reaches the last row.
  120. """
  121. if not HAS_PANDAS:
  122. return
  123. pandas_df = pd.DataFrame({"albatross": range(10),
  124. "bluejay": 1,
  125. "cockatoo": range(0, 20, 2),
  126. "penguin": list("abcdefghij")})
  127. tensorflow_df = df.TensorFlowDataFrame.from_pandas(pandas_df, shuffle=False)
  128. # Rebatch `df` into the following sizes successively.
  129. batch_sizes = [8, 4, 7]
  130. num_batches = 10
  131. final_batch_size = batch_sizes[-1]
  132. for batch_size in batch_sizes:
  133. tensorflow_df = tensorflow_df.batch(batch_size, shuffle=False)
  134. self._assert_pandas_equals_tensorflow(pandas_df,
  135. tensorflow_df,
  136. num_batches=num_batches,
  137. batch_size=final_batch_size)
  138. def testFromNumpy(self):
  139. x = np.eye(20)
  140. tensorflow_df = df.TensorFlowDataFrame.from_numpy(x, batch_size=10)
  141. for batch in tensorflow_df.run(30):
  142. for ind, val in zip(batch["index"], batch["value"]):
  143. expected_val = np.zeros_like(val)
  144. expected_val[ind] = 1
  145. np.testing.assert_array_equal(expected_val, val)
  146. def testFromCSV(self):
  147. if not HAS_PANDAS:
  148. return
  149. num_batches = 100
  150. batch_size = 8
  151. enqueue_size = 7
  152. data_path = _make_test_csv()
  153. default_values = [0, 0.0, 0, ""]
  154. pandas_df = pd.read_csv(data_path)
  155. tensorflow_df = df.TensorFlowDataFrame.from_csv(
  156. [data_path],
  157. enqueue_size=enqueue_size,
  158. batch_size=batch_size,
  159. shuffle=False,
  160. default_values=default_values)
  161. self._assert_pandas_equals_tensorflow(pandas_df,
  162. tensorflow_df,
  163. num_batches=num_batches,
  164. batch_size=batch_size)
  165. def testFromCSVLimitEpoch(self):
  166. batch_size = 8
  167. num_epochs = 17
  168. expected_num_batches = (num_epochs * 100) // batch_size
  169. data_path = _make_test_csv()
  170. default_values = [0, 0.0, 0, ""]
  171. tensorflow_df = df.TensorFlowDataFrame.from_csv(
  172. [data_path],
  173. batch_size=batch_size,
  174. shuffle=False,
  175. default_values=default_values)
  176. result_batches = list(tensorflow_df.run(num_epochs=num_epochs))
  177. actual_num_batches = len(result_batches)
  178. self.assertEqual(expected_num_batches, actual_num_batches)
  179. # TODO(soergel): figure out how to dequeue the final small batch
  180. expected_rows = 1696 # num_epochs * 100
  181. actual_rows = sum([len(x["int"]) for x in result_batches])
  182. self.assertEqual(expected_rows, actual_rows)
  183. def testFromCSVWithFeatureSpec(self):
  184. if not HAS_PANDAS:
  185. return
  186. num_batches = 100
  187. batch_size = 8
  188. data_path = _make_test_csv_sparse()
  189. feature_spec = {
  190. "int": tf.FixedLenFeature(None, dtypes.int16, np.nan),
  191. "float": tf.VarLenFeature(dtypes.float16),
  192. "bool": tf.VarLenFeature(dtypes.bool),
  193. "string": tf.FixedLenFeature(None, dtypes.string, "")
  194. }
  195. pandas_df = pd.read_csv(data_path, dtype={"string": object})
  196. # Pandas insanely uses NaN for empty cells in a string column.
  197. # And, we can't use Pandas replace() to fix them because nan != nan
  198. s = pandas_df["string"]
  199. for i in range(0, len(s)):
  200. if isinstance(s[i], float) and math.isnan(s[i]):
  201. s[i] = ""
  202. tensorflow_df = df.TensorFlowDataFrame.from_csv_with_feature_spec(
  203. [data_path],
  204. batch_size=batch_size,
  205. shuffle=False,
  206. feature_spec=feature_spec)
  207. # These columns were sparse; re-densify them for comparison
  208. tensorflow_df["float"] = densify.Densify(np.nan)(tensorflow_df["float"])
  209. tensorflow_df["bool"] = densify.Densify(np.nan)(tensorflow_df["bool"])
  210. self._assert_pandas_equals_tensorflow(pandas_df,
  211. tensorflow_df,
  212. num_batches=num_batches,
  213. batch_size=batch_size)
  214. def testFromExamples(self):
  215. num_batches = 77
  216. enqueue_size = 11
  217. batch_size = 13
  218. data_path = _make_test_tfrecord()
  219. features = {
  220. "fixed_len_float": tf.FixedLenFeature(shape=[2],
  221. dtype=tf.float32,
  222. default_value=[0.0, 0.0]),
  223. "var_len_int": tf.VarLenFeature(dtype=tf.int64)
  224. }
  225. tensorflow_df = df.TensorFlowDataFrame.from_examples(
  226. data_path,
  227. enqueue_size=enqueue_size,
  228. batch_size=batch_size,
  229. features=features,
  230. shuffle=False)
  231. # `test.tfrecord` contains 100 records with two features: var_len_int and
  232. # fixed_len_float. Entry n contains `range(n % 3)` and
  233. # `float(n)` for var_len_int and fixed_len_float,
  234. # respectively.
  235. num_records = 100
  236. def _expected_fixed_len_float(n):
  237. return np.array([float(n), 2 * float(n)])
  238. def _expected_var_len_int(n):
  239. return np.arange(n % 3)
  240. for batch_num, batch in enumerate(tensorflow_df.run(num_batches)):
  241. record_numbers = [
  242. n % num_records
  243. for n in range(batch_num * batch_size, (batch_num + 1) * batch_size)
  244. ]
  245. for i, j in enumerate(record_numbers):
  246. np.testing.assert_allclose(
  247. _expected_fixed_len_float(j), batch["fixed_len_float"][i])
  248. var_len_int = batch["var_len_int"]
  249. for i, ind in enumerate(var_len_int.indices):
  250. val = var_len_int.values[i]
  251. expected_row = _expected_var_len_int(record_numbers[ind[0]])
  252. expected_value = expected_row[ind[1]]
  253. np.testing.assert_array_equal(expected_value, val)
  254. def testSplitString(self):
  255. batch_size = 8
  256. num_epochs = 17
  257. expected_num_batches = (num_epochs * 100) // batch_size
  258. data_path = _make_test_csv()
  259. default_values = [0, 0.0, 0, ""]
  260. tensorflow_df = df.TensorFlowDataFrame.from_csv(
  261. [data_path],
  262. batch_size=batch_size,
  263. shuffle=False,
  264. default_values=default_values)
  265. a, b = tensorflow_df.split("string", 0.7) # no rebatching
  266. total_result_batches = list(tensorflow_df.run(num_epochs=num_epochs))
  267. a_result_batches = list(a.run(num_epochs=num_epochs))
  268. b_result_batches = list(b.run(num_epochs=num_epochs))
  269. self.assertEqual(expected_num_batches, len(total_result_batches))
  270. self.assertEqual(expected_num_batches, len(a_result_batches))
  271. self.assertEqual(expected_num_batches, len(b_result_batches))
  272. total_rows = sum([len(x["int"]) for x in total_result_batches])
  273. a_total_rows = sum([len(x["int"]) for x in a_result_batches])
  274. b_total_rows = sum([len(x["int"]) for x in b_result_batches])
  275. print("Split rows: %s => %s, %s" % (total_rows, a_total_rows, b_total_rows))
  276. # TODO(soergel): figure out how to dequeue the final small batch
  277. expected_total_rows = 1696 # (num_epochs * 100)
  278. self.assertEqual(expected_total_rows, total_rows)
  279. self.assertEqual(1087, a_total_rows) # stochastic but deterministic
  280. # self.assertEqual(int(total_rows * 0.7), a_total_rows)
  281. self.assertEqual(609, b_total_rows) # stochastic but deterministic
  282. # self.assertEqual(int(total_rows * 0.3), b_total_rows)
  283. # The strings used for hashing were all unique in the original data, but
  284. # we ran 17 epochs, so each one should appear 17 times. Each copy should
  285. # be hashed into the same partition, so there should be no overlap of the
  286. # keys.
  287. a_strings = set([s for x in a_result_batches for s in x["string"]])
  288. b_strings = set([s for x in b_result_batches for s in x["string"]])
  289. self.assertEqual(frozenset(), a_strings & b_strings)
# Run the test cases defined in this file when it is executed as a script.
if __name__ == "__main__":
  tf.test.main()