PageRenderTime 52ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 0ms

/test.py

https://bitbucket.org/pombredanne/bloom-filter-indexer
Python | 185 lines | 136 code | 25 blank | 24 comment | 6 complexity | d857bc3851de15963c5a8587c12c16c0 MD5 | raw file
  1. #!/usr/bin/python
  2. # Copyright (c) 2013, Paul Michael Furley <paul@paulfurley.com>
  3. # All rights reserved.
  4. # Redistribution and use in source and binary forms, with or without
  5. # modification, are permitted provided that the following conditions are met:
  6. # - Redistributions of source code must retain the above copyright notice, this
  7. # list of conditions and the following disclaimer.
  8. # - Redistributions in binary form must reproduce the above copyright notice,
  9. # this list of conditions and the following disclaimer in the documentation
  10. # and/or other materials provided with the distribution.
  11. # - Neither the name of the <ORGANIZATION> nor the names of its contributors
  12. # may be used to endorse or promote products derived from this software
  13. # without specific prior written permission.
  14. # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  15. # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16. # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  17. # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  18. # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  19. # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  20. # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  21. # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  22. # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  23. # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  24. # POSSIBILITY OF SUCH DAMAGE.
  25. import unittest
  26. import os
  27. import glob
  28. from cStringIO import StringIO
  29. from pybloom import BloomFilter
  30. from bloom_indexer import (parse_arguments, create_index, MissingArgument,
  31. InvalidArgument)
  32. TEST_FILE_CONTENT = (
  33. "FieldA,FieldB,FieldC\n"
  34. "apple,carrot,example.domain.com\n"
  35. "banana,potato,www.google.co.uk\n"
  36. "orange,leek,subdomain.yahoo.com\n"
  37. "pear,cauliflower,\n"
  38. "pineapple,bean,\n"
  39. ",broccoli,\n")
  40. class TopLevelTest(unittest.TestCase):
  41. def setUp(self):
  42. self.test_file = StringIO(TEST_FILE_CONTENT)
  43. def tearDown(self):
  44. for tmpfile in glob.glob('/tmp/fake.csv.*.bfindex'):
  45. os.unlink(tmpfile)
  46. def test_insert_then_test(self):
  47. result = create_index(
  48. '/tmp/fake.csv', # input filename
  49. self.test_file, # file-like object
  50. 0.0001, # error rate
  51. 1, # skip lines
  52. [1, 2], # fields
  53. ',', # delimiter
  54. False) # recursive domain
  55. self.assertEqual(
  56. {'/tmp/fake.csv.2.bfindex': 6,
  57. '/tmp/fake.csv.1.bfindex': 5},
  58. result)
  59. b1 = BloomFilter.fromfile(open('/tmp/fake.csv.1.bfindex', 'rb'))
  60. b2 = BloomFilter.fromfile(open('/tmp/fake.csv.2.bfindex', 'rb'))
  61. self.assertEqual(False, 'FieldA' in b1)
  62. self.assertEqual(False, 'FieldB' in b2)
  63. for word in ('apple', 'banana', 'orange', 'pear', 'pineapple'):
  64. self.assertEqual(True, word in b1)
  65. self.assertEqual(False, word in b2)
  66. for word in ('carrot', 'potato', 'leek', 'cauliflower', 'bean'):
  67. self.assertEqual(True, word in b2)
  68. self.assertEqual(False, word in b1)
  69. def test_recursive_domains(self):
  70. result = create_index(
  71. '/tmp/fake.csv', # input filename
  72. self.test_file, # file-like object
  73. 0.0001, # error rate
  74. 1, # skip lines
  75. [3], # fields
  76. ',', # delimiter
  77. True) # recursive domain
  78. self.assertEqual(
  79. {'/tmp/fake.csv.3.bfindex': 9},
  80. result)
  81. b = BloomFilter.fromfile(open('/tmp/fake.csv.3.bfindex', 'rb'))
  82. for word in ('subdomain.yahoo.com', 'yahoo.com', 'com',
  83. 'example.domain.com', 'domain.com', 'www.google.co.uk',
  84. 'google.co.uk', 'co.uk', 'uk'):
  85. self.assertEqual(True, word in b)
  86. def test_higher_field_than_column_count(self):
  87. self.assertRaises(
  88. InvalidArgument,
  89. lambda: create_index(
  90. '/tmp/fake.csv', # input filename
  91. self.test_file, # file-like object
  92. 0.0001, # error rate
  93. 1, # skip lines
  94. [4], # fields
  95. ',', # delimiter
  96. False)) # recursive domain
  97. class ParseArgumentsTest(unittest.TestCase):
  98. def test_long_version(self):
  99. config = parse_arguments([
  100. 'fake.py',
  101. '--infile=/etc/profile',
  102. '--fields=2,6',
  103. '--skip-lines=3',
  104. '--false-positive-rate=0.00123',
  105. '--delimiter=,',
  106. '--index-domains-recursively'])
  107. self.assertEqual(
  108. {'delimiter': ',',
  109. 'false-positive-rate': 0.00123,
  110. 'fields': [2, 6],
  111. 'index-domains-recursively': True,
  112. 'infile': '/etc/profile',
  113. 'skip-lines': 3},
  114. config)
  115. def test_short_version(self):
  116. config = parse_arguments([
  117. 'fake.py',
  118. '-i/etc/profile',
  119. '-f2,6',
  120. '-s3',
  121. '-e0.00123',
  122. '-d,',
  123. '-r'])
  124. self.assertEqual(
  125. {'delimiter': ',',
  126. 'false-positive-rate': 0.00123,
  127. 'fields': [2, 6],
  128. 'index-domains-recursively': True,
  129. 'infile': '/etc/profile',
  130. 'skip-lines': 3},
  131. config)
  132. def test_missing_infile(self):
  133. self.assertRaises(
  134. MissingArgument,
  135. lambda: parse_arguments([
  136. 'fake.py',
  137. '--fields=2,6',
  138. '--skip-lines=3',
  139. '--false-positive-rate=0.00123',
  140. '--delimiter=,',
  141. '--index-domains-recursively']))
  142. def test_defaults(self):
  143. config = parse_arguments([
  144. 'fake.py',
  145. '--infile=/etc/profile'])
  146. self.assertEqual(
  147. {'delimiter': ';',
  148. 'false-positive-rate': 1e-05,
  149. 'fields': [], # meaning all
  150. 'index-domains-recursively': False,
  151. 'infile': '/etc/profile',
  152. 'skip-lines': 1},
  153. config)
  154. if __name__ == '__main__':
  155. import doctest
  156. import bloom_indexer
  157. if doctest.testmod(bloom_indexer).failed > 0:
  158. import sys
  159. sys.exit(1)
  160. unittest.main()