PageRenderTime 24ms CodeModel.GetById 1ms RepoModel.GetById 0ms app.codeStats 0ms

/Source Code/Indexer/Indexer.vb

#
Visual Basic | 531 lines | 315 code | 112 blank | 104 comment | 0 complexity | 8d6cbd1cbdc784a882c4c892a307ad9c MD5 | raw file
  1. Imports System.IO
  2. Public Class Indexer
  ' Document tables: two mirrored maps so a document can be looked up by index or by name.
  ' The dictionaries are only ever mutated in place, never replaced, hence ReadOnly.
  Private ReadOnly Files As New Dictionary(Of Integer, String) ' Key = File index, Value = File name
  Private ReadOnly FileNumbers As New Dictionary(Of String, Integer) ' Key = File name, Value = File index
  Private LastAddedFileIndex As Integer = 0 ' Where the search for the next unused file index starts
  ' Vocabulary tables: the same mirrored layout, for words.
  Private ReadOnly Words As New Dictionary(Of Integer, String) ' Key = Word index, Value = Word
  Private ReadOnly WordNumbers As New Dictionary(Of String, Integer) ' Key = Word, Value = Word index
  Private LastAddedWordIndex As Integer = 0 ' Where the search for the next unused word index starts
  ' The index proper: occurrence data for each word.
  Private ReadOnly Index As New Dictionary(Of Integer, WordOccurrences) ' Key = Word index, Value = Occurrences of that word
  ''' <summary>
  ''' Returns the complete list of document numbers.
  ''' </summary>
  ''' <value>A newly allocated array holding the index of every document in the index.</value>
  ReadOnly Property AllDocumentsNumbers() As Integer()
      Get
          ' Size the array from Files, the same dictionary whose keys are copied, so the
          ' result cannot overrun or be padded if FileNumbers ever holds a different count.
          Dim Result(Files.Count - 1) As Integer
          Files.Keys.CopyTo(Result, 0)
          Return Result
      End Get
  End Property
  ''' <summary>
  ''' Gets how many documents are currently held in the index.
  ''' </summary>
  ''' <value>The number of entries in the file table.</value>
  ReadOnly Property DocumentCount() As Integer
      Get
          Dim Total As Integer = Files.Count
          Return Total
      End Get
  End Property
  ''' <summary>
  ''' Looks up a document's name from its numeric index.
  ''' </summary>
  ''' <param name="index">The numeric index of the document.</param>
  ''' <value>The name stored for the document.</value>
  ''' <exception cref="ApplicationException">No document with the given index exists.</exception>
  ReadOnly Property DocumentName(ByVal index As Integer) As String
      Get
          Dim StoredName As String = Nothing
          If Not Files.TryGetValue(index, StoredName) Then
              Throw New ApplicationException("Document " + index.ToString + " does not exist in the index.")
          End If
          Return StoredName
      End Get
  End Property
  ''' <summary>
  ''' Looks up a document's numeric index from its name.
  ''' </summary>
  ''' <param name="name">The name the document was indexed under.</param>
  ''' <value>The numeric index assigned to the document.</value>
  ''' <exception cref="ApplicationException">No document with the given name exists.</exception>
  ReadOnly Property DocumentNumber(ByVal name As String) As Integer
      Get
          Dim StoredIndex As Integer = 0
          If Not FileNumbers.TryGetValue(name, StoredIndex) Then
              Throw New ApplicationException("The document '" + name + "' does not exist in the index.")
          End If
          Return StoredIndex
      End Get
  End Property
  ''' <summary>
  ''' Returns the document indices of documents containing the specified word.
  ''' </summary>
  ''' <param name="word">The word to search for. It is upper-cased before the lookup.</param>
  ''' <returns>
  ''' A new list of document indices; empty when the word is not in the vocabulary.
  ''' </returns>
  ''' <exception cref="InvalidOperationException">
  ''' The word is in the vocabulary but has no entry in the index.
  ''' </exception>
  Function DocumentsContainingWord(ByVal word As String) As List(Of Integer)
      Dim Result As New List(Of Integer)
      Dim WordIndex As Integer = -1
      ' A word that is not in the vocabulary simply matches no documents.
      If Not WordNumbers.TryGetValue(word.ToUpper, WordIndex) Then Return Result
      Dim Occurrences As WordOccurrences = Nothing
      If Not Index.TryGetValue(WordIndex, Occurrences) Then
          Throw New InvalidOperationException("The word " + word + " exists in the words collection but not the index itself.")
      End If
      ' Reuse the entry fetched above rather than looking it up in Index a second time.
      Result.AddRange(Occurrences.Documents.Keys)
      Return Result
  End Function
  ''' <summary>
  ''' Reports how often a word appears in one particular document.
  ''' </summary>
  ''' <param name="documentIndex">The index of the document to look in.</param>
  ''' <param name="word">The word to count. It is upper-cased before the lookup.</param>
  ''' <returns>
  ''' The stored count for the word in that document, or 0 when the word is not in the
  ''' vocabulary or has no entry for the document.
  ''' </returns>
  Function TermFrequency(ByVal documentIndex As Integer, ByVal word As String) As Integer
      Dim WordKey As Integer = -1
      If Not WordNumbers.TryGetValue(word.ToUpper, WordKey) Then
          Return 0
      End If
      ' TryGetValue leaves Frequency at 0 when the document has no entry for this word.
      Dim Frequency As Integer = 0
      Index(WordKey).Documents.TryGetValue(documentIndex, Frequency)
      Return Frequency
  End Function
  ''' <summary>
  ''' Resets the indexer to its initial, empty state.
  ''' </summary>
  ''' <remarks>
  ''' Empties the file tables, the vocabulary tables and the index, and resets both
  ''' "last added" counters to zero.
  ''' </remarks>
  Public Sub Clear()
      ' The index itself
      Index.Clear()
      ' Vocabulary tables and counter
      Words.Clear()
      WordNumbers.Clear()
      LastAddedWordIndex = 0
      ' File tables and counter
      Files.Clear()
      FileNumbers.Clear()
      LastAddedFileIndex = 0
  End Sub
  ''' <summary>
  ''' Extracts the distinct words from a piece of text.
  ''' </summary>
  ''' <param name="fileContents">The textual contents of a file.</param>
  ''' <returns>
  ''' The distinct words of the upper-cased text, in the key order of a
  ''' SortedList(Of String, Integer).
  ''' </returns>
  Private Function GetVocabulary(ByVal fileContents As String) As String()
      ' The SortedList is used purely as an ordered set; the stored values are unused.
      Dim UniqueWords As New SortedList(Of String, Integer)
      For Each Token As String In SplitIntoFixedWords(fileContents.ToUpper)
          ' Setting through the indexer adds the key when missing and is a no-op otherwise.
          UniqueWords(Token) = 0
      Next
      Return New List(Of String)(UniqueWords.Keys).ToArray()
  End Function
  ''' <summary>
  ''' Counts how often each vocabulary word appears in a document.
  ''' </summary>
  ''' <param name="fileContents">The document containing words to count.</param>
  ''' <returns>
  ''' A dictionary from word index to occurrence count, containing only the words that
  ''' occur at least once in the document.
  ''' </returns>
  ''' <remarks>
  ''' Every word in the document must already be present in the vocabulary; the lookup
  ''' through WordNumbers fails for a word that is not.
  ''' </remarks>
  Private Function GetWordCounts(ByVal fileContents As String) As Dictionary(Of Integer, Integer)
      ' Start every known word at zero
      Dim Tally As New Dictionary(Of Integer, Integer)
      For Each KnownWordIndex As Integer In Words.Keys
          Tally(KnownWordIndex) = 0
      Next
      ' Bump the tally once per occurrence in this document
      For Each Token As String In SplitIntoFixedWords(fileContents)
          Tally(WordNumbers(Token)) += 1
      Next
      ' Keep only the words that actually occurred
      Dim Result As New Dictionary(Of Integer, Integer)
      For Each Entry As KeyValuePair(Of Integer, Integer) In Tally
          If Entry.Value > 0 Then
              Result.Add(Entry.Key, Entry.Value)
          End If
      Next
      Return Result
  End Function
  ''' <summary>
  ''' Splits the document into words.
  ''' </summary>
  ''' <param name="fileContents">The document to split into words.</param>
  ''' <returns>
  ''' The upper-cased words of the document in order of appearance, each passed through
  ''' FixWord, with empty results dropped.
  ''' </returns>
  Private Function SplitIntoFixedWords(ByVal fileContents As String) As String()
      ' Carriage return, line feed and tab are listed explicitly as Char values.
      ' Environment.NewLine is a String; handed to the Char-based Split it is narrowed to
      ' its first character only, so text using a different line ending was not split at
      ' line breaks and the words either side of the break were fused into one.
      Dim Separators() As Char = {" "c, "!"c, "."c, ":"c, """"c, "-"c, _
                                  Microsoft.VisualBasic.ControlChars.Cr, _
                                  Microsoft.VisualBasic.ControlChars.Lf, _
                                  Microsoft.VisualBasic.ControlChars.Tab}
      Dim FixedWords As New List(Of String)
      For Each RawWord As String In fileContents.ToUpper.Split(Separators)
          Dim Word As String = FixWord(RawWord)
          ' Consecutive separators and symbol-only tokens come out empty; skip them.
          If Word <> "" Then FixedWords.Add(Word)
      Next
      Return FixedWords.ToArray()
  End Function
  ''' <summary>
  ''' Find each file in the given directory and index it.
  ''' </summary>
  ''' <param name="directory">The directory whose files to index.</param>
  ''' <remarks>
  ''' Files are not recursed, and so only the files in the specified directory itself will be indexed.
  ''' Each file is added through AddFileToIndex, which only registers words that are not yet
  ''' in the vocabulary and only merges counts for the file being added. This lets the method
  ''' be called on an index that already holds documents, and each file is read once.
  ''' </remarks>
  Public Sub AddDirectoryToIndex(ByVal directory As String)
      For Each Filename As String In IO.Directory.GetFiles(directory)
          AddFileToIndex(Filename)
      Next
  End Sub
  ''' <summary>
  ''' Reads a file and indexes it, using its file name as the document name.
  ''' </summary>
  ''' <param name="fileName">The file to read and the name to index it under.</param>
  Public Sub AddFileToIndex(ByVal fileName As String)
      Dim Contents As String = GetFileContents(fileName)
      AddFileContentsToIndex(fileName, Contents)
  End Sub
  ''' <summary>
  ''' Reads a file and indexes it under a separately supplied document name.
  ''' </summary>
  ''' <param name="virtualFilename">The name to index the document under.</param>
  ''' <param name="fileName">The file whose contents are read.</param>
  Public Sub AddFileToIndex(ByVal virtualFilename As String, ByVal fileName As String)
      Dim Contents As String = GetFileContents(fileName)
      AddFileContentsToIndex(virtualFilename, Contents)
  End Sub
  ''' <summary>
  ''' Indexes a piece of text under the given document name.
  ''' </summary>
  ''' <param name="virtualFilename">The name to register the document under.</param>
  ''' <param name="contents">The text of the document.</param>
  Public Sub AddFileContentsToIndex(ByVal virtualFilename As String, ByVal contents As String)
      ' Register the document and obtain its index
      Dim DocumentIndex As Integer = AddFileName(virtualFilename)
      ' GetVocabulary already yields each word once, in sorted order, so its result can be
      ' walked directly; only words new to the global vocabulary are registered.
      For Each Term As String In GetVocabulary(contents)
          If WordNumbers.ContainsKey(Term) Then Continue For
          AddWordToVocabulary(Term)
      Next
      ' Count the document's words and fold the counts into the global index
      Dim Counts As Dictionary(Of Integer, Integer) = GetWordCounts(contents)
      MergeDocumentWordCount(DocumentIndex, Counts)
  End Sub
  ''' <summary>
  ''' Removes a document, and any words that only it contained, from the index.
  ''' </summary>
  ''' <param name="virtualFilename">The name the document was indexed under.</param>
  ''' <exception cref="ApplicationException">No document with the given name exists.</exception>
  Public Sub RemoveDocumentFromIndex(ByVal virtualFilename As String)
      ' Resolve the document, failing if it is unknown
      Dim DocumentIndex As Integer = 0
      If Not FileNumbers.TryGetValue(virtualFilename, DocumentIndex) Then
          Throw New ApplicationException("The document '" + virtualFilename + "' does not exist in the index.")
      End If
      ' Drop it from both file tables
      Files.Remove(DocumentIndex)
      FileNumbers.Remove(virtualFilename)
      ' Subtract the document's contribution from every word that mentions it. Words left
      ' with no documents are only collected here: Index cannot be modified while it is
      ' being enumerated.
      Dim OrphanedWords As New List(Of Integer)
      For Each Entry As KeyValuePair(Of Integer, WordOccurrences) In Index
          Dim Occurrences As WordOccurrences = Entry.Value
          Dim CountInDocument As Integer = 0
          If Not Occurrences.Documents.TryGetValue(DocumentIndex, CountInDocument) Then Continue For
          Occurrences.Occurrences -= CountInDocument
          Occurrences.Documents.Remove(DocumentIndex)
          If Occurrences.Documents.Count = 0 Then OrphanedWords.Add(Entry.Key)
      Next
      ' Purge the collected words from the index and from both vocabulary tables
      For Each OrphanIndex As Integer In OrphanedWords
          Index.Remove(OrphanIndex)
          WordNumbers.Remove(Words(OrphanIndex))
          Words.Remove(OrphanIndex)
      Next
  End Sub
  ''' <summary>
  ''' Loads all of the text in the given file, and returns it.
  ''' </summary>
  ''' <param name="filename">The file whose contents to retrieve.</param>
  ''' <returns>The complete text of the file.</returns>
  Private Function GetFileContents(ByVal filename As String) As String
      ' Using disposes the reader (and so closes the file) even if ReadToEnd throws.
      Using Reader As New IO.StreamReader(filename)
          Return Reader.ReadToEnd
      End Using
  End Function
  ''' <summary>
  ''' Strips the word of every character other than A-Z, a-z and 0-9.
  ''' </summary>
  ''' <param name="word">The word to 'fix'.</param>
  ''' <returns>The word without any symbol characters; empty if nothing remains.</returns>
  ''' <remarks>
  ''' Only the ASCII letter and digit ranges are kept, so accented and other non-ASCII
  ''' letters are removed as well.
  ''' </remarks>
  Private Function FixWord(ByVal word As String) As String
      ' A StringBuilder avoids allocating a new string for every kept character.
      Dim Kept As New System.Text.StringBuilder
      For Each C As Char In word
          Select Case C
              Case "a"c To "z"c, "A"c To "Z"c, "0"c To "9"c
                  Kept.Append(C)
          End Select
      Next
      Return Kept.ToString()
  End Function
  ''' <summary>
  ''' Folds one document's word counts into the global index.
  ''' </summary>
  ''' <param name="documentFilenameIndex">The index of the document whose word counts to merge.</param>
  ''' <param name="documentWordCount">Word counts for the document, keyed by word index.</param>
  Private Sub MergeDocumentWordCount(ByVal documentFilenameIndex As Integer, ByVal documentWordCount As Dictionary(Of Integer, Integer))
      For Each Entry As KeyValuePair(Of Integer, Integer) In documentWordCount
          Dim Existing As WordOccurrences = Nothing
          If Not Index.TryGetValue(Entry.Key, Existing) Then
              ' First document seen for this word: create its index entry
              Index.Add(Entry.Key, New WordOccurrences(Entry.Value, documentFilenameIndex))
          Else
              ' Word already indexed: raise its total and record this document's count
              Existing.Occurrences += Entry.Value
              Existing.Documents.Add(documentFilenameIndex, Entry.Value)
          End If
      Next
  End Sub
  ''' <summary>
  ''' Adds a word to the global vocabulary under the next unused word index.
  ''' </summary>
  ''' <param name="word">The word to add. It must not already be in the vocabulary.</param>
  ''' <returns>The index assigned to the word.</returns>
  Private Function AddWordToVocabulary(ByVal word As String) As Integer
      ' Determine the next word index to use
      Dim WordIndex As Integer = LastAddedWordIndex
      While Words.ContainsKey(WordIndex)
          WordIndex += 1
      End While
      ' Register word -> index first. It is the only Add that can fail (for a word that is
      ' already present), and failing before Words is touched keeps the two tables in step.
      WordNumbers.Add(word, WordIndex)
      Words.Add(WordIndex, word)
      LastAddedWordIndex = WordIndex
      ' Return the index
      Return WordIndex
  End Function
  ''' <summary>
  ''' Adds a document name to the file tables under the next unused file index.
  ''' </summary>
  ''' <param name="fileName">The document name to add. It must not already be registered.</param>
  ''' <returns>The index assigned to the document.</returns>
  Private Function AddFileName(ByVal fileName As String) As Integer
      ' Determine the next file name index to use
      Dim FileNameIndex As Integer = LastAddedFileIndex
      While Files.ContainsKey(FileNameIndex)
          FileNameIndex += 1
      End While
      ' Register name -> index first. It is the only Add that can fail (for a name that is
      ' already present), and failing before Files is touched keeps the two tables in step.
      FileNumbers.Add(fileName, FileNameIndex)
      Files.Add(FileNameIndex, fileName)
      LastAddedFileIndex = FileNameIndex
      ' Return the index
      Return FileNameIndex
  End Function
  ''' <summary>
  ''' Creates a new Indexer from a file in the layout written by Save.
  ''' </summary>
  ''' <param name="fileName">The path of the saved index file.</param>
  ''' <returns>A new Indexer holding the file list, vocabulary and index entries read from the file.</returns>
  ''' <remarks>
  ''' The file is read as three sections - file list, vocabulary, index entries - each
  ''' preceded by its entry count.
  ''' NOTE(review): each index entry stores only the word's total and the indices of its
  ''' documents; how WordOccurrences derives per-document counts from that is not visible
  ''' here - confirm against the WordOccurrences constructor.
  ''' </remarks>
  Public Shared Function LoadIndex(ByVal fileName As String) As Indexer
      Dim Result As New Indexer
      ' Ask for read-only access. The two-argument FileStream constructor requests
      ' read/write access, which fails for files the caller is only allowed to read.
      Using Reader As New BinaryReader(New FileStream(fileName, FileMode.Open, FileAccess.Read, FileShare.Read))
          ' Get list of files
          Dim FileCount As Integer = Reader.ReadInt32
          For I As Integer = 1 To FileCount
              Dim DocumentIndex As Integer = Reader.ReadInt32
              Dim StoredName As String = Reader.ReadString
              Result.Files.Add(DocumentIndex, StoredName)
              Result.FileNumbers.Add(StoredName, DocumentIndex)
          Next
          ' Get global vocabulary
          Dim VocabularyWordCount As Integer = Reader.ReadInt32
          For I As Integer = 1 To VocabularyWordCount
              Dim WordIndex As Integer = Reader.ReadInt32
              Dim Word As String = Reader.ReadString
              Result.Words.Add(WordIndex, Word)
              Result.WordNumbers.Add(Word, WordIndex)
          Next
          ' Get the main part of the index
          Dim IndexEntriesCount As Integer = Reader.ReadInt32
          For A As Integer = 1 To IndexEntriesCount
              Dim MainKey As Integer = Reader.ReadInt32
              Dim Occurrences As Integer = Reader.ReadInt32
              Dim DocumentTotal As Integer = Reader.ReadInt32
              Dim Documents(DocumentTotal - 1) As Integer
              For B As Integer = 0 To DocumentTotal - 1
                  Documents(B) = Reader.ReadInt32
              Next
              Result.Index.Add(MainKey, New WordOccurrences(Occurrences, Documents))
          Next
          ' End Using closes the reader and the underlying file
      End Using
      Return Result
  End Function
  ''' <summary>
  ''' Writes the index to a binary file, replacing any existing file of that name.
  ''' </summary>
  ''' <param name="fileName">The path of the file to write.</param>
  ''' <remarks>
  ''' Three sections are written in order - file list, vocabulary, index entries - each
  ''' preceded by its entry count. For every index entry only the word's total and the
  ''' indices of its documents are written; the per-document counts are not.
  ''' </remarks>
  Public Sub Save(ByVal fileName As String)
      ' Using disposes the writer (flushing and closing the file) even if a write throws.
      Using Writer As New BinaryWriter(New FileStream(fileName, FileMode.Create))
          ' Write list of files
          Writer.Write(Files.Count)
          For Each IndexedFilename As KeyValuePair(Of Integer, String) In Files
              Writer.Write(IndexedFilename.Key)
              Writer.Write(IndexedFilename.Value)
          Next
          ' Write global vocabulary
          Writer.Write(Words.Count)
          For Each Word As KeyValuePair(Of Integer, String) In Words
              Writer.Write(Word.Key)
              Writer.Write(Word.Value)
          Next
          ' Write the main part of the index
          Writer.Write(Index.Count)
          For Each IndexEntry As KeyValuePair(Of Integer, WordOccurrences) In Index
              Writer.Write(IndexEntry.Key)
              Writer.Write(IndexEntry.Value.Occurrences)
              Writer.Write(IndexEntry.Value.Documents.Count)
              For Each DocumentEntry As KeyValuePair(Of Integer, Integer) In IndexEntry.Value.Documents
                  Writer.Write(DocumentEntry.Key)
              Next
          Next
      End Using
  End Sub
  ''' <summary>
  ''' Renders the whole index as text, one line per indexed word.
  ''' </summary>
  ''' <returns>The textual description of the index.</returns>
  ''' <remarks>
  ''' Each line has the following format:
  '''
  ''' WORD: TotalOccurrences {DocumentFilename|DocumentOccurrences,...}
  ''' </remarks>
  Overrides Function ToString() As String
      Dim Output As New Text.StringBuilder
      ' One line per indexed word...
      For Each Entry As KeyValuePair(Of Integer, WordOccurrences) In Index
          Dim Occurrences As WordOccurrences = Entry.Value
          Output.Append(Words(Entry.Key)).Append(": ").Append(Occurrences.Occurrences).Append(" {")
          ' ...listing each document containing the word, comma-separated
          Dim IsFirst As Boolean = True
          For Each DocumentEntry As KeyValuePair(Of Integer, Integer) In Occurrences.Documents
              Dim DocumentFilename As String = Files(DocumentEntry.Key)
              If Not IsFirst Then Output.Append(",")
              IsFirst = False
              Output.Append(DocumentFilename).Append("|").Append(DocumentEntry.Value.ToString)
          Next
          Output.Append("}").Append(Environment.NewLine)
      Next
      Return Output.ToString
  End Function
  419. End Class